# Imports:

In [241]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier

In [204]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Models 1-15 are attempting to predict the average rating of a book based off the author and title of a book, which is information that anyone with a book would have access to

#### Read in the data:

In [205]:
all_books = pd.read_csv('../data/authors_and_titles_all.csv')

In [206]:
all_books.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0
3,Harper Lee,To Kill a Mockingbird,4.0
4,F. Scott Fitzgerald,The Great Gatsby,4.0


In [207]:
all_books.shape

(8411, 3)

Combine Authors & Titles:

In [208]:
all_books['authors_and_titles'] = (all_books['authors'] + ' : ' + all_books['title'])
all_books.head(10)

Unnamed: 0,authors,title,average_rating,authors_and_titles
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0,Suzanne Collins : The Hunger Games (The Hunger...
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,Harper Lee,To Kill a Mockingbird,4.0,Harper Lee : To Kill a Mockingbird
4,F. Scott Fitzgerald,The Great Gatsby,4.0,F. Scott Fitzgerald : The Great Gatsby
5,John Green,The Fault in Our Stars,4.0,John Green : The Fault in Our Stars
6,J.R.R. Tolkien,The Hobbit,4.0,J.R.R. Tolkien : The Hobbit
7,J.D. Salinger,The Catcher in the Rye,4.0,J.D. Salinger : The Catcher in the Rye
8,Jane Austen,Pride and Prejudice,4.0,Jane Austen : Pride and Prejudice
9,Khaled Hosseini,The Kite Runner,4.0,Khaled Hosseini : The Kite Runner


Label X and y:

In [209]:
X = all_books['authors_and_titles']
y = all_books['average_rating']

Baseline accuracy according to GA Week 4, Thurs breakfast hour repo:

In [210]:
y.value_counts(normalize=True)

4.0    0.954821
3.0    0.032457
5.0    0.012603
2.0    0.000119
Name: average_rating, dtype: float64

In [211]:
base_preds = [y.mode()[0]] * len(y)

In [212]:
base_acc = accuracy_score(y, base_preds)
base_acc

0.9548210676495066

Baseline accuracy according to [this Medium Article]('https://towardsdatascience.com/calculating-a-baseline-accuracy-for-a-classification-model-a4b342ceb88f'):

In [213]:
0.954821**2 + 0.032457**2 + 0.012603**2 + 0.000119**2

0.9128954486600002

Train-test split:

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [215]:
X_train.shape

(6308,)

In [216]:
y_train.shape

(6308,)

# Model #1 & #2: Basic MNB

In [217]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [218]:
pipe.fit(X_train, y_train)

In [219]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9559289790741915
0.9514978601997147


In [220]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [221]:
pipe.fit(X_train, y_train)

In [222]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9662333544705136
0.9519733713742273


> Count Vectorizer outperforms the TfidVectorizer ever so slightly

# Models #3 & #4: Basic KNN

In [223]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [224]:
pipe.fit(X_train, y_train)

In [225]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9605263157894737
0.9481692819781264


In [226]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [227]:
pipe.fit(X_train, y_train)

In [228]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9594166138237159
0.9481692819781264


> KNN does not out perform MNB

# Models #5 & #6: Basic Logistic Regression

In [309]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [230]:
pipe.fit(X_train, y_train)

In [231]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9800253646163601
0.9514978601997147


In [310]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [233]:
pipe.fit(X_train, y_train)

In [234]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9578313253012049
0.9519733713742273


# Models #7 & #8: Basic Decision Tree

In [235]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [236]:
pipe.fit(X_train, y_train)

In [237]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9343794579172611


In [238]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [239]:
pipe.fit(X_train, y_train)

In [240]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9286733238231099


> These Decision Trees are too overfit

# Models #9 & #10: Basic Bagging

In [242]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [243]:
pipe.fit(X_train, y_train)

In [244]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.994134432466709
0.9419876367094627


In [247]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [248]:
pipe.fit(X_train, y_train)

In [249]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9958782498414711
0.9438896814075131


> Again, these are too overfit

# Models #11 & 12: Basic Random Forrest

In [250]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [251]:
pipe.fit(X_train, y_train)

In [252]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9529243937232525


In [253]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [254]:
pipe.fit(X_train, y_train)

In [255]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.953399904897765


> These are the best models so far

# Models #13 & #14: Basic AdaBoost

In [256]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [257]:
pipe.fit(X_train, y_train)

In [258]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9072606214331008
0.896338563956253


In [259]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [260]:
pipe.fit(X_train, y_train)

In [261]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9083703233988586
0.879695672848312


> These are the worst models so far

### Random Forrest was the best basic model, let's try to make it better...

# Model 15

In [262]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [264]:
pipe.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'rf': RandomForestClassifier(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impu

In [284]:
param = {
    'tfidf__ngram_range': [(1, 1), (1,2), (2,2)],
    'tfidf__stop_words': [None, 'english'],
    'rf__n_estimators': [100, 200, 300]
}

In [285]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [286]:
gs.fit(X_train, y_train)



In [287]:
gs.best_params_

{'rf__n_estimators': 200,
 'tfidf__ngram_range': (2, 2),
 'tfidf__stop_words': None}

In [288]:
gs.best_score_

0.9570392171660002

In [289]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9998414711477489
0.9514978601997147


Can I make a better model with different data?

### Models 16 - will again predict the average rating but instead will be using other features such as isbn, publication year, ratings count (total and of each star)

#### Read in the data:

In [291]:
books_by_isbn = pd.read_csv('../data/books_by_isbn.csv')

In [292]:
books_by_isbn.head()

Unnamed: 0,isbn,original_publication_year,language_code,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,439023483,2008.0,eng,4.0,4780653,66715,127936,560092,1481305,2706317
1,439554934,1997.0,eng,4.0,4602479,75504,101676,455024,1156318,3011543
2,316015849,2005.0,en-US,4.0,3866839,456191,436802,793319,875073,1355439
3,61120081,1960.0,eng,4.0,3198671,60427,117415,446835,1001952,1714267
4,743273567,1925.0,eng,4.0,2683664,86236,197621,606158,936012,947718


In [295]:
books_by_isbn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7851 entries, 0 to 7850
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7851 non-null   object 
 1   original_publication_year  7851 non-null   float64
 2   language_code              7851 non-null   object 
 3   average_rating             7851 non-null   float64
 4   ratings_count              7851 non-null   int64  
 5   ratings_1                  7851 non-null   int64  
 6   ratings_2                  7851 non-null   int64  
 7   ratings_3                  7851 non-null   int64  
 8   ratings_4                  7851 non-null   int64  
 9   ratings_5                  7851 non-null   int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 613.5+ KB


In [293]:
books_by_isbn.shape

(7851, 10)

Label X and y:

In [294]:
X = books_by_isbn.drop(columns=['language_code', 'average_rating'])
y = books_by_isbn['average_rating']

In [298]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7851 entries, 0 to 7850
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7851 non-null   object 
 1   original_publication_year  7851 non-null   float64
 2   ratings_count              7851 non-null   int64  
 3   ratings_1                  7851 non-null   int64  
 4   ratings_2                  7851 non-null   int64  
 5   ratings_3                  7851 non-null   int64  
 6   ratings_4                  7851 non-null   int64  
 7   ratings_5                  7851 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 490.8+ KB


In [299]:
X = pd.get_dummies(columns = ['isbn'], data = X, drop_first = True)

In [300]:
X.shape

(7851, 7857)

Train-test split:

In [301]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [302]:
y.value_counts(normalize=True)

4.0    0.954401
3.0    0.033626
5.0    0.011846
2.0    0.000127
Name: average_rating, dtype: float64

In [304]:
base_preds = [y.mode()[0]] * len(y)

In [305]:
base_acc = accuracy_score(y, base_preds)
base_acc

0.9544007132849318

In [308]:
0.954401**2 + 0.033626**2 + 0.011846**2 + 0.000127**2

0.9121523205220001

# Model 16 - Basic KNN

In [311]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [312]:
pipe.fit(X_train, y_train)

In [314]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9840353260869565
0.9735099337748344


In [316]:
pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('knn', KNeighborsClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'knn': KNeighborsClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [317]:
param = {
    'knn__n_neighbors' : [3, 5, 7, 9],
    'knn__n_jobs' : [3, 5, 7]
}

In [318]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [319]:
gs.fit(X_train, y_train)



In [320]:
gs.best_params_

{'knn__n_jobs': 3, 'knn__n_neighbors': 3}

In [321]:
gs.best_score_

0.9762228151022718

In [322]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.986922554347826
0.9730005094243505


# Model 17 - Decision Tree

In [323]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [324]:
pipe.fit(X_train, y_train)

In [325]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.9847172694854814


> A very overfit model

# Model 18 - Basic Random Forest

In [326]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

In [327]:
pipe.fit(X_train, y_train)

In [328]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.9709628120224146


> Another very overfit model