# Imports:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Models 1-15 are attempting to predict the average rating of a book based off the author and title of a book, which is information that anyone with a book would have access to

#### Read in the data:

In [3]:
all_books = pd.read_csv('../scrap/authors_and_titles_all.csv')

In [4]:
all_books.head()

Unnamed: 0,authors,title,average_rating
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0
3,Harper Lee,To Kill a Mockingbird,4.0
4,F. Scott Fitzgerald,The Great Gatsby,4.0


In [5]:
all_books.shape

(8411, 3)

Combine Authors & Titles:

In [6]:
all_books['authors_and_titles'] = (all_books['authors'] + ' : ' + all_books['title'])
all_books.head(10)

Unnamed: 0,authors,title,average_rating,authors_and_titles
0,Suzanne Collins,"The Hunger Games (The Hunger Games, #1)",4.0,Suzanne Collins : The Hunger Games (The Hunger...
1,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,4.0,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,Stephenie Meyer,"Twilight (Twilight, #1)",4.0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,Harper Lee,To Kill a Mockingbird,4.0,Harper Lee : To Kill a Mockingbird
4,F. Scott Fitzgerald,The Great Gatsby,4.0,F. Scott Fitzgerald : The Great Gatsby
5,John Green,The Fault in Our Stars,4.0,John Green : The Fault in Our Stars
6,J.R.R. Tolkien,The Hobbit,4.0,J.R.R. Tolkien : The Hobbit
7,J.D. Salinger,The Catcher in the Rye,4.0,J.D. Salinger : The Catcher in the Rye
8,Jane Austen,Pride and Prejudice,4.0,Jane Austen : Pride and Prejudice
9,Khaled Hosseini,The Kite Runner,4.0,Khaled Hosseini : The Kite Runner


Label X and y:

In [7]:
X = all_books['authors_and_titles']
y = all_books['average_rating']

Baseline accuracy according to GA Week 4, Thurs breakfast hour repo:

In [8]:
y.value_counts(normalize=True)

4.0    0.954821
3.0    0.032457
5.0    0.012603
2.0    0.000119
Name: average_rating, dtype: float64

In [9]:
base_preds = [y.mode()[0]] * len(y)

In [10]:
base_acc = accuracy_score(y, base_preds)
base_acc

0.9548210676495066

Baseline accuracy according to [this Medium Article]('https://towardsdatascience.com/calculating-a-baseline-accuracy-for-a-classification-model-a4b342ceb88f'):

In [11]:
0.954821**2 + 0.032457**2 + 0.012603**2 + 0.000119**2

0.9128954486600002

Train-test split:

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [13]:
X_train.shape

(6308,)

In [14]:
y_train.shape

(6308,)

# Model #1 & #2: Basic MNB

In [15]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [16]:
pipe.fit(X_train, y_train)

In [17]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9559289790741915
0.9514978601997147


In [18]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [19]:
pipe.fit(X_train, y_train)

In [20]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9662333544705136
0.9519733713742273


> Count Vectorizer outperforms the TfidVectorizer ever so slightly

# Models #3 & #4: Basic KNN

In [21]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [22]:
pipe.fit(X_train, y_train)

In [23]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9605263157894737
0.9481692819781264


In [24]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [25]:
pipe.fit(X_train, y_train)

In [26]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9594166138237159
0.9481692819781264


> KNN does not out perform MNB

# Models #5 & #6: Basic Logistic Regression

In [27]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [28]:
pipe.fit(X_train, y_train)

In [29]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9800253646163601
0.9514978601997147


In [30]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [31]:
pipe.fit(X_train, y_train)

In [32]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9578313253012049
0.9519733713742273


# Models #7 & #8: Basic Decision Tree

In [33]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [34]:
pipe.fit(X_train, y_train)

In [35]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.932001902044698


In [36]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [37]:
pipe.fit(X_train, y_train)

In [38]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9262957679505468


> These Decision Trees are too overfit

# Models #9 & #10: Basic Bagging

In [39]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [40]:
pipe.fit(X_train, y_train)

In [41]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9950856055802156
0.9467427484545887


In [42]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [43]:
pipe.fit(X_train, y_train)

In [44]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9954026632847178
0.9419876367094627


> Again, these are too overfit

# Models #11 & 12: Basic Random Forrest

In [45]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [46]:
pipe.fit(X_train, y_train)

In [47]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9510223490252021


In [48]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [49]:
pipe.fit(X_train, y_train)

In [50]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9538754160722777


> These are the best models so far

# Models #13 & #14: Basic AdaBoost

In [51]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [52]:
pipe.fit(X_train, y_train)

In [53]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9072606214331008
0.896338563956253


In [54]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [55]:
pipe.fit(X_train, y_train)

In [56]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9083703233988586
0.879695672848312


> These are the worst models so far

### Random Forrest was the best basic model, let's try to make it better...

# Model 15

In [57]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [58]:
pipe.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'rf': RandomForestClassifier(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impu

In [59]:
param = {
    'tfidf__ngram_range': [(1, 1), (1,2), (2,2)],
    'tfidf__stop_words': [None, 'english'],
    'rf__n_estimators': [100, 200, 300]
}

In [60]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [61]:
gs.fit(X_train, y_train)



In [62]:
gs.best_params_

{'rf__n_estimators': 200,
 'tfidf__ngram_range': (2, 2),
 'tfidf__stop_words': None}

In [63]:
gs.best_score_

0.9570392171660002

In [64]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9998414711477489
0.9514978601997147


Can I make a better model with different data?

### Models 16 - will again predict the average rating but instead will be using other features such as isbn, publication year, ratings count (total and of each star)

#### Read in the data:

In [65]:
books_by_isbn = pd.read_csv('../scrap/books_by_isbn.csv')

In [66]:
books_by_isbn.head()

Unnamed: 0,isbn,original_publication_year,language_code,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
0,439023483,2008.0,eng,4.0,4780653,66715,127936,560092,1481305,2706317
1,439554934,1997.0,eng,4.0,4602479,75504,101676,455024,1156318,3011543
2,316015849,2005.0,en-US,4.0,3866839,456191,436802,793319,875073,1355439
3,61120081,1960.0,eng,4.0,3198671,60427,117415,446835,1001952,1714267
4,743273567,1925.0,eng,4.0,2683664,86236,197621,606158,936012,947718


In [67]:
books_by_isbn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7851 entries, 0 to 7850
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7851 non-null   object 
 1   original_publication_year  7851 non-null   float64
 2   language_code              7851 non-null   object 
 3   average_rating             7851 non-null   float64
 4   ratings_count              7851 non-null   int64  
 5   ratings_1                  7851 non-null   int64  
 6   ratings_2                  7851 non-null   int64  
 7   ratings_3                  7851 non-null   int64  
 8   ratings_4                  7851 non-null   int64  
 9   ratings_5                  7851 non-null   int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 613.5+ KB


In [68]:
books_by_isbn.shape

(7851, 10)

Label X and y:

In [69]:
X = books_by_isbn.drop(columns=['language_code', 'average_rating'])
y = books_by_isbn['average_rating']

In [70]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7851 entries, 0 to 7850
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   isbn                       7851 non-null   object 
 1   original_publication_year  7851 non-null   float64
 2   ratings_count              7851 non-null   int64  
 3   ratings_1                  7851 non-null   int64  
 4   ratings_2                  7851 non-null   int64  
 5   ratings_3                  7851 non-null   int64  
 6   ratings_4                  7851 non-null   int64  
 7   ratings_5                  7851 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 490.8+ KB


In [71]:
X = pd.get_dummies(columns = ['isbn'], data = X, drop_first = True)

In [72]:
X.shape

(7851, 7857)

Train-test split:

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [74]:
y.value_counts(normalize=True)

4.0    0.954401
3.0    0.033626
5.0    0.011846
2.0    0.000127
Name: average_rating, dtype: float64

In [75]:
base_preds = [y.mode()[0]] * len(y)

In [76]:
base_acc = accuracy_score(y, base_preds)
base_acc

0.9544007132849318

In [77]:
0.954401**2 + 0.033626**2 + 0.011846**2 + 0.000127**2

0.9121523205220001

# Model 16 - Basic KNN

In [78]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [79]:
pipe.fit(X_train, y_train)

In [80]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9840353260869565
0.9735099337748344


In [81]:
pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('knn', KNeighborsClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'knn': KNeighborsClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [82]:
param = {
    'knn__n_neighbors' : [3, 5, 7, 9],
    'knn__n_jobs' : [3, 5, 7]
}

In [83]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [84]:
gs.fit(X_train, y_train)



In [85]:
gs.best_params_

{'knn__n_jobs': 3, 'knn__n_neighbors': 3}

In [86]:
gs.best_score_

0.9762228151022718

In [87]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.986922554347826
0.9730005094243505


# Model 17 - Decision Tree

In [88]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [89]:
pipe.fit(X_train, y_train)

In [90]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.9842078451349975


> A very overfit model

# Model 18 - Basic Random Forest

In [91]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

In [92]:
pipe.fit(X_train, y_train)

In [93]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.9719816607233825


> Another very overfit model

# Predicitons

I have created two dataframes of test values based off of my "To Be Read" list. Let's see how these models predict the ratings:

#### Authors & Titles only:

Using the "best params" from model 15:

In [94]:
X = all_books['authors_and_titles']
y = all_books['average_rating']

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [96]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('rf', RandomForestClassifier(n_estimators=300))
])

In [97]:
pipe.fit(X_train, y_train)

In [98]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998414711477489
0.9524488825487399


In [99]:
preds = pipe.predict(X_test)

In [100]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [101]:
wrongs = results[results.actual != results.predicted]
wrongs.sort_values(by='actual')

Unnamed: 0,actual,predicted
4721,3.0,4.0
971,3.0,4.0
4053,3.0,4.0
7450,3.0,4.0
5294,3.0,4.0
4589,3.0,4.0
6712,3.0,4.0
7765,3.0,4.0
7947,3.0,4.0
2288,3.0,4.0


In [102]:
wrongs.shape

(100, 2)

> 100 incorrect predictions. The majority of the incorrect predictions are books with an average 3 star rating being rated 4 stars. Also, there are many average 5 star ratings being rated as 4.

**What do these incorrect predictions mean:**
- Scenerio 1: A book is rated lower than it actually is
    - In this scenerio, I could potentially opt not to read it and miss out on a great book
- Scenerio 2: A book is rated higher than it actually is
    - In this scenerio, I could potentially read a book that I dislike
    

Personally, scenerio 2 is not an issue in my eyes. I will never regret reading a book, regardless if I like it or not. Time spent exploring other worlds via books is time well spent. Scenerio 1 is what is bothersome to me. I would be upset to learn that I opted out of reading a book based on a incorrect rating. 

Now, let's test this model on my "to be read" dataframe that I created!

Read in the data:

In [103]:
tbr = pd.read_csv('../data/tbr_list.csv')
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1


In [104]:
tbr['authors_and_titles'] = (tbr['author'] + ' : ' + tbr['title'])
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read,authors_and_titles
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1,Diana Gabaldon : Drangonfly in Amber
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0,Caroline Peckham : The Awakening
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1,Jennifer L. Armentrout : From Blood and Ash
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1,Jon Krakauer : Under the Banner of Heaven: A S...
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1,Georgia Hunter : We Were the Lucky Ones


In [105]:
test = tbr['authors_and_titles']

In [106]:
preds = pipe.predict(test)

In [107]:
results = pd.DataFrame({
    'authors_and_titles': test,
    'predicted_average_rating': preds,
    'actual_average_rating': tbr['average_rating']
})

In [108]:
results

Unnamed: 0,authors_and_titles,predicted_average_rating,actual_average_rating
0,Diana Gabaldon : Drangonfly in Amber,4.0,4.335582
1,Caroline Peckham : The Awakening,4.0,3.901469
2,Jennifer L. Armentrout : From Blood and Ash,4.0,4.24562
3,Jon Krakauer : Under the Banner of Heaven: A S...,4.0,4.0046
4,Georgia Hunter : We Were the Lucky Ones,4.0,4.407364
5,Hotel del Coronado Heritage Department : Beaut...,4.0,3.743243
6,Gene Miller : 83 Hours Till Dawn,4.0,3.766667
7,Claudia Oshry : Girl With No Job: The Crazy Be...,4.0,3.899354
8,"Andrew E. Kaufman : The Lion, The Lamb, The Hu...",4.0,3.777386
9,Micheal Connelly : The Lincoln Lawyer,4.0,4.201631


> All predicted ratings are 4 stars, which is peculiar, but not super surprising, due to the fact that the model was created off of 95% 4 star ratings

#### Books by isbn:

Let's try again with the model that used isbn + additional data

In [109]:
X = books_by_isbn.drop(columns=['language_code', 'average_rating'])
y = books_by_isbn['average_rating']

In [110]:
X = pd.get_dummies(columns = ['isbn'], data = X, drop_first = True)

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [112]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=3, n_neighbors = 3))
])

In [113]:
pipe.fit(X_train, y_train)

In [114]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.986922554347826
0.9730005094243505


In [115]:
preds = pipe.predict(X_test)

In [116]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [117]:
wrongs = results[results.actual != results.predicted]
wrongs.sort_values(by='actual')

Unnamed: 0,actual,predicted
1374,3.0,4.0
6437,3.0,4.0
2344,3.0,4.0
1805,3.0,4.0
6937,3.0,4.0
1008,3.0,4.0
4627,3.0,4.0
838,3.0,4.0
4744,3.0,4.0
3715,3.0,4.0


In [118]:
wrongs.shape

(53, 2)

> 53 incorrect predictions. Better than the NLP model

This model cannot be tested on my TBR list due to having to dummy the isbn column...

# Binary Classification Models:

#### Let's re-create some of these models using binary classification

The way that I rounded the average ratings in the cleaning stage heavily influenced the class weights. 95% of the average ratings were a 4 star rating, hence creating an unbalanced model. I reworked the data to make it a binary classification instead. Those with an average rating of above 4 is labeled under "should_i_read" as a 1 (meaning yes) and anything else is labeled a 0 (meaning no, I should not read)

#### Read in the data:

In [119]:
binary = pd.read_csv('../data/books_for_binary_classification.csv')
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0


## NLP models using Author and Title

Combine Authors & Titles:

In [120]:
binary['authors_and_titles'] = (binary['authors'] + ' : ' + binary['title'])
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read,authors_and_titles
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1,Suzanne Collins : The Hunger Games (The Hunger...
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1,Harper Lee : To Kill a Mockingbird
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0,F. Scott Fitzgerald : The Great Gatsby


Label X and y:

In [121]:
X = binary['authors_and_titles']
y = binary['should_i_read']

In [122]:
y.value_counts(normalize=True)

1    0.525793
0    0.474207
Name: should_i_read, dtype: float64

> Not perfect 50% - 50%, but balance much better than before!

Baseline:

In [123]:
y.mean()

0.5257928926251433

Train-test split:

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [125]:
X_train.shape

(5888,)

In [126]:
y_train.shape

(5888,)

# Model #1 & #2: Basic MNB

In [127]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [128]:
pipe.fit(X_train, y_train)

In [129]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9006453804347826
0.6607233825776873


In [130]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [131]:
pipe.fit(X_train, y_train)

In [132]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.903702445652174
0.6576668364747835


> Count Vectorizer outperforms the TfidVectorizer ever so slightly

# Models #3 & #4: Basic KNN

In [133]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [134]:
pipe.fit(X_train, y_train)

In [135]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.774796195652174
0.6464595007641365


In [136]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [137]:
pipe.fit(X_train, y_train)

In [138]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.7878736413043478
0.674477840040754


# Models #5 & #6: Basic Logistic Regression

In [139]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [140]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [141]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9794497282608695
0.6780438104941416


In [142]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [143]:
pipe.fit(X_train, y_train)

In [144]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9268002717391305
0.6795720835455935


# Models #7 & #8: Basic Decision Tree

In [145]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [146]:
pipe.fit(X_train, y_train)

In [147]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.6490066225165563


In [148]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier())
])

In [149]:
pipe.fit(X_train, y_train)

In [150]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
0.6357615894039735


> These Decision Trees are too overfit

# Models #9 & #10: Basic Bagging

In [151]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [152]:
pipe.fit(X_train, y_train)

In [153]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9707880434782609
0.6627610799796231


In [154]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [155]:
pipe.fit(X_train, y_train)

In [156]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.977921195652174
0.6408558329088131


# Models #11 & 12: Basic Random Forrest

In [157]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

In [158]:
pipe.fit(X_train, y_train)

In [159]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998301630434783
0.6821192052980133


In [160]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [161]:
pipe.fit(X_train, y_train)

In [162]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998301630434783
0.6546102903718798


# Models #13 & #14: Basic AdaBoost

In [163]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [164]:
pipe.fit(X_train, y_train)

In [165]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.6134510869565217
0.5792154865002547


In [166]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [167]:
pipe.fit(X_train, y_train)

In [168]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.6141304347826086
0.5746306673458992


> These are the worst models so far

### Logistic Regression was the best basic model, let's try to make it better...

# Model 15

In [169]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

In [170]:
pipe.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'lr': LogisticRegression(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': '

In [171]:
param = {
    'tfidf__ngram_range': [(1, 1), (1,2), (2,2), (2, 3)],
    'tfidf__stop_words': [None, 'english'],
    'lr__penalty': ['l1', 'l2']
}

In [172]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [173]:
gs.fit(X_train, y_train)

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/kierste

In [174]:
gs.best_params_

{'lr__penalty': 'l2',
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [175]:
gs.best_score_

0.686314288024208

In [176]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9526154891304348
0.6887417218543046


## Models 16 - will again predict the average rating but instead will be using other features such as isbn, publication year, ratings count (total and of each star)

#### Read in the data:

In [177]:
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read,authors_and_titles
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1,Suzanne Collins : The Hunger Games (The Hunger...
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1,Harper Lee : To Kill a Mockingbird
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0,F. Scott Fitzgerald : The Great Gatsby


Label X and y:

In [178]:
X = binary.drop(columns=['isbn', 'authors', 'title', 'should_i_read', 'authors_and_titles'])
y = binary['should_i_read']

In [179]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7851 entries, 0 to 7850
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   original_publication_year  7851 non-null   float64
 1   average_rating             7851 non-null   float64
 2   ratings_count              7851 non-null   int64  
 3   ratings_1                  7851 non-null   int64  
 4   ratings_2                  7851 non-null   int64  
 5   ratings_3                  7851 non-null   int64  
 6   ratings_4                  7851 non-null   int64  
 7   ratings_5                  7851 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 490.8 KB


In [180]:
# # I have re-edited the above X so no longer include isbn
# X = pd.get_dummies(columns = ['isbn'], data = X, drop_first = True)

In [181]:
X.shape

(7851, 8)

Train-test split:

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [183]:
y.value_counts(normalize=True)

1    0.525793
0    0.474207
Name: should_i_read, dtype: float64

# Model 16 - Basic KNN

In [184]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [185]:
pipe.fit(X_train, y_train)

In [186]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9949048913043478
0.9923586347427407


# Model 17 - Decision Tree

In [187]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [188]:
pipe.fit(X_train, y_train)

In [189]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
1.0


> A very overfit model

# Model 18 - Basic Random Forest

In [190]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier())
])

In [191]:
pipe.fit(X_train, y_train)

In [192]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

1.0
1.0


> Another very overfit model

# Model 19 - Logistic Regession

In [193]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rf', LogisticRegression())
])

In [194]:
pipe.fit(X_train, y_train)

In [195]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9998301630434783
0.999490575649516


# Predicitons

I have created two dataframes of test values based off of my "To Be Read" list. Let's see how these models predict the ratings:

#### Authors & Titles only:

Using the "best params" from model 15:

In [196]:
X = binary['authors_and_titles']
y = binary['should_i_read']

In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [198]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
    ('lr', LogisticRegression(penalty='l2'))
])

In [199]:
pipe.fit(X_train, y_train)

In [200]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9526154891304348
0.6887417218543046


In [201]:
preds = pipe.predict(X_test)

In [202]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [203]:
wrongs = results[results.actual != results.predicted]
# wrongs.sort_values(by='actual')

In [204]:
wrongs.shape

(611, 2)

> WOW, 611 incorrect predictions! Many were predicted that I should read them, when I shouldn't

**What do these incorrect predictions mean:**
- Scenerio 1: A book is rated lower than it actually is
    - In this scenerio, I could potentially opt not to read it and miss out on a great book
- Scenerio 2: A book is rated higher than it actually is
    - In this scenerio, I could potentially read a book that I dislike
    

Personally, scenerio 2 is not an issue in my eyes. I will never regret reading a book, regardless if I like it or not. Time spent exploring other worlds via books is time well spent. Scenerio 1 is what is bothersome to me. I would be upset to learn that I opted out of reading a book based on a incorrect rating. 

Now, let's test this model on my "to be read" dataframe that I created!

Read in the data:

In [205]:
tbr = pd.read_csv('../data/tbr_list.csv')
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1


In [206]:
tbr['authors_and_titles'] = (tbr['author'] + ' : ' + tbr['title'])
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read,authors_and_titles
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1,Diana Gabaldon : Drangonfly in Amber
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0,Caroline Peckham : The Awakening
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1,Jennifer L. Armentrout : From Blood and Ash
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1,Jon Krakauer : Under the Banner of Heaven: A S...
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1,Georgia Hunter : We Were the Lucky Ones


In [207]:
test = tbr['authors_and_titles']

In [208]:
preds = pipe.predict(test)

In [209]:
results = pd.DataFrame({
    'authors_and_titles': test,
    'predicted_average_rating': preds,
    'actual_average_rating': tbr['average_rating']
})

In [210]:
results

Unnamed: 0,authors_and_titles,predicted_average_rating,actual_average_rating
0,Diana Gabaldon : Drangonfly in Amber,1,4.335582
1,Caroline Peckham : The Awakening,0,3.901469
2,Jennifer L. Armentrout : From Blood and Ash,1,4.24562
3,Jon Krakauer : Under the Banner of Heaven: A S...,0,4.0046
4,Georgia Hunter : We Were the Lucky Ones,1,4.407364
5,Hotel del Coronado Heritage Department : Beaut...,0,3.743243
6,Gene Miller : 83 Hours Till Dawn,1,3.766667
7,Claudia Oshry : Girl With No Job: The Crazy Be...,0,3.899354
8,"Andrew E. Kaufman : The Lion, The Lamb, The Hu...",1,3.777386
9,Micheal Connelly : The Lincoln Lawyer,1,4.201631


> All predicted ratings are 4 stars, which is peculiar, but not super surprising, due to the fact that the model was created off of 95% 4 star ratings

#### Books by isbn:

Let's try again with the model that used isbn + additional data

In [211]:
X = books_by_isbn.drop(columns=['language_code', 'average_rating'])
y = books_by_isbn['average_rating']

In [212]:
X = pd.get_dummies(columns = ['isbn'], data = X, drop_first = True)

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [214]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier(n_jobs=3, n_neighbors = 3))
])

In [215]:
pipe.fit(X_train, y_train)

In [216]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.986922554347826
0.9730005094243505


In [217]:
preds = pipe.predict(X_test)

In [218]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [219]:
wrongs = results[results.actual != results.predicted]
wrongs.sort_values(by='actual')

Unnamed: 0,actual,predicted
1374,3.0,4.0
6437,3.0,4.0
2344,3.0,4.0
1805,3.0,4.0
6937,3.0,4.0
1008,3.0,4.0
4627,3.0,4.0
838,3.0,4.0
4744,3.0,4.0
3715,3.0,4.0


In [220]:
wrongs.shape

(53, 2)

> 53 incorrect predictions. Better than the NLP model

This model cannot be tested on my TBR list due to having to dummy the isbn column...