# Imports:

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
import pickle

In [8]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

_Before we get to modeling, here is the process that got me to this point..._

> I originally built multi-class classification models to predict the average rating of a book. I rounded the average rating feature to the nearest whole number. This created very unbalanced classes and predictions (all predictions were 4 stars (the heaviest weighted class)).

> These models were broken down into NLP models (using Author and Title) and other models that used information such as isbn, publishing year, number of ratings, etc. 

> Once the best model was established for NLP and otherwise, I used the models to make prediction on my "to be read" list of books that I created in the "02-Cleaning" notebook. However, using the feature "isbn" meant dummifying that column, which then prevented me from using those models to make predictions on unseen data. This number is not relevent to the predictive power of the model, so I will not be using that feature in this notebook.

> In this notebook I will be composing binary classification models on the book data to determine if I should read said book or not

should_i_read_
- 1 --> "Yes"
- 0 --> "No"

If you wish to follow this process for yourself, all previous work can be found in the scatch folder under 04-Modeling.ipbynb

## NLP models using Author and Title

#### Read in the data:

In [9]:
binary = pd.read_csv('../data/books_for_binary_classification.csv')

In [10]:
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0


Combine Authors & Titles:

In [11]:
binary['authors_and_titles'] = (binary['authors'] + ' : ' + binary['title'])
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read,authors_and_titles
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1,Suzanne Collins : The Hunger Games (The Hunger...
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1,Harper Lee : To Kill a Mockingbird
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0,F. Scott Fitzgerald : The Great Gatsby


Label X and y:

In [12]:
X = binary['authors_and_titles']
y = binary['should_i_read']

In [13]:
y.value_counts(normalize=True)

1    0.525793
0    0.474207
Name: should_i_read, dtype: float64

Baseline:

In [14]:
y.mean()

0.5257928926251433

Train-test split:

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [16]:
X_train.shape

(5888,)

In [17]:
y_train.shape

(5888,)

# Model #1 & #2: Basic MNB

In [18]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [19]:
pipe.fit(X_train, y_train)

In [20]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9006453804347826
0.6607233825776873


In [21]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [22]:
pipe.fit(X_train, y_train)

In [23]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.903702445652174
0.6576668364747835


# Models #3 & #4: Basic KNN

In [24]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [25]:
pipe.fit(X_train, y_train)

In [26]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.774796195652174
0.6464595007641365


In [27]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

In [28]:
pipe.fit(X_train, y_train)

In [29]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.7878736413043478
0.674477840040754


# Models #5 & #6: Basic Logistic Regression

In [30]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [31]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9794497282608695
0.6780438104941416


In [33]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(multi_class='multinomial'))
])

In [34]:
pipe.fit(X_train, y_train)

In [35]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9268002717391305
0.6795720835455935


# Models #7 & #8: Basic Bagging

In [36]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bag', BaggingClassifier())
])

In [37]:
pipe.fit(X_train, y_train)

In [38]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9750339673913043
0.6602139582272033


In [39]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('bag', BaggingClassifier())
])

In [40]:
pipe.fit(X_train, y_train)

In [41]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9757133152173914
0.6444218033622007


# Models #9 & #10: Basic AdaBoost

In [42]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [43]:
pipe.fit(X_train, y_train)

In [44]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.6134510869565217
0.5792154865002547


In [45]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ab', AdaBoostClassifier())
])

In [46]:
pipe.fit(X_train, y_train)

In [47]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.6141304347826086
0.5741212429954152


### Logistic Regression was the best basic model, let's try to make it better...

# Model 11

In [48]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

In [49]:
pipe.get_params()

{'memory': None,
 'steps': [('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())],
 'verbose': False,
 'tfidf': TfidfVectorizer(),
 'lr': LogisticRegression(),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': '

In [50]:
param = {
    'tfidf__ngram_range': [(1, 1), (1,2), (2,2), (2, 3)],
    'tfidf__stop_words': [None, 'english'],
    'lr__penalty': ['l1', 'l2']
}

In [51]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [52]:
gs.fit(X_train, y_train)

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/kierste

In [53]:
gs.best_params_

{'lr__penalty': 'l2',
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [54]:
gs.best_score_

0.686314288024208

In [55]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9526154891304348
0.6887417218543046


#### NLP Findings:
> These models are all very weak and overfit due to the uniqueness of an author's name and a book's title. While they do beat baseline, they are still not great. The best model only has a 68.9% accuracy. 

### Pickling:

I am going to pickle model 11 so that it can be used in a streamlit app:

In [56]:
X = binary['authors_and_titles']
y = binary['should_i_read']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [58]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
    ('lr', LogisticRegression(penalty='l2'))
])

pipe.fit(X_train, y_train)

In [59]:
with open('../code/book_pipe.pkl', 'wb') as pickle_out:
    pickle_out = pickle.dump(pipe, pickle_out)

## Other models:

Let's remind ourselves what the data looks like:

In [50]:
binary.head()

Unnamed: 0,isbn,authors,original_publication_year,title,average_rating,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,should_i_read,authors_and_titles
0,439023483,Suzanne Collins,2008.0,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,66715,127936,560092,1481305,2706317,1,Suzanne Collins : The Hunger Games (The Hunger...
1,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,75504,101676,455024,1156318,3011543,1,"J.K. Rowling, Mary GrandPré : Harry Potter and..."
2,316015849,Stephenie Meyer,2005.0,"Twilight (Twilight, #1)",3.57,3866839,456191,436802,793319,875073,1355439,0,"Stephenie Meyer : Twilight (Twilight, #1)"
3,61120081,Harper Lee,1960.0,To Kill a Mockingbird,4.25,3198671,60427,117415,446835,1001952,1714267,1,Harper Lee : To Kill a Mockingbird
4,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,3.89,2683664,86236,197621,606158,936012,947718,0,F. Scott Fitzgerald : The Great Gatsby


Label X and y:

In [51]:
X = binary.drop(columns=['isbn', 'authors', 'title', 'average_rating', 'should_i_read', 'authors_and_titles'])
y = binary['should_i_read']

Train-test split:

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [53]:
X_train.shape

(5888, 7)

In [54]:
y_train.shape

(5888,)

# Models #1: Basic KNN

In [55]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [56]:
pipe.fit(X_train, y_train)

In [57]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9651834239130435
0.9429444727457973


# Models #2: Basic Logistic Regression

In [58]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())
])

In [59]:
pipe.fit(X_train, y_train)

In [60]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9723165760869565
0.9735099337748344


# Models #3: Basic Bagging

In [61]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('bag', BaggingClassifier())
])

In [62]:
pipe.fit(X_train, y_train)

In [63]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9979619565217391
0.9648497198166073


# Models #4: Basic AdaBoost

In [64]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('ab', AdaBoostClassifier())
])

In [65]:
pipe.fit(X_train, y_train)

In [66]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.959578804347826
0.9470198675496688


### Logistic Regression was the best basic model, let's try to make it better...

# Model 5:

In [67]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())
])

In [68]:
pipe.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('lr', LogisticRegression())],
 'verbose': False,
 'ss': StandardScaler(),
 'lr': LogisticRegression(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__warm_start': False}

In [69]:
param = {
    'lr__C': [0.25, 0.5, 0.75, 1],
    'lr__n_jobs': [1, 3, 5, 7],
    'lr__penalty': ['l1', 'l2']
}

In [70]:
gs = GridSearchCV(pipe, param, cv=10, n_jobs = 4)

In [71]:
gs.fit(X_train, y_train)

160 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kierstensouth/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/kiers

In [72]:
gs.best_params_

{'lr__C': 1, 'lr__n_jobs': 1, 'lr__penalty': 'l2'}

In [73]:
gs.best_score_

0.970786990517769

In [74]:
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

0.9723165760869565
0.9735099337748344


> Tweaking with the model doesn't seem to make a difference

#### Other Models Findings:
> These models are very strong. The Logistic Regression model produces a 97.4% accuracy rate

# Predicitons

I have created two dataframes of unseen test values that are based off of my "To Be Read" list. Let's see how these models predict the ratings:

#### Authors & Titles only:

Using the "best params" from model 11:

In [75]:
X = binary['authors_and_titles']
y = binary['should_i_read']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [77]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), stop_words='english')),
    ('lr', LogisticRegression(penalty='l2'))
])

In [78]:
pipe.fit(X_train, y_train)

In [79]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9526154891304348
0.6887417218543046


In [80]:
preds = pipe.predict(X_test)

In [81]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [82]:
wrongs = results[results.actual != results.predicted]
# wrongs.sort_values(by='actual')

In [83]:
y_test.shape

(1963,)

In [84]:
wrongs.shape

(611, 2)

In [85]:
wrongs.value_counts()

actual  predicted
0       1            348
1       0            263
dtype: int64

WOW, 611 incorrect predictions! That is a 68.87% accuracy rate.

- 348 books were predicated as "should read" books, when they are "should not read" books
- 263 books were predicted as "should not read" books, when they are "should read" books

Now, let's test this model on my "to be read" dataframe that I created!

Read in the data:

In [86]:
tbr = pd.read_csv('../data/tbr_list.csv')
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1


In [87]:
tbr['authors_and_titles'] = (tbr['author'] + ' : ' + tbr['title'])
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read,authors_and_titles
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1,Diana Gabaldon : Drangonfly in Amber
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0,Caroline Peckham : The Awakening
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1,Jennifer L. Armentrout : From Blood and Ash
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1,Jon Krakauer : Under the Banner of Heaven: A S...
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1,Georgia Hunter : We Were the Lucky Ones


In [88]:
test = tbr['authors_and_titles']

In [89]:
preds = pipe.predict(test)

In [90]:
results = pd.DataFrame({
    'authors_and_titles': test,
    'should_i_read_prediction': preds,
    'should_i_read_actual': tbr['should_i_read'],
    'correct' : preds - tbr['should_i_read']
})

In [91]:
results

Unnamed: 0,authors_and_titles,should_i_read_prediction,should_i_read_actual,correct
0,Diana Gabaldon : Drangonfly in Amber,1,1,0
1,Caroline Peckham : The Awakening,0,0,0
2,Jennifer L. Armentrout : From Blood and Ash,1,1,0
3,Jon Krakauer : Under the Banner of Heaven: A S...,0,1,-1
4,Georgia Hunter : We Were the Lucky Ones,1,1,0
5,Hotel del Coronado Heritage Department : Beaut...,0,0,0
6,Gene Miller : 83 Hours Till Dawn,1,0,1
7,Claudia Oshry : Girl With No Job: The Crazy Be...,0,0,0
8,"Andrew E. Kaufman : The Lion, The Lamb, The Hu...",1,0,1
9,Micheal Connelly : The Lincoln Lawyer,1,1,0


In [92]:
results.correct.value_counts()

 0    17
 1     6
-1     5
Name: correct, dtype: int64

11 incorrect predictions on my TBR list. That is a 60.7% accuracy rate on the unseen data:

- 17 correct predictions
- 11 incorrect predictions 
    - 6 labeled as "should read" when they are "should not read"
    - 5 labeled as "should not read" when they are "should read"

#### Books by additional data:

Let's try again with the model that used additional data

In [93]:
X = binary.drop(columns=['isbn', 'authors', 'title', 'average_rating', 'should_i_read', 'authors_and_titles'])
y = binary['should_i_read']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1002)

In [95]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())
])

In [96]:
pipe.fit(X_train, y_train)

In [97]:
print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.9723165760869565
0.9735099337748344


In [98]:
preds = pipe.predict(X_test)

In [99]:
results = pd.DataFrame({
    'actual': y_test,
    'predicted': preds
})

In [100]:
wrongs = results[results.actual != results.predicted]
wrongs.sort_values(by='actual')

Unnamed: 0,actual,predicted
6348,0,1
2091,0,1
4210,0,1
4070,0,1
2003,0,1
6172,0,1
4920,0,1
5324,0,1
7067,0,1
116,0,1


In [101]:
y_test.shape

(1963,)

In [102]:
wrongs.shape

(52, 2)

In [103]:
wrongs.value_counts()

actual  predicted
0       1            30
1       0            22
dtype: int64

52 incorrect predictions. Better than the NLP model! This has a 97.35% accuracy rate:

- 30 books were predicted as "should read" books, when they are "should not read" books
- 22 books were predicted as "should not read" books, when they are "should read" books

Now, let's test this model on my "to be read" dataframe that I created!

Read in the data:

In [104]:
tbr.head()

Unnamed: 0,author,title,isbn,original_publication_year,ratings_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,average_rating,should_i_read,authors_and_titles
0,Diana Gabaldon,Drangonfly in Amber,440215625,1992.0,347569,2241,7552,42197,114917,180662,4.335582,1,Diana Gabaldon : Drangonfly in Amber
1,Caroline Peckham,The Awakening,914425022,2019.0,127605,3946,7343,28740,44885,42691,3.901469,0,Caroline Peckham : The Awakening
2,Jennifer L. Armentrout,From Blood and Ash,952457769,2020.0,492577,10670,19053,60622,150507,251725,4.24562,1,Jennifer L. Armentrout : From Blood and Ash
3,Jon Krakauer,Under the Banner of Heaven: A Story of Violent...,330419129,2004.0,202595,3348,8245,39207,85127,66669,4.0046,1,Jon Krakauer : Under the Banner of Heaven: A S...
4,Georgia Hunter,We Were the Lucky Ones,399563083,2017.0,142715,1062,2124,12669,48620,78240,4.407364,1,Georgia Hunter : We Were the Lucky Ones


In [105]:
test = tbr[['original_publication_year', 'ratings_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4' ,'ratings_5']]

In [106]:
preds = pipe.predict(test)

In [107]:
results = pd.DataFrame({
    'authors_and_titles': tbr['authors_and_titles'],
    'should_i_read_prediction': preds,
    'should_i_read_actual': tbr['should_i_read'],
    'correct' : preds - tbr['should_i_read']
})

In [108]:
results

Unnamed: 0,authors_and_titles,should_i_read_prediction,should_i_read_actual,correct
0,Diana Gabaldon : Drangonfly in Amber,1,1,0
1,Caroline Peckham : The Awakening,0,0,0
2,Jennifer L. Armentrout : From Blood and Ash,1,1,0
3,Jon Krakauer : Under the Banner of Heaven: A S...,1,1,0
4,Georgia Hunter : We Were the Lucky Ones,1,1,0
5,Hotel del Coronado Heritage Department : Beaut...,1,0,1
6,Gene Miller : 83 Hours Till Dawn,1,0,1
7,Claudia Oshry : Girl With No Job: The Crazy Be...,0,0,0
8,"Andrew E. Kaufman : The Lion, The Lamb, The Hu...",0,0,0
9,Micheal Connelly : The Lincoln Lawyer,1,1,0


In [109]:
results.correct.value_counts()

0    26
1     2
Name: correct, dtype: int64

92.9% correct predictions on my TBR list!

Only 2 incorrect predictions on my TBR list:

- 26 correct predictions
- 2 incorrect predictions 
    - 2 labeled as "should read" when they are "should not read"

**What do incorrect predictions mean in this context?**
- Scenario 1: A book is predicted that I should not read it immediately (an average rating of less than 4 stars)
    - In this scenario, I could potentially opt not to read a book that could end my book hangover
- Scenario 2: A book is predicted that I should read it immediately (an average rating of higher than 4 stars)
    - In this scenario, I could potentially read a book that I dislike and it could end up not helping my book hangover


Truthfully, neither scenario is life or death. All books will get read eventually, this just determines the timeline of that. Scenario 2 is especially not an issue in my eyes. I will never regret reading a book, regardless if I like it or not. Time spent exploring other worlds via books is time well spent. If anything, Scenario 1 is what is bothersome to me. I would be upset to learn that I opted out of reading a book that could end my sorrows, based on a incorrect rating. 