In [1]:
# !pip install lazypredict

In [2]:
import pandas as pd
# Location of the book list CSV (raw file served from the repository's `main` branch).
# NOTE(review): `main` is a moving target, so the data may change between runs — consider pinning a commit.
url = 'https://raw.githubusercontent.com/lisb020/scifi_book_game/main/BookList.csv'
# Read the CSV straight from the URL into a DataFrame (requires network access).
pd_df = pd.read_csv(url)

# Preview the first rows.
pd_df.head()

Unnamed: 0.1,Unnamed: 0,Book_Title,Rating_score,Rating_votes,Book_Description,subgenre
0,0,Obsidian,4.17,236780,Starting over sucks When we moved to West Virg...,sf_aliens
1,1,Onyx,4.27,153429,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,sf_aliens
2,2,The 5th Wave,4.03,400600,After the 1st wave only darkness remains Aft...,sf_aliens
3,3,The Host,3.84,915026,Melanie Stryder refuses to fade away The eart...,sf_aliens
4,5,Origin,4.35,93979,Daemon will do anything to get Katy back After...,sf_aliens


In [3]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV — verify).
# Reassigning instead of using inplace=True, together with errors="ignore", keeps
# this cell re-runnable: running it a second time no longer raises KeyError
# because the column is already gone.
pd_df = pd_df.drop(columns="Unnamed: 0", errors="ignore")
pd_df.head()

Unnamed: 0,Book_Title,Rating_score,Rating_votes,Book_Description,subgenre
0,Obsidian,4.17,236780,Starting over sucks When we moved to West Virg...,sf_aliens
1,Onyx,4.27,153429,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,sf_aliens
2,The 5th Wave,4.03,400600,After the 1st wave only darkness remains Aft...,sf_aliens
3,The Host,3.84,915026,Melanie Stryder refuses to fade away The eart...,sf_aliens
4,Origin,4.35,93979,Daemon will do anything to get Katy back After...,sf_aliens


In [4]:
import re
def clean_text(df, text_field, new_text_field_name):
    """Write a lower-cased, symbol-free, digit-free copy of a text column.

    The cleaned text is stored in ``df[new_text_field_name]``; ``df`` itself is
    modified in place and also returned.

    Args:
        df: DataFrame holding the text column.
        text_field: Name of the column to clean (must support the ``.str`` accessor).
        new_text_field_name: Name of the column that receives the cleaned text.

    Returns:
        The same ``df`` object, with ``new_text_field_name`` added or overwritten.
    """
    # Removes, in one pass: "@handle" tokens, every character outside
    # [0-9A-Za-z space tab], "scheme://..." URLs, a leading "rt", and "http"
    # followed by one more character.
    noise_pattern = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    )
    # Digits survive the first pass, so they are stripped separately.
    digits_pattern = re.compile(r"\d+")

    def _scrub(text):
        return digits_pattern.sub("", noise_pattern.sub("", text))

    # Missing values come out of .str.lower() as NaN; mapping them through
    # str() would inject the literal word "nan" into the corpus, so they are
    # replaced with an empty string first.
    lowered = df[text_field].str.lower().fillna("")
    df[new_text_field_name] = lowered.map(_scrub)

    return df
# clean_text adds the new column to the frame it receives and returns that same
# frame, so passing pd_df directly would make data_clean an alias of pd_df and
# silently modify the loaded data. Clean a copy instead so pd_df stays as loaded.
data_clean = clean_text(pd_df.copy(), 'Book_Description', 'text_clean')
data_clean.head()

Unnamed: 0,Book_Title,Rating_score,Rating_votes,Book_Description,subgenre,text_clean
0,Obsidian,4.17,236780,Starting over sucks When we moved to West Virg...,sf_aliens,starting over sucks when we moved to west virg...
1,Onyx,4.27,153429,BEING CONNECTED TO DAEMON BLACK SUCKS… Thanks ...,sf_aliens,being connected to daemon black sucks thanks t...
2,The 5th Wave,4.03,400600,After the 1st wave only darkness remains Aft...,sf_aliens,after the st wave only darkness remains afte...
3,The Host,3.84,915026,Melanie Stryder refuses to fade away The eart...,sf_aliens,melanie stryder refuses to fade away the eart...
4,Origin,4.35,93979,Daemon will do anything to get Katy back After...,sf_aliens,daemon will do anything to get katy back after...


In [5]:
import nltk.corpus
# Fetch the NLTK stop-word corpus (requires the `nltk` package and network access).
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per word of every description; a frozenset makes
# each test O(1) instead of scanning the whole list. `stop` stays a list.
stop_lookup = frozenset(stop)
# Drop English stop words from the cleaned text, re-joining the rest with single spaces.
data_clean['text_clean'] = data_clean['text_clean'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_lookup)
)
data_clean.head()

ModuleNotFoundError: No module named 'nltk'

In [None]:
def remove_nums(row):
  """Return the row's 'text_clean' value, as a string, with every digit character removed."""
  text = str(row['text_clean'])
  kept_chars = [ch for ch in text if not ch.isdigit()]
  return ''.join(kept_chars)

# Apply remove_nums to every row (axis=1) and overwrite the cleaned-text column.
data_clean['text_clean'] = data_clean.apply(remove_nums, axis=1)

In [None]:
def remove_multi_spaces(row):
  """Return the row's 'text_clean' value, as a string, with whitespace runs collapsed.

  Leading and trailing whitespace is dropped and every internal run of
  whitespace becomes a single space.
  """
  tokens = str(row['text_clean']).split()
  return " ".join(tokens)

# Apply remove_multi_spaces to every row (axis=1) and overwrite the cleaned-text column.
data_clean['text_clean'] = data_clean.apply(remove_multi_spaces, axis=1)

In [None]:
def remove_unicode(row):
  """Return the row's 'text_clean' value, as a string, keeping only ASCII characters."""
  text = str(row['text_clean'])
  # Code points below 128 are exactly the characters an ASCII encode keeps.
  return ''.join(ch for ch in text if ord(ch) < 128)

# Apply remove_unicode to every row (axis=1) and overwrite the cleaned-text column.
data_clean['text_clean'] = data_clean.apply(remove_unicode, axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
# Features: the cleaned description text. Target: the raw (string) subgenre label.
X= data_clean['text_clean']
y= data_clean['subgenre']
# Split into train/test sets; random_state=1 fixes the shuffle.
# NOTE(review): a later cell calls train_test_split again with encoded labels and
# rebinds X_train/X_test/y_train/y_test, so this split looks superseded — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Raw ndarray view of the frame; kept so the name `data` stays defined for any
# cell that still reads it.
data = data_clean.values
# Select the label column by name. The previous positional lookup (column 4)
# only pointed at 'subgenre' for one exact column order and would silently
# encode a different column if that order changed.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_clean['subgenre'].to_numpy())
print(y)

In [None]:
from sklearn.model_selection import train_test_split
# Split the cleaned descriptions and the integer-encoded labels `y` into
# train/test sets. random_state=1 fixes the shuffle; no test_size is passed, so
# scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(data_clean['text_clean'], y, random_state=1)
# Show the training texts and their encoded labels.
print(X_train)
print(y_train)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: ASCII accent stripping, lower-casing, built-in
# English stop words, and a token pattern that requires at least one letter.
cv = CountVectorizer(
    strip_accents='ascii',
    token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
    lowercase=True,
    stop_words='english',
)
# Learn the vocabulary from the training texts only, then reuse it for the test texts.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# One column per vocabulary term, one row per training document.
word_freq_df = pd.DataFrame(X_train_cv.toarray(), columns=feature_names)
# Total count of each term across the training set, most frequent first.
top_words_df = pd.DataFrame(word_freq_df.sum()).sort_values(0, ascending=False)
top_words_df.head(10)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training counts (fit returns
# the estimator itself), then predict labels for the test counts.
naive_bayes = MultinomialNB().fit(X_train_cv, y_train)
predictions = naive_bayes.predict(X_test_cv)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# For single-label multiclass predictions, micro-averaged precision and recall
# are both equal to accuracy, so the previous output printed the same number
# three times. Macro averaging (unweighted mean over classes) shows how the
# model does per subgenre.
print('Accuracy score: ', accuracy_score(y_test, predictions))
print('Precision score (macro): ', precision_score(y_test, predictions, average='macro'))
print('Recall score (macro): ', recall_score(y_test, predictions, average='macro'))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Class names in encoded order, taken from the fitted encoder instead of a
# hand-copied list, so the tick labels cannot drift from the integer codes.
class_names = list(label_encoder.classes_)
# Pin `labels` so the matrix always has one row/column per class, even if a
# class is missing from the test split.
cm = confusion_matrix(y_test, predictions, labels=list(range(len(class_names))))
# fmt='d' prints the integer counts exactly instead of the default '.2g' rounding.
ax = sns.heatmap(cm, square=True, annot=True, fmt='d', cmap='RdBu', cbar=False,
                 xticklabels=class_names, yticklabels=class_names)
# scikit-learn's confusion_matrix puts true labels on rows (heatmap y-axis) and
# predicted labels on columns (heatmap x-axis); the labels were swapped before.
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')
plt.show()

In [None]:
# import lazypredict
# from lazypredict.Supervised import LazyClassifier
# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(X_train_cv, X_test_cv, y_train, y_test)

# print(models)