In [94]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tqdm.notebook as tq

# Adding book title to each review

In [3]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')
books = pd.read_csv('csc2515-project-main/csc2515-project-main/data/goodreads_book_info.csv')

rev.shape, books.shape

((275607, 7), (2360655, 3))

In [4]:
rev.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id
0,5b78fd996dd2af3e863bec412a410512,2012-01-02,"[[0, Essential reading for anyone serious abou...",5,False,29999,ad2bf9e7e72997ec053fe9d666a76240
1,e21627c07b1c16a64b1d55afb0801cd3,2015-12-22,"[[0, Very informative.], [0, I feel like this ...",3,False,6882274,a0ce4c9e8a6e644d90851330494615e4
2,40272245ceb3404a973ab053f636dcf8,2016-02-29,"[[0, Really 4.4 stars - but goodreads only let...",4,False,17565845,78c8562226c80c7b7ca89afffa2c17dc
3,788c16647e9ba6f337a76f51b92cadc9,2013-08-23,"[[0, This was on my wish list a long time ago,...",1,False,9681214,6e7a6c60e7cdca91430dfb6113ad50b3
4,60ba13d8c742b84b2ec2445d9e04afe6,2013-11-23,"[[0, I loved this book!!], [0, When I first he...",5,True,11614718,2abcd2f0c63ea8de00024f7f22e167d4


In [34]:
books.head()

Unnamed: 0,description,book_id,title,reviews
115,The future world is at peace.\nElla Shepherd h...,22642971,The Body Electric,Oooh this is coming on my birthday. PERFECT BI...
282,Ready or not...love will find a way \nSingle d...,32336119,Worth the Wait (Guthrie Brothers #2),I love small town romances. A place where ever...
367,This is Maddy Turner's lucky day. The civilize...,2741853,Slow Hands,"Cute story, if a little predictable. For a FRE..."
376,A classic work of science fiction by renowned ...,12077902,Solaris: The Definitive Edition,Having been confused by both movie versions of...
383,After a series of explosive encounters with tw...,7843586,"More (More, #1)",A new favorite author! This is going into my f...


In [12]:
rev.book_id = rev.book_id.apply(np.int64)

In [16]:
books = books[books.book_id.isin(rev.book_id)]

In [17]:
books.shape

(24873, 3)

In [18]:
rev = rev.merge(books[['book_id', 'title']], on='book_id')

In [22]:
rev.to_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')

In [33]:
rev['flat_reviews'] = rev.review_sentences.apply(lambda x: ' '.join([s[1] for s in x]))
books['reviews'] = books.book_id.apply(lambda x: ' '.join(rev[rev.book_id == x].flat_reviews.values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['reviews'] = books.book_id.apply(lambda x: ' '.join(rev[rev.book_id == x].flat_reviews.values))


In [37]:
books.reviews.values[3]

"Having been confused by both movie versions of this book I decided to give the original a go and see if it's any better. Before I started on this I imagining it was going to be a total mess and I might have to abandon it. But, it was the exact opposite! The book quickly gets to the heart of the matter and all the confusion of the movies was explained in the first 10-20%. Having now read Solaris, I seriously cannot see how this could be made into an effective movie without tons of voice overs. Most of the drama is psychological in nature and revolves around the central characters thoughts and emotions. What both the movies kind of left out is the nature of the planet Solaris. It contains a vast alien intelligence, totally unlike any thing encountered before and mostly incomprehensible to the scientists trying to study it. It's a great example of alien contact and world building rolled into one. Anyone who didn't like the movie, but is still wondering why this is such a renowned work sh

In [39]:
books.to_csv('csc2515-project-main/csc2515-project-main/data/goodreads_books_revs.csv', index=False)

# building tf-idf matrix

In [48]:
count_vect = CountVectorizer()
word_idx = count_vect.fit_transform(books.reviews.values)
word_idx.shape

(24873, 168886)

In [50]:
count_vect.vocabulary_.get('snout')

137821

In [52]:
tfidf_transformer = TfidfTransformer().fit(word_idx)
books_tfidf = tfidf_transformer.transform(word_idx)
books_tfidf.shape

(24873, 168886)

In [54]:
books_tfidf

<24873x168886 sparse matrix of type '<class 'numpy.float64'>'
	with 14092982 stored elements in Compressed Sparse Row format>

In [87]:
df = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads_sent_spoil.pkl')

In [88]:
df.head()

Unnamed: 0,sentence,has_spoiler
0,Essential reading for anyone serious about mys...,0
1,Hammett is the grandfather of the hard-bitten ...,0
2,While his Continental Op was the prototype for...,0
3,"Expect sharp dialog, vivid characters and a vo...",0
4,If you've seen John Huston's 1941 film version...,0


In [89]:
# df = df.sample(frac=0.1)

In [90]:
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

In [79]:
X_train = tfidf_transformer.transform(count_vect.transform(df_train.sentence.values))

# Fitting NB classifier

In [80]:
nb = MultinomialNB().fit(X_train, df_train.has_spoiler)

In [91]:
X_test = tfidf_transformer.transform(count_vect.transform(df_test.sentence.values))
predicted = nb.predict(X_test)

In [92]:
roc_auc_score(df_test.has_spoiler, predicted)

0.7153545101231146

In [83]:
predicted[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

# that didn't go to well! Let's try with a balanced dataset

In [85]:
df = pd.concat([df[df.has_spoiler==1], df[df.has_spoiler==0].sample(len(df[df.has_spoiler==1]))]).sample(frac=1)
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

X_train = tfidf_transformer.transform(count_vect.transform(df_train.sentence.values))
X_test = tfidf_transformer.transform(count_vect.transform(df_test.sentence.values))

nb = MultinomialNB().fit(X_train, df_train.has_spoiler)
predicted = nb.predict(X_test)

In [86]:
roc_auc_score(df_test.has_spoiler, predicted)

0.6870193939592415

# Now that's much better, but somehow doesn't cut it
Looks like the task is a wee bit too tough for a simple model like this, but the transformer-based models might benefit from these features, or at least book titles

In [93]:
# roc-auc: 69% for a balanced test set, 72% for the original proportions set

# Preparing transformer input sentences: adding book titles

In [96]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')
sent = []
spoil = []
for i in tq.tqdm(range(len(rev))):
    for j in range(len(rev.iloc[i]['review_sentences'])):
        sent.append(rev.iloc[i].title + ' [SEP] ' + rev.iloc[i]['review_sentences'][j][1])   
        spoil.append(rev.iloc[i]['review_sentences'][j][0])

  0%|          | 0/275607 [00:00<?, ?it/s]

In [97]:
df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler']) 

In [99]:
df.to_pickle("csc2515-project-main/csc2515-project-main/data/goodreads_sent_spoil_titles.pkl")

In [100]:
df.head()

Unnamed: 0,sentence,has_spoiler
0,The Maltese Falcon [SEP] Essential reading for...,0
1,The Maltese Falcon [SEP] Hammett is the grandf...,0
2,The Maltese Falcon [SEP] While his Continental...,0
3,"The Maltese Falcon [SEP] Expect sharp dialog, ...",0
4,The Maltese Falcon [SEP] If you've seen John H...,0
