In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tqdm.notebook as tq

# Adding book title to each review

In [3]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')
books = pd.read_csv('csc2515-project-main/csc2515-project-main/data/goodreads_book_info.csv')

rev.shape, books.shape

((275607, 7), (2360655, 3))

In [4]:
rev.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id
0,5b78fd996dd2af3e863bec412a410512,2012-01-02,"[[0, Essential reading for anyone serious abou...",5,False,29999,ad2bf9e7e72997ec053fe9d666a76240
1,e21627c07b1c16a64b1d55afb0801cd3,2015-12-22,"[[0, Very informative.], [0, I feel like this ...",3,False,6882274,a0ce4c9e8a6e644d90851330494615e4
2,40272245ceb3404a973ab053f636dcf8,2016-02-29,"[[0, Really 4.4 stars - but goodreads only let...",4,False,17565845,78c8562226c80c7b7ca89afffa2c17dc
3,788c16647e9ba6f337a76f51b92cadc9,2013-08-23,"[[0, This was on my wish list a long time ago,...",1,False,9681214,6e7a6c60e7cdca91430dfb6113ad50b3
4,60ba13d8c742b84b2ec2445d9e04afe6,2013-11-23,"[[0, I loved this book!!], [0, When I first he...",5,True,11614718,2abcd2f0c63ea8de00024f7f22e167d4


In [34]:
books.head()

Unnamed: 0,description,book_id,title,reviews
115,The future world is at peace.\nElla Shepherd h...,22642971,The Body Electric,Oooh this is coming on my birthday. PERFECT BI...
282,Ready or not...love will find a way \nSingle d...,32336119,Worth the Wait (Guthrie Brothers #2),I love small town romances. A place where ever...
367,This is Maddy Turner's lucky day. The civilize...,2741853,Slow Hands,"Cute story, if a little predictable. For a FRE..."
376,A classic work of science fiction by renowned ...,12077902,Solaris: The Definitive Edition,Having been confused by both movie versions of...
383,After a series of explosive encounters with tw...,7843586,"More (More, #1)",A new favorite author! This is going into my f...


In [12]:
rev.book_id = rev.book_id.apply(np.int64)

In [16]:
books = books[books.book_id.isin(rev.book_id)]

In [17]:
books.shape

(24873, 3)

In [18]:
rev = rev.merge(books[['book_id', 'title']], on='book_id')

In [22]:
rev.to_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')

In [33]:
rev['flat_reviews'] = rev.review_sentences.apply(lambda x: ' '.join([s[1] for s in x]))
books['reviews'] = books.book_id.apply(lambda x: ' '.join(rev[rev.book_id == x].flat_reviews.values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['reviews'] = books.book_id.apply(lambda x: ' '.join(rev[rev.book_id == x].flat_reviews.values))


In [37]:
books.reviews.values[3]

"Having been confused by both movie versions of this book I decided to give the original a go and see if it's any better. Before I started on this I imagining it was going to be a total mess and I might have to abandon it. But, it was the exact opposite! The book quickly gets to the heart of the matter and all the confusion of the movies was explained in the first 10-20%. Having now read Solaris, I seriously cannot see how this could be made into an effective movie without tons of voice overs. Most of the drama is psychological in nature and revolves around the central characters thoughts and emotions. What both the movies kind of left out is the nature of the planet Solaris. It contains a vast alien intelligence, totally unlike any thing encountered before and mostly incomprehensible to the scientists trying to study it. It's a great example of alien contact and world building rolled into one. Anyone who didn't like the movie, but is still wondering why this is such a renowned work sh

In [39]:
books.to_csv('csc2515-project-main/csc2515-project-main/data/goodreads_books_revs.csv', index=False)

# building tf-idf matrix

In [48]:
count_vect = CountVectorizer()
word_idx = count_vect.fit_transform(books.reviews.values)
word_idx.shape

(24873, 168886)

In [50]:
count_vect.vocabulary_.get('snout')

137821

In [52]:
tfidf_transformer = TfidfTransformer().fit(word_idx)
books_tfidf = tfidf_transformer.transform(word_idx)
books_tfidf.shape

(24873, 168886)

In [54]:
books_tfidf

<24873x168886 sparse matrix of type '<class 'numpy.float64'>'
	with 14092982 stored elements in Compressed Sparse Row format>

In [87]:
df = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads_sent_spoil.pkl')

In [88]:
df.head()

Unnamed: 0,sentence,has_spoiler
0,Essential reading for anyone serious about mys...,0
1,Hammett is the grandfather of the hard-bitten ...,0
2,While his Continental Op was the prototype for...,0
3,"Expect sharp dialog, vivid characters and a vo...",0
4,If you've seen John Huston's 1941 film version...,0


In [89]:
# df = df.sample(frac=0.1)

In [90]:
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

In [79]:
X_train = tfidf_transformer.transform(count_vect.transform(df_train.sentence.values))

# Fitting NB classifier

In [80]:
nb = MultinomialNB().fit(X_train, df_train.has_spoiler)

In [91]:
X_test = tfidf_transformer.transform(count_vect.transform(df_test.sentence.values))
predicted = nb.predict(X_test)

In [92]:
roc_auc_score(df_test.has_spoiler, predicted)

0.7153545101231146

In [83]:
predicted[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

# that didn't go to well! Let's try with a balanced dataset

In [85]:
df = pd.concat([df[df.has_spoiler==1], df[df.has_spoiler==0].sample(len(df[df.has_spoiler==1]))]).sample(frac=1)
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

X_train = tfidf_transformer.transform(count_vect.transform(df_train.sentence.values))
X_test = tfidf_transformer.transform(count_vect.transform(df_test.sentence.values))

nb = MultinomialNB().fit(X_train, df_train.has_spoiler)
predicted = nb.predict(X_test)

In [86]:
roc_auc_score(df_test.has_spoiler, predicted)

0.6870193939592415

# Now that's much better, but somehow doesn't cut it
Looks like the task is a wee bit too tough for a simple model like this, but the transformer-based models might benefit from these features, or at least book titles

In [93]:
# roc-auc: 69% for a balanced test set, 72% for the original proportions set

# Preparing transformer input sentences: adding book titles

In [96]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')
sent = []
spoil = []
for i in tq.tqdm(range(len(rev))):
    for j in range(len(rev.iloc[i]['review_sentences'])):
        sent.append(rev.iloc[i].title + ' [SEP] ' + rev.iloc[i]['review_sentences'][j][1])   
        spoil.append(rev.iloc[i]['review_sentences'][j][0])

  0%|          | 0/275607 [00:00<?, ?it/s]

In [97]:
df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler']) 

In [99]:
df.to_pickle("csc2515-project-main/csc2515-project-main/data/goodreads_sent_spoil_titles.pkl")

In [100]:
df.head()

Unnamed: 0,sentence,has_spoiler
0,The Maltese Falcon [SEP] Essential reading for...,0
1,The Maltese Falcon [SEP] Hammett is the grandf...,0
2,The Maltese Falcon [SEP] While his Continental...,0
3,"The Maltese Falcon [SEP] Expect sharp dialog, ...",0
4,The Maltese Falcon [SEP] If you've seen John H...,0


In [3]:
rev = pd.read_pickle('data/flattened_15K_test.pkl')
rev.head()

Unnamed: 0,user_id,timestamp,book_id,book_title,sent_num,sent_spoil,sentence,rating,has_spoiler,review_id
111980,bdca7a98aaf41a86e637c63ae39660e1,2015-06-13,22308717,The Last Time We Say Goodbye,9,0,I envy Alexis Riggs for that.,4,False,8f776a27a80e922e4a7d8cd493a41959
173287,c800ba07d8d3d7c81f122cffcc78bc73,2014-06-09,11713449,"Delirium (Delirium, #1)",19,0,Lena also had some fairly solid relationships ...,4,False,20b8a1c85ea93aa32ab404e2820ea80e
52708,24434c68039ba698f1ae91ecb7065f2a,2016-05-12,16034235,"Throne of Glass (Throne of Glass, #1)",8,0,It is believed that magic no longer exists.,4,False,87c6f9a4bd270ba942b74632d3227093
93324,5ffbda5c69a7ed61e185b9878b0d622d,2012-08-13,11919,"Haunted (Women of the Otherworld, #5)",29,0,They simply click.,3,False,c68a5b3433ee9da0021c02373084b497
71643,b012ddad3df33ae31b180d0b70095b42,2014-06-27,30122,The Missing Piece,3,0,Will the circle ever miss that piece it found ...,4,False,66b061efe7e6057792f4042f97712c12


In [5]:
sent = []
spoil = []
for i in tq.tqdm(range(len(rev))):
    sent.append(rev.iloc[i].book_title + ' [SEP] ' + rev.iloc[i].sentence)   
    spoil.append(rev.iloc[i].sent_spoil)

df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler']) 

  0%|          | 0/15001 [00:00<?, ?it/s]

In [7]:
df.has_spoiler.value_counts()

0    14485
1      516
Name: has_spoiler, dtype: int64

In [8]:
df.to_pickle("data/15ktest_goodreads_sent_spoil_titles.pkl")

In [4]:
df = pd.read_pickle("data/goodreads_sent_spoil_titles.pkl")
df.shape

(3534334, 2)

In [5]:
test = pd.read_pickle('data/15ktest_goodreads_sent_spoil_titles.pkl')
test.shape

(15001, 2)

In [6]:
test.head()

Unnamed: 0,sentence,has_spoiler
0,The Last Time We Say Goodbye [SEP] I envy Alex...,0
1,"Delirium (Delirium, #1) [SEP] Lena also had so...",0
2,"Throne of Glass (Throne of Glass, #1) [SEP] It...",0
3,"Haunted (Women of the Otherworld, #5) [SEP] Th...",0
4,The Missing Piece [SEP] Will the circle ever m...,0


In [8]:
# making sure test and train/val sets are disjoint
df = df[~df.sentence.isin(test.sentence)]
df.shape

(3518910, 2)

In [10]:
df.to_pickle("data/goodreads_sent_spoil_titles.pkl")

In [11]:
# saving a balanced version of the dataset
df = pd.concat([df[df.has_spoiler==1], df[df.has_spoiler==0].sample(len(df[df.has_spoiler==1]))]).sample(frac=1)
df.shape

(228460, 2)

In [12]:
df.to_pickle("data/goodreads_sent_spoil_titles_balanced.pkl")

In [13]:
df.has_spoiler.value_counts()

0    114230
1    114230
Name: has_spoiler, dtype: int64

# Prepare review-level dataset, instead of sentence-level

In [3]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')

In [4]:
rev.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id,title
0,5b78fd996dd2af3e863bec412a410512,2012-01-02,"[[0, Essential reading for anyone serious abou...",5,False,29999,ad2bf9e7e72997ec053fe9d666a76240,The Maltese Falcon
1,b492976229a8c400dac0f49e7fdbc089,2011-06-19,"[[0, This is a perfectly good book, but I star...",3,False,29999,a7e8220cddf1ab786d9ba51614d4d092,The Maltese Falcon
2,8b29249b521466c99ce780b0dc664cd7,2016-09-14,"[[0, I had a hard time deciding between 2 and ...",2,False,29999,893c6d3dd7e6694bf59b83e93accfe98,The Maltese Falcon
3,3b73ddb805671095f090c7d3b388b34e,2014-05-28,"[[0, I haven't read any books like this before...",3,False,29999,82f44972acd3a58f3520c9fd49e9e54b,The Maltese Falcon
4,0c2d6326b42e940a8be9f4e3895e7841,2016-03-21,"[[0, I read this book for my english class, an...",2,False,29999,bc07a4b4d36d763228bb29e5f0097ad2,The Maltese Falcon


In [5]:
rev.shape

(275607, 8)

In [7]:
rev = rev[['review_sentences', 'has_spoiler']]

In [13]:
rev['sentence'] = rev.review_sentences.apply(lambda x: ' '.join([s[1] for s in x]))
rev = rev[['sentence', 'has_spoiler']]

In [15]:
rev.has_spoiler = rev.has_spoiler.apply(int)

In [17]:
rev.has_spoiler.value_counts()

0    257628
1     17979
Name: has_spoiler, dtype: int64

In [20]:
rev.sentence.apply(len).describe()

count    275607.000000
mean       1058.580149
std        1252.834231
min           3.000000
25%         216.000000
50%         596.000000
75%        1458.000000
max       19494.000000
Name: sentence, dtype: float64

In [22]:
rev_train, rev_test = train_test_split(rev, test_size=0.05)
#rev.to_pkl('data/goodreads_review')

In [23]:
rev_train.value_counts()

sentence                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [27]:
rev_test = pd.read_pickle('data/goodreads_review_test.pkl')
rev_test.head()

Unnamed: 0,sentence,has_spoiler
235218,I was on a dystopia high once upon a time. Tha...,0
50163,I'm writing a review as I had a sudden desire ...,0
77334,Well this was certainly a wild ride! A Great R...,0
85401,Reread February 2016 for book club. Still disg...,1
109084,"Cora Carmack is a new author, to me. I recentl...",0


In [28]:
rev = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')

In [29]:
rev.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id,title
0,5b78fd996dd2af3e863bec412a410512,2012-01-02,"[[0, Essential reading for anyone serious abou...",5,False,29999,ad2bf9e7e72997ec053fe9d666a76240,The Maltese Falcon
1,b492976229a8c400dac0f49e7fdbc089,2011-06-19,"[[0, This is a perfectly good book, but I star...",3,False,29999,a7e8220cddf1ab786d9ba51614d4d092,The Maltese Falcon
2,8b29249b521466c99ce780b0dc664cd7,2016-09-14,"[[0, I had a hard time deciding between 2 and ...",2,False,29999,893c6d3dd7e6694bf59b83e93accfe98,The Maltese Falcon
3,3b73ddb805671095f090c7d3b388b34e,2014-05-28,"[[0, I haven't read any books like this before...",3,False,29999,82f44972acd3a58f3520c9fd49e9e54b,The Maltese Falcon
4,0c2d6326b42e940a8be9f4e3895e7841,2016-03-21,"[[0, I read this book for my english class, an...",2,False,29999,bc07a4b4d36d763228bb29e5f0097ad2,The Maltese Falcon


In [30]:
str(rev.review_sentences.values[0])

'[[0, \'Essential reading for anyone serious about mysteries or hard-boiled crime fiction.\'], [0, \'Hammett is the grandfather of the hard-bitten private detective.\'], [0, \'While his Continental Op was the prototype for this kind of protagonist, Sam Spade is at the root of this now-large family tree, and The Maltese Falcon is his iconic case.\'], [0, \'Expect sharp dialog, vivid characters and a voice that speaks with the tone of cigarettes and gin at three in the morning.\'], [0, "If you\'ve seen John Huston\'s 1941 film version of this story, you\'ll be amazed how little had to be changed for the screen; Hammett\'s prose, though of its time, is already cinematic."]]'

In [10]:
rev_test['title'] = rev_test.sentence.apply(lambda x: x.split(' [SEP] ')[0])
rev_test['sent'] = rev_test.sentence.apply(lambda x: x.split(' [SEP] ')[1])
rev_test.head()

Unnamed: 0,sentence,has_spoiler,title,sent
0,The Last Time We Say Goodbye [SEP] I envy Alex...,0,The Last Time We Say Goodbye,I envy Alexis Riggs for that.
1,"Delirium (Delirium, #1) [SEP] Lena also had so...",0,"Delirium (Delirium, #1)",Lena also had some fairly solid relationships ...
2,"Throne of Glass (Throne of Glass, #1) [SEP] It...",0,"Throne of Glass (Throne of Glass, #1)",It is believed that magic no longer exists.
3,"Haunted (Women of the Otherworld, #5) [SEP] Th...",0,"Haunted (Women of the Otherworld, #5)",They simply click.
4,The Missing Piece [SEP] Will the circle ever m...,0,The Missing Piece,Will the circle ever miss that piece it found ...


In [31]:
rev['sentence'] = rev.review_sentences.apply(lambda x: ' '.join([s[1] for s in x]))

AttributeError: 'DataFrame' object has no attribute 'sent'

In [32]:
rev_test = rev_test.merge(rev[['sentence', 'review_id', 'title', 'review_sentences']], on='sentence', how='left')
rev_test.shape

(18072, 5)

In [33]:
rev_test.head()

Unnamed: 0,sentence,has_spoiler,review_id,title,review_sentences
0,I was on a dystopia high once upon a time. Tha...,0,568238cfb89ae65a93ba4fe3d359bd23,"Proxy (Proxy, #1)","[[0, I was on a dystopia high once upon a time..."
1,I'm writing a review as I had a sudden desire ...,0,7b6ca39c3991cbd61e8341af92503530,"The Lies of Locke Lamora (Gentleman Bastard, #1)","[[0, I'm writing a review as I had a sudden de..."
2,Well this was certainly a wild ride! A Great R...,0,518c61992da036bfd4a4b8638dab25a2,A Great Reckoning (Chief Inspector Armand Gama...,"[[0, Well this was certainly a wild ride!], [0..."
3,Reread February 2016 for book club. Still disg...,1,8ff24f71cf40b7aaed2fea8920dc2462,"You (You, #1)","[[0, Reread February 2016 for book club.], [0,..."
4,"Cora Carmack is a new author, to me. I recentl...",0,64eded3480171fe8259722b5acf9551e,"All Lined Up (Rusk University, #1)","[[0, Cora Carmack is a new author, to me.], [0..."


In [34]:
rev_test.review_id.nunique()

14759

In [35]:
rev_test = rev_test.groupby('review_id').first().reset_index()

In [36]:
rev_test[rev_test.sentence.apply(len) < 20]

Unnamed: 0,review_id,sentence,has_spoiler,title,review_sentences
5,0019fcc7f7e6a726efcd4fa159219136,Review to come.,0,"The Raven Boys (The Raven Cycle, #1)","[[0, Review to come.]]"
48,00fdd3e56363d04170fb718af9c5b7e2,Review to come!,0,"Awaken (Spiral of Bliss, #3)","[[0, Review to come!]]"
54,011accca3a9aaf35a13edf907fd3d03f,Review to come.,0,P.S. I Like You,"[[0, Review to come.]]"
57,01268d4cc081ef14a08066174110ff7a,Review to come.,0,"The Lost Heiress (Ladies of the Manor, #1)","[[0, Review to come.]]"
85,01b8f4a3b78acf400bb2dcab060ecb57,Review to come,0,"Seduction of a Highland Lass (McCabe Trilogy, #2)","[[0, Review to come]]"
...,...,...,...,...,...
14680,fed1ff44b94cebbc1858aafa78aababb,Review coming soon.,0,"Once Bitten, Twice Shy (Jaz Parks, #1)","[[0, Review coming soon.]]"
14691,feef7403105f7944d998c9bca5a3c430,Review to come!,0,The Player,"[[0, Review to come!]]"
14692,fef67fecbbb7ce590253fc3457c5d9c7,Review to come.,0,I Breathe You,"[[0, Review to come.]]"
14720,ff63f812d638b72887c4404635cada8a,Review to come.,0,"The Maze of Bones (The 39 Clues, #1)","[[0, Review to come.]]"


In [39]:
sent = []
spoil = []
rev_spoil = []
rev_ids = []
for i in tq.tqdm(range(len(rev_test))):
    for j in range(len(rev_test.iloc[i]['review_sentences'])):
        sent.append(rev_test.iloc[i].title + ' [SEP] ' + rev_test.iloc[i]['review_sentences'][j][1])   
        spoil.append(rev_test.iloc[i]['review_sentences'][j][0])
        rev_spoil.append(rev_test.iloc[i].has_spoiler)
        rev_ids.append(rev_test.iloc[i].review_id)


df = pd.DataFrame(list(zip(sent, spoil, rev_spoil, rev_ids)), 
               columns =['sentence', 'has_spoiler', 'rev_has_spoiler', 'review_id'])

  0%|          | 0/14759 [00:00<?, ?it/s]

In [40]:
df.head()

Unnamed: 0,sentence,has_spoiler,rev_has_spoiler,review_id
0,Bird Box [SEP] I love the apocalyptic/post-apo...,0,0,000917af828f16428126427bee50d208
1,Bird Box [SEP] Not that these books aren't awe...,0,0,000917af828f16428126427bee50d208
2,Bird Box [SEP] This is the reason that despite...,0,0,000917af828f16428126427bee50d208
3,Bird Box [SEP] I could kick myself for my resi...,0,0,000917af828f16428126427bee50d208
4,"Bird Box [SEP] First, there is nary a zombie o...",0,0,000917af828f16428126427bee50d208


In [41]:
df.to_csv('data/sent_rev_test.csv', index=False)

In [71]:
res = pd.read_csv('data/sent_rev_res.csv')
res.head()

Unnamed: 0,sentence,pred,pred_prob,true_val
0,"Price of a Kiss (Forbidden Men, #1) [SEP] My m...",tensor([0.]),tensor([-7.0706]),tensor(0)
1,"Transparent (Transparent, #1) [SEP] Before I b...",tensor([0.]),tensor([-7.4107]),tensor(0)
2,Of Fire and Stars [SEP] Despite wanting to dnf...,tensor([0.]),tensor([-7.8571]),tensor(0)
3,Looking for Alibrandi [SEP] She is brilliant.,tensor([0.]),tensor([-7.8387]),tensor(0)
4,"Twisted (Tangled, #2) [SEP] Reviewed by Candac...",tensor([0.]),tensor([-7.8722]),tensor(0)


In [72]:
res['sent'] = res.sentence.apply(lambda x: x.split(' [SEP] ')[1])

In [45]:
df = pd.read_csv('data/sent_rev_test.csv')

In [49]:
res = res.groupby('review_id').max()[['pred_probs']].reset_index()

In [48]:
df = df.groupby('review_id').first().reset_index()
df.shape

(14759, 4)

In [51]:
res_df = res.merge(df[['rev_has_spoiler', 'review_id']], on='review_id')
res_df.shape

(14759, 3)

In [58]:
res_df.pred_probs = res_df.pred_probs.apply(lambda x: float(x.split('[')[1].split(']')[0]))
roc_auc_score(res_df.rev_has_spoiler, res_df.pred_probs)

0.5955073326009127

In [57]:
res_df.pred_probs.apply(lambda x: float(x.split('[')[1].split(']')[0]))

0       -7.6263
1       -7.5885
2       -7.5982
3       -7.6414
4       -7.6446
          ...  
14754    5.2300
14755   -7.6110
14756   -7.6430
14757   -7.6154
14758   -7.5717
Name: pred_probs, Length: 14759, dtype: float64

In [64]:
df.has_spoiler.value_counts()

0    14740
1       19
Name: has_spoiler, dtype: int64

In [65]:
res_df[res_df.pred_probs >= 0]

Unnamed: 0,review_id,pred_probs,rev_has_spoiler
28,0092b9f658ff0fd63608187f0e597693,6.5255,0
43,00e019bf2bc069be5f63a2df82e750cd,6.0762,0
52,011925060855796cc7ac6e83af4e2395,6.9760,0
67,01468c4ad96be9e3e2bca18b1c692098,6.5228,0
87,01bc01ec655b964643ae1ba9e50a1e57,4.0730,0
...,...,...,...
14663,fe85debe595fd704bc510460cc502659,6.6077,1
14694,fef8b84474a91a9f9de893656d177932,1.3559,0
14718,ff60d984b2440f9558f5f99add705c37,1.1328,0
14724,ff6f8cf003cb923e5eabb6b249182d63,6.0234,0


In [66]:
gr = pd.read_pickle('csc2515-project-main/csc2515-project-main/data/goodreads.pkl')
gr.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id,title
0,5b78fd996dd2af3e863bec412a410512,2012-01-02,"[[0, Essential reading for anyone serious abou...",5,False,29999,ad2bf9e7e72997ec053fe9d666a76240,The Maltese Falcon
1,b492976229a8c400dac0f49e7fdbc089,2011-06-19,"[[0, This is a perfectly good book, but I star...",3,False,29999,a7e8220cddf1ab786d9ba51614d4d092,The Maltese Falcon
2,8b29249b521466c99ce780b0dc664cd7,2016-09-14,"[[0, I had a hard time deciding between 2 and ...",2,False,29999,893c6d3dd7e6694bf59b83e93accfe98,The Maltese Falcon
3,3b73ddb805671095f090c7d3b388b34e,2014-05-28,"[[0, I haven't read any books like this before...",3,False,29999,82f44972acd3a58f3520c9fd49e9e54b,The Maltese Falcon
4,0c2d6326b42e940a8be9f4e3895e7841,2016-03-21,"[[0, I read this book for my english class, an...",2,False,29999,bc07a4b4d36d763228bb29e5f0097ad2,The Maltese Falcon


In [89]:
ids = []
for ix, row in res.iterrows():
    try:
        ids.append(gr[gr.rev_str.str.contains(row.sent)].index[0])
    except:
        pass

print(ids)

  ids.append(gr[gr.rev_str.str.contains(row.sent)].index[0])
  if regex and re.compile(pat).groups:


[146799, 66448, 15997, 1655, 142586, 114805, 187469, 114549, 151216, 5162, 141355, 177751, 79176, 45010, 170251, 191812, 28562, 203783, 85245, 133733, 178589, 12243, 194650, 169973, 45259, 127070, 93971, 58411, 133937, 49526, 117562, 183370, 33760, 93620, 109989, 55535, 55086, 172380, 2783, 183431, 119365, 114435, 201588, 202842, 150529, 190317, 86002, 132835, 59102, 203826, 178989, 12722, 121772, 58624, 6527, 151475, 198934, 76872, 141354, 198439, 141279, 109318, 98439, 15006, 104856, 48569, 1029, 116946, 133739, 164960, 9932, 105135, 177165, 206329, 102269, 28565, 79657, 195999, 97715, 91593, 38991, 66937, 181235, 117790, 7867, 108612, 74, 198, 37529, 100401, 0, 28751, 101480, 10202, 145614, 108491, 10491, 37173, 16137, 118710, 188738, 138056, 149345, 132180, 195652, 80432, 183828, 11123, 96064, 25501, 49502, 192110, 359, 26060, 117898, 108847, 116752, 155289, 52624, 109149, 68207, 157620, 71554, 20861, 25371, 9759, 72087, 25370, 142063, 109319, 87282, 49519, 183467, 22052, 194977, 1

In [78]:
gr['rev_str'] = gr.review_sentences.apply(str)

In [84]:
gr.rev_str.iloc[0]

'[[0, \'Essential reading for anyone serious about mysteries or hard-boiled crime fiction.\'], [0, \'Hammett is the grandfather of the hard-bitten private detective.\'], [0, \'While his Continental Op was the prototype for this kind of protagonist, Sam Spade is at the root of this now-large family tree, and The Maltese Falcon is his iconic case.\'], [0, \'Expect sharp dialog, vivid characters and a voice that speaks with the tone of cigarettes and gin at three in the morning.\'], [0, "If you\'ve seen John Huston\'s 1941 film version of this story, you\'ll be amazed how little had to be changed for the screen; Hammett\'s prose, though of its time, is already cinematic."]]'

In [90]:
len(ids)

13801

In [91]:
gr = gr[gr.index.isin(ids)]

In [92]:
gr.shape

(8844, 9)

In [94]:
gr.has_spoiler.value_counts()

False    7901
True      943
Name: has_spoiler, dtype: int64

In [108]:
probs = []
true_vals = []
revs = []
for ix, row in res.iterrows():
    try:
        mrow = gr[gr.rev_str.str.contains(row.sent)]
        true_vals.append(int(mrow.has_spoiler.values[0]))
        revs.append(mrow.review_id.values[0])
        probs.append(float(row.pred_prob.split('[')[1].split(']')[0]))
    except:
        pass
    if ix % 100 == 0:
        print(ix)

0


  mrow = gr[gr.rev_str.str.contains(row.sent)]


100
200
300
400
500
600
700
800
900
1000
1100


  if regex and re.compile(pat).groups:


1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000


In [110]:
df = pd.DataFrame(list(zip(true_vals, revs, probs)), 
               columns =['has_spoiler', 'rev', 'prob'])

df.head()

Unnamed: 0,has_spoiler,rev,prob
0,0,b22fb34039e0e561a053f498fd7dc899,-7.0706
1,0,64fb993f1958f22a3590fde09ddc0b04,-7.4107
2,0,d879f0562dd2fe7ac7afc2d6c8184b24,-7.8571
3,0,3dac84e029f7e361301ed2985f51f6bd,-7.8387
4,0,54a6f62a3c920f74d37a3b0a16d8f5cf,-7.8722


In [111]:
gr_df = df.groupby('rev').max()
gr_df

Unnamed: 0_level_0,has_spoiler,prob
rev,Unnamed: 1_level_1,Unnamed: 2_level_1
0004e3df1979375be29181845d5c2773,0,-7.8367
00050bd196fa8c8d8b17864926c1e9aa,0,-6.5049
0007c4c78fa3b36ae722c7f813d573f2,0,-5.2573
000f9b17e8dfc8da12c4c4c81377ee6d,0,-7.3083
001de285e465ac926844f11aeb09db91,1,-5.1494
...,...,...
ffcc6a95c2680f6e23cff3847739f45d,0,-7.8904
ffd9b7f2cf49afb1d52ad178817bbd50,1,-1.8805
ffe53894804ddd9f5d5bc747b885b3a2,0,-4.4201
ffee497d52fc633d791050e9c2ca389c,0,-7.6895


In [112]:
roc_auc_score(gr_df.has_spoiler, gr_df.prob)

0.760689647859923

In [114]:
gr_df.has_spoiler.value_counts()

0    7901
1     943
Name: has_spoiler, dtype: int64

In [115]:
gr_df[gr_df.prob >= 0]

Unnamed: 0_level_0,has_spoiler,prob
rev,Unnamed: 1_level_1,Unnamed: 2_level_1
009783e8133ab0b4783172c451d66a3f,1,3.0063
0231a329c277654fa179a97455628a97,0,3.1073
030c8a6820065f4c68e70327ea9b3df2,1,3.3629
03c815a082ebfd4a3a9a75eec5954714,1,3.4777
046964f237e07fc5bd5643cadf687453,1,3.1913
...,...,...
f6e2c4498b05865ae38402b3fcdf90cf,1,1.6505
f70d2650191349769029cba86fc10350,1,3.1758
f7964967a2b90f67e0fc8d75c22eedc2,1,0.1532
f8cd7aeed32c5ae0f9ed0a0b705784a1,1,2.6720
