# Train-validation-test split

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Goodreads spoiler reviews; lines=True reads the file as one JSON object per line.
df = pd.read_json("goodreads_reviews_spoiler.json", lines=True)

In [3]:
# Peek at the first rows to check the loaded columns.
df.head()

Unnamed: 0,user_id,timestamp,review_sentences,rating,has_spoiler,book_id,review_id
0,8842281e1d1347389f2ab93d60773d4d,2017-08-30,"[[0, This is a special book.], [0, It started ...",5,True,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb
1,8842281e1d1347389f2ab93d60773d4d,2017-03-22,"[[0, Recommended by Don Katz.], [0, Avail for ...",3,False,16981,a5d2c3628987712d0e05c4f90798eb67
2,8842281e1d1347389f2ab93d60773d4d,2017-03-20,"[[0, A fun, fast paced science fiction thrille...",3,True,28684704,2ede853b14dc4583f96cf5d120af636f
3,8842281e1d1347389f2ab93d60773d4d,2016-11-09,"[[0, Recommended reading to understand what is...",0,False,27161156,ced5675e55cd9d38a524743f5c40996e
4,8842281e1d1347389f2ab93d60773d4d,2016-04-25,"[[0, I really enjoyed this book, and there is ...",4,True,25884323,332732725863131279a8e345b63ac33e


In [4]:
# Count reviews for every (has_spoiler, rating) combination.
df.groupby(["has_spoiler", "rating"]).size()

has_spoiler  rating
False        0          45658
             1          41339
             2         102050
             3         267505
             4         449683
             5         382171
True         0           1394
             1           3413
             2          10176
             3          22925
             4          30516
             5          21203
dtype: int64

In [5]:
# Partition the reviews by the review-level spoiler flag and report each group's size.
has_spoilers = df.loc[df["has_spoiler"].eq(True)]
no_spoilers = df.loc[df["has_spoiler"].eq(False)]

print(has_spoilers.shape[0])
print(no_spoilers.shape[0])

89627
1288406


In [6]:
# Draw just enough non-spoiler reviews to reach 180000 rows once every spoiler
# review is added; the fixed seed keeps the draw repeatable.
n_no_spoilers = 180000 - len(has_spoilers)
no_spoilers_split = no_spoilers.sample(n=n_no_spoilers, random_state=0)
print(no_spoilers_split.shape[0])

90373


In [7]:
# Stack all spoiler reviews on top of the sampled non-spoiler reviews.
df_joined = pd.concat((has_spoilers, no_spoilers_split))
print(df_joined.shape[0])

180000


In [8]:
def describe_data(df):
    """Summarise a review-level frame as a one-column table of dataset statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``review_sentences``, ``book_id``,
        ``review_id`` and ``has_spoiler``. Every ``review_sentences`` entry is
        iterated as a sequence of items whose first element is summed as the
        per-sentence spoiler flag.

    Returns
    -------
    pandas.DataFrame
        A single ``Summary`` column indexed by statistic name: number of
        distinct reviews and books, total sentence count, average sentences
        per review, and the count (with percentage) of spoiler reviews and
        spoiler sentences.

    Notes
    -----
    Side effect: the columns ``num_spoiler_sentences`` and ``num_sentences``
    are written onto ``df`` in place. Callers may rely on those columns being
    present afterwards, so the mutation is kept; pass ``df.copy()`` when the
    input must stay untouched.

    Ratios whose denominator is zero (an empty frame, or reviews with no
    sentences at all) are reported as ``0.0`` instead of raising or
    producing NaN.
    """
    df['num_spoiler_sentences'] = df['review_sentences'].apply(
        lambda sentences: sum(sent[0] for sent in sentences)
    )
    df['num_sentences'] = df['review_sentences'].apply(len)

    num_books = df['book_id'].nunique()
    num_reviews = df['review_id'].nunique()
    total_sentences = df['num_sentences'].sum()
    num_spoilers = df['has_spoiler'].sum()
    num_spoiler_sentences = df['num_spoiler_sentences'].sum()

    # Guard every ratio: an empty frame would otherwise divide zero by zero.
    if num_reviews:
        avg_review_length = round(total_sentences / num_reviews, 2)
        pct_spoilers = round((num_spoilers / num_reviews) * 100, 2)
    else:
        avg_review_length = 0.0
        pct_spoilers = 0.0

    if total_sentences:
        pct_spoiler_sentences = round((num_spoiler_sentences / total_sentences) * 100, 2)
    else:
        pct_spoiler_sentences = 0.0

    summary_table = pd.DataFrame({'Number of Reviews': [num_reviews],
                                  'Number of Books': [num_books],
                                  'Total Number of Sentences': [total_sentences],
                                  'Average Review Length (sentences)': [avg_review_length],
                                  'Reviews with Spoilers': [f"{num_spoilers} ({pct_spoilers}%)"],
                                  'Sentences with Spoilers': [f"{num_spoiler_sentences} ({pct_spoiler_sentences}%)"]})

    # Present one statistic per row under a single 'Summary' column.
    summary_table = summary_table.transpose()
    summary_table.columns = ['Summary']
    return summary_table

In [9]:
# Summary statistics for the balanced subset (also writes the two count columns onto df_joined).
describe_data(df_joined)

Unnamed: 0,Summary
Number of Reviews,180000
Number of Books,25216
Total Number of Sentences,3218221
Average Review Length (sentences),17.88
Reviews with Spoilers,89627 (49.79%)
Sentences with Spoilers,569724 (17.7%)


In [10]:
# Same summary for the full dataset, for comparison with the balanced subset.
describe_data(df)

Unnamed: 0,Summary
Number of Reviews,1378033
Number of Books,25475
Total Number of Sentences,17672655
Average Review Length (sentences),12.82
Reviews with Spoilers,89627 (6.5%)
Sentences with Spoilers,569724 (3.22%)


In [11]:
# Hold out 30% of the reviews, then cut the hold-out in half: 70/15/15 overall.
train_df, holdout_df = train_test_split(df_joined, test_size=0.30, random_state=42)
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Fraction of the balanced set that landed in each split (train, val, test).
total_reviews = len(df_joined)
print(len(train_df) / total_reviews)
print(len(val_df) / total_reviews)
print(len(test_df) / total_reviews)

0.7
0.15
0.15


In [12]:
# Build one summary per split, then line them up side by side.
train_summary = describe_data(train_df)
val_summary = describe_data(val_df)
test_summary = describe_data(test_df)

# The first join needs suffixes because both frames carry a 'Summary' column;
# the test column joins without a clash and is renamed afterwards.
summary_table = (
    train_summary
    .join(val_summary, lsuffix="_train", rsuffix="_val")
    .join(test_summary)
    .rename(columns={'Summary': 'Summary_test'})
)
summary_table

Unnamed: 0,Summary_train,Summary_val,Summary_test
Number of Reviews,126000,27000,27000
Number of Books,23681,12423,12313
Total Number of Sentences,2257630,475718,484873
Average Review Length (sentences),17.92,17.62,17.96
Reviews with Spoilers,62824 (49.86%),13342 (49.41%),13461 (49.86%)
Sentences with Spoilers,400411 (17.74%),83936 (17.64%),85377 (17.61%)


In [13]:
# Drop the same review-level metadata columns from every split.
metadata_columns = ["user_id", "timestamp", "rating", "has_spoiler"]
train_df = train_df.drop(columns=metadata_columns)
val_df = val_df.drop(columns=metadata_columns)
test_df = test_df.drop(columns=metadata_columns)

In [14]:
# One row per sentence: explode() repeats the review's other columns for every
# entry of review_sentences. Not safe to re-run: train_df is overwritten, and a
# second pass would explode the individual sentence entries as well.
train_df = train_df.explode("review_sentences")
# Running 0..n-1 row number; the original index now repeats once per sentence.
train_df["id"] = pd.RangeIndex(len(train_df))
train_df

Unnamed: 0,review_sentences,book_id,review_id,num_spoiler_sentences,num_sentences,id
1166987,"[0, This book picks up right after Midnight, w...",8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,0
1166987,"[0, I wasn't planning to read another Vampire ...",8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,1
1166987,"[0, I finished the last trilogy mostly out of ...",8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,2
1166987,"[0, This one's a small step back in the right ...",8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,3
1166987,"[0, Phantom has a stripped down plot and tight...",8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,4
...,...,...,...,...,...,...
1145302,"[0, How come we haven't been able to change th...",28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257625
1145302,"[0, I know it is not the major important thing...",28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257626
1145302,"[0, I will read more of her book in the future]",28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257627
151080,"[0, Certainly an enjoyable read with its highl...",15803173,1b09ebc338e3dca48b9e90dfe0c2d695,0,2,2257628


In [15]:
# Unpack each two-element review_sentences entry into label / sentence columns.
train_df2 = pd.DataFrame(
    train_df["review_sentences"].tolist(),
    columns=["label", "sentence"],
)
# Same 0..n-1 numbering as train_df["id"], so the two frames can be matched up.
train_df2["id"] = pd.RangeIndex(len(train_df2))
train_df2

Unnamed: 0,label,sentence,id
0,0,"This book picks up right after Midnight, with ...",0
1,0,I wasn't planning to read another Vampire Diar...,1
2,0,I finished the last trilogy mostly out of loya...,2
3,0,This one's a small step back in the right dire...,3
4,0,Phantom has a stripped down plot and tighter w...,4
...,...,...,...
2257625,0,How come we haven't been able to change the pu...,2257625
2257626,0,I know it is not the major important thing in ...,2257626
2257627,0,I will read more of her book in the future,2257627
2257628,0,Certainly an enjoyable read with its highly au...,2257628


In [16]:
# Attach label / sentence to the review columns by row number, then discard
# the raw review_sentences column they were unpacked from.
train_data = (
    train_df
    .merge(train_df2, on="id")
    .drop(columns="review_sentences")
)
train_data

Unnamed: 0,book_id,review_id,num_spoiler_sentences,num_sentences,id,label,sentence
0,8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,0,0,"This book picks up right after Midnight, with ..."
1,8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,1,0,I wasn't planning to read another Vampire Diar...
2,8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,2,0,I finished the last trilogy mostly out of loya...
3,8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,3,0,This one's a small step back in the right dire...
4,8607205,6a5bc2f7767e0884bc0c01c94e9818f3,0,17,4,0,Phantom has a stripped down plot and tighter w...
...,...,...,...,...,...,...,...
2257625,28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257625,0,How come we haven't been able to change the pu...
2257626,28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257626,0,I know it is not the major important thing in ...
2257627,28587957,bf4b2790463c6722bb6548d555189abe,0,11,2257627,0,I will read more of her book in the future
2257628,15803173,1b09ebc338e3dca48b9e90dfe0c2d695,0,2,2257628,0,Certainly an enjoyable read with its highly au...


In [17]:
# One row per sentence: explode() repeats the review's other columns for every
# entry of review_sentences. Not safe to re-run: val_df is overwritten, and a
# second pass would explode the individual sentence entries as well.
val_df = val_df.explode("review_sentences")
# Running 0..n-1 row number; the original index now repeats once per sentence.
val_df["id"] = pd.RangeIndex(len(val_df))
val_df

Unnamed: 0,review_sentences,book_id,review_id,num_spoiler_sentences,num_sentences,id
854177,"[0, O MY GOODNESS PERCY'S BACK!!!]",9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,0
854177,"[0, My little guy is all grown up and defeatin...",9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,1
854177,"[0, I'll admit that it's a little insane how e...",9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,2
854177,"[0, But its PERCY guys!]",9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,3
854177,"[0, He's my favorite little demigod and I just...",9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,4
...,...,...,...,...,...,...
93214,"[0, There wasn't much in The Crown and it felt...",26074181,70c5ee2125108abeba05800903319524,5,20,475713
93214,"[0, Even if it wasn't one novel, The Crown def...",26074181,70c5ee2125108abeba05800903319524,5,20,475714
93214,"[0, I don't think I'd recommend these final tw...",26074181,70c5ee2125108abeba05800903319524,5,20,475715
93214,"[0, I mean, if you're really interested in Ame...",26074181,70c5ee2125108abeba05800903319524,5,20,475716


In [18]:
# Unpack each two-element review_sentences entry into label / sentence columns.
val_df2 = pd.DataFrame(
    val_df["review_sentences"].tolist(),
    columns=["label", "sentence"],
)
# Same 0..n-1 numbering as val_df["id"], so the two frames can be matched up.
val_df2["id"] = pd.RangeIndex(len(val_df2))
val_df2

Unnamed: 0,label,sentence,id
0,0,O MY GOODNESS PERCY'S BACK!!!,0
1,0,My little guy is all grown up and defeating mo...,1
2,0,I'll admit that it's a little insane how endea...,2
3,0,But its PERCY guys!,3
4,0,He's my favorite little demigod and I just lov...,4
...,...,...,...
475713,0,There wasn't much in The Crown and it felt so ...,475713
475714,0,"Even if it wasn't one novel, The Crown definit...",475714
475715,0,I don't think I'd recommend these final two no...,475715
475716,0,"I mean, if you're really interested in America...",475716


In [19]:
# Attach label / sentence to the review columns by row number, then discard
# the raw review_sentences column they were unpacked from.
val_data = (
    val_df
    .merge(val_df2, on="id")
    .drop(columns="review_sentences")
)
val_data

Unnamed: 0,book_id,review_id,num_spoiler_sentences,num_sentences,id,label,sentence
0,9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,0,0,O MY GOODNESS PERCY'S BACK!!!
1,9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,1,0,My little guy is all grown up and defeating mo...
2,9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,2,0,I'll admit that it's a little insane how endea...
3,9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,3,0,But its PERCY guys!
4,9520360,bf97e3ab107a2894f09d4d92f100455e,2,27,4,0,He's my favorite little demigod and I just lov...
...,...,...,...,...,...,...,...
475713,26074181,70c5ee2125108abeba05800903319524,5,20,475713,0,There wasn't much in The Crown and it felt so ...
475714,26074181,70c5ee2125108abeba05800903319524,5,20,475714,0,"Even if it wasn't one novel, The Crown definit..."
475715,26074181,70c5ee2125108abeba05800903319524,5,20,475715,0,I don't think I'd recommend these final two no...
475716,26074181,70c5ee2125108abeba05800903319524,5,20,475716,0,"I mean, if you're really interested in America..."


In [20]:
# One row per sentence: explode() repeats the review's other columns for every
# entry of review_sentences. Not safe to re-run: test_df is overwritten, and a
# second pass would explode the individual sentence entries as well.
test_df = test_df.explode("review_sentences")
# Running 0..n-1 row number; the original index now repeats once per sentence.
test_df["id"] = pd.RangeIndex(len(test_df))
test_df

Unnamed: 0,review_sentences,book_id,review_id,num_spoiler_sentences,num_sentences,id
690952,"[0, 3.5 Stars]",21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,0
690952,"[0, The story was interesting in the beginning...",21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,1
690952,"[0, Once it was revealed, it wasn't bad, but i...",21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,2
690952,"[0, There could have been some much more depth...",21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,3
690952,"[0, It's another alpha male who's dominant all...",21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,4
...,...,...,...,...,...,...
1141765,"[0, If you made a drinking game out of the tes...",7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484868
1141765,"[0, Anyone have a theory?]",7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484869
1141765,"[0, This 4 is more of a 3.5.]",7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484870
1141765,"[0, Usually with halves I round down, but I su...",7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484871


In [21]:
# Unpack each two-element review_sentences entry into label / sentence columns.
test_df2 = pd.DataFrame(
    test_df["review_sentences"].tolist(),
    columns=["label", "sentence"],
)
# Same 0..n-1 numbering as test_df["id"], so the two frames can be matched up.
test_df2["id"] = pd.RangeIndex(len(test_df2))
test_df2

Unnamed: 0,label,sentence,id
0,0,3.5 Stars,0
1,0,"The story was interesting in the beginning, dr...",1
2,0,"Once it was revealed, it wasn't bad, but it wa...",2
3,0,"There could have been some much more depth, bu...",3
4,0,It's another alpha male who's dominant all the...,4
...,...,...,...
484868,0,If you made a drinking game out of the testing...,484868
484869,0,Anyone have a theory?,484869
484870,0,This 4 is more of a 3.5.,484870
484871,0,"Usually with halves I round down, but I suspec...",484871


In [22]:
# Attach label / sentence to the review columns by row number, then discard
# the raw review_sentences column they were unpacked from.
test_data = (
    test_df
    .merge(test_df2, on="id")
    .drop(columns="review_sentences")
)
test_data

Unnamed: 0,book_id,review_id,num_spoiler_sentences,num_sentences,id,label,sentence
0,21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,0,0,3.5 Stars
1,21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,1,0,"The story was interesting in the beginning, dr..."
2,21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,2,0,"Once it was revealed, it wasn't bad, but it wa..."
3,21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,3,0,"There could have been some much more depth, bu..."
4,21491731,e7af3c817fa930b3510c53a762a6ff10,0,11,4,0,It's another alpha male who's dominant all the...
...,...,...,...,...,...,...,...
484868,7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484868,0,If you made a drinking game out of the testing...
484869,7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484869,0,Anyone have a theory?
484870,7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484870,0,This 4 is more of a 3.5.
484871,7094569,7c0a57b9381e48497ceef84d9ec04a3a,6,21,484871,0,"Usually with halves I round down, but I suspec..."


In [23]:
# Share of each sentence label within the train, validation and test sets.
for split_data in (train_data, val_data, test_data):
    label_counts = split_data["label"].value_counts()
    print(label_counts / len(split_data))

0    0.822641
1    0.177359
Name: label, dtype: float64
0    0.823559
1    0.176441
Name: label, dtype: float64
0    0.823919
1    0.176081
Name: label, dtype: float64


In [24]:
# Remove rows with any missing value from each split, then report the row counts.
train_data, val_data, test_data = (
    split_data.dropna() for split_data in (train_data, val_data, test_data)
)

for split_data in (train_data, val_data, test_data):
    print(split_data.shape[0])

2257630
475718
484873


In [25]:
# Write the training sentences to train.csv, leaving out the DataFrame index.
train_data.to_csv("train.csv", index=False)

In [26]:
# Write the validation sentences to valid.csv, leaving out the DataFrame index.
val_data.to_csv("valid.csv", index=False)

In [27]:
# Write the test sentences to test.csv, leaving out the DataFrame index.
test_data.to_csv("test.csv", index=False)