In [1]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
from nltk.tokenize import sent_tokenize
import random
pd.options.display.float_format = '{:,}'.format
random.seed(42)

### Data Load

In [2]:
df_raw = pd.read_json('IMDB_reviews.json', lines=True)
df_raw.shape

(573913, 7)

### Review Filtering

In [3]:
# review filtering
df_review = df_raw.groupby('movie_id').count().sort_values(by='user_id', ascending = False)
df_review = df_review[df_review['review_date'] > 100]

df_list = df_review.index.values.tolist()

df_raw = df_raw[df_raw['movie_id'].isin(df_list)]
df_raw.shape

(562076, 7)

In [4]:
# null replace
df_raw['review_text'].replace('', np.nan, inplace=True)
df_raw.dropna(subset=['review_text'], inplace=True)
df_raw.shape

(562076, 7)

In [5]:
# 
df_raw['review_text'] = df_raw['review_text'].apply(str)

df_raw['text_count'] = df_raw['review_text'].apply(sent_tokenize).tolist()
df_raw['text_count'] = df_raw['text_count'].apply(len)

df_raw = df_raw[df_raw['text_count'] < 10]
df_raw.shape

(272977, 8)

In [6]:
df_raw.reset_index(drop=True, inplace=True)

In [7]:
df_raw['Key'] = df_raw['movie_id'].map(str) + "_" + df_raw['user_id'].map(str)
df_raw.drop(columns = ['review_date','review_summary','text_count'], inplace = True)

### Similarity Combine

In [8]:
# Similarity Data
df1 = pd.read_csv('IMDB/dataset/sim1-spacy-pre-unq.csv')
df2 = pd.read_csv('IMDB/dataset/sim2-spacy-pre-unq.csv')

df = pd.concat([df1,df2], ignore_index=True)
df.shape

(257277, 8)

In [9]:
df.drop(columns = ['review','summary','synopsis'], inplace = True)

In [10]:
df['Key'] = df['movie_id'].map(str) + "_" + df['user_id'].map(str)

In [11]:
df.drop(columns = ['movie_id','user_id','is_spoiler'], inplace = True)
df

Unnamed: 0,summary_sim,synopsis_sim,Key
0,0.8182528627370624,0.9379999551535706,tt0111161_ur6574726
1,0.8248598156132173,0.9410306515057832,tt0111161_ur31182745
2,0.8286599404831738,0.9451207633340328,tt0111161_ur9871443
3,0.8536536626202199,0.929594141329498,tt0111161_ur23169472
4,0.7727762898072235,0.8943991748293878,tt0111161_ur34426359
...,...,...,...
257272,0.7507024106067787,0.7507024106067787,tt0139239_ur0415521
257273,0.7608688119019157,0.7608688119019157,tt0139239_ur0100166
257274,0.7717965122266363,0.7717965122266363,tt0139239_ur0021767
257275,0.7526294445454874,0.7526294445454874,tt0139239_ur0349105


In [12]:
df_raw = pd.merge(df_raw, df, how='inner', on ='Key')

In [13]:
df_raw

Unnamed: 0,movie_id,user_id,is_spoiler,review_text,rating,Key,summary_sim,synopsis_sim
0,tt0111161,ur6574726,True,I have been a fan of this movie for a long tim...,9,tt0111161_ur6574726,0.8182528627370624,0.9379999551535706
1,tt0111161,ur31182745,True,I made my account on IMDb Just to Rate this mo...,10,tt0111161_ur31182745,0.8248598156132173,0.9410306515057832
2,tt0111161,ur9871443,True,"A friend of mine listed ""The Shawshank Redempt...",10,tt0111161_ur9871443,0.8286599404831738,0.9451207633340328
3,tt0111161,ur23169472,True,"To tell the truth, I am speechless. I am a you...",10,tt0111161_ur23169472,0.8536536626202199,0.929594141329498
4,tt0111161,ur34426359,True,Wow! what a film this baby is and yes this fil...,10,tt0111161_ur34426359,0.7727762898072235,0.8943991748293878
...,...,...,...,...,...,...,...,...
257272,tt0139239,ur0415521,False,This type of movie is one that I would not ord...,9,tt0139239_ur0415521,0.7507024106067787,0.7507024106067787
257273,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,tt0139239_ur0100166,0.7608688119019157,0.7608688119019157
257274,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,tt0139239_ur0021767,0.7717965122266363,0.7717965122266363
257275,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,tt0139239_ur0349105,0.7526294445454874,0.7526294445454874


In [14]:
df_raw.summary_sim = df_raw.summary_sim.round(3)

In [15]:
df_raw.synopsis_sim = df_raw.synopsis_sim.round(3)

In [16]:
df_1 = df_raw.iloc[0:250]

In [17]:
data = []
# For each of the samples...
for index, row in df_1.iterrows():

    # Piece it together...    
    combined = ""
    
    #combined += "The ID of this item is {:}, ".format(row["Clothing ID"])
    combined += "My review is {:} similar to the summary. ".format(row["summary_sim"])
    
    combined += "And {:} similar to the ending. ".format(row["synopsis_sim"])
       
    # Finally, append the review the text!
    data.append(combined)
    
df = pd.DataFrame(data, columns=['combined'])
df_1 = pd.concat([df_1,df], axis=1)


print('DONE.')

DONE.


In [18]:
df

Unnamed: 0,combined
0,My review is 0.818 similar to the summary. And...
1,My review is 0.825 similar to the summary. And...
2,My review is 0.829 similar to the summary. And...
3,My review is 0.854 similar to the summary. And...
4,My review is 0.773 similar to the summary. And...
...,...
245,My review is 0.769 similar to the summary. And...
246,My review is 0.839 similar to the summary. And...
247,My review is 0.819 similar to the summary. And...
248,My review is 0.801 similar to the summary. And...


In [19]:
df_1

Unnamed: 0,movie_id,user_id,is_spoiler,review_text,rating,Key,summary_sim,synopsis_sim,combined
0,tt0111161,ur6574726,True,I have been a fan of this movie for a long tim...,9,tt0111161_ur6574726,0.818,0.938,My review is 0.818 similar to the summary. And...
1,tt0111161,ur31182745,True,I made my account on IMDb Just to Rate this mo...,10,tt0111161_ur31182745,0.825,0.941,My review is 0.825 similar to the summary. And...
2,tt0111161,ur9871443,True,"A friend of mine listed ""The Shawshank Redempt...",10,tt0111161_ur9871443,0.829,0.945,My review is 0.829 similar to the summary. And...
3,tt0111161,ur23169472,True,"To tell the truth, I am speechless. I am a you...",10,tt0111161_ur23169472,0.854,0.93,My review is 0.854 similar to the summary. And...
4,tt0111161,ur34426359,True,Wow! what a film this baby is and yes this fil...,10,tt0111161_ur34426359,0.773,0.894,My review is 0.773 similar to the summary. And...
...,...,...,...,...,...,...,...,...,...
245,tt0111161,ur59411095,True,"With little bit of suspense,it's make me insan...",10,tt0111161_ur59411095,0.769,0.899,My review is 0.769 similar to the summary. And...
246,tt0111161,ur42888444,True,Very good movie. I can't say anything bad abou...,9,tt0111161_ur42888444,0.839,0.922,My review is 0.839 similar to the summary. And...
247,tt0111161,ur20666327,True,"The best movie of all time! Great acting, grea...",10,tt0111161_ur20666327,0.819,0.883,My review is 0.819 similar to the summary. And...
248,tt0111161,ur58590006,True,ı think it is best movie in the world last end...,10,tt0111161_ur58590006,0.801,0.88,My review is 0.801 similar to the summary. And...


In [20]:
df_1['text'] = df_1.review_text.map(str) + df_1.combined.map(str)

### Data Preprocessing

In [21]:
# column rename
df_raw['is_spoiler']= df_raw['is_spoiler'].replace([True,False],[1,0])
df_raw.rename(columns={'is_spoiler':'class', 'review_text':'text'}, inplace = True)
df = df_raw[['class','text']]

In [22]:
# null replace
df['text'].replace('', np.nan, inplace=True)
df.dropna(subset=['text'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


### Balancing

In [23]:
df_non = df[df['class'] == 0]
df_spoiler = df[df['class'] == 1]
print("nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))

nonspoiler: 208432
spoiler: 48845


In [24]:
df_non = df_non.sample(frac =.5)
print("weight-decreased nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))
print("balanced total:", len(df_non)+len(df_spoiler))

weight-decreased nonspoiler: 104216
spoiler: 48845
balanced total: 153061


### Holdout

In [25]:
# 70/10/20 train/val/test
df_train_non = df_non.sample(frac = .70)
df_non = df_non.drop(df_train_non.index)
df_train_spoiler = df_spoiler.sample(frac = .70)
df_spoiler = df_spoiler.drop(df_train_spoiler.index)

# use 33% of the remaining for validation (which is 10% of whole dataset)
df_dev_non = df_non.sample(frac = .333)
df_non = df_non.drop(df_dev_non.index)
df_dev_spoiler = df_spoiler.sample(frac = .333)
df_spoiler = df_spoiler.drop(df_dev_spoiler.index)

# use all of the remaining for test (which is 20% of whole dataset)
df_test_non = df_non
df_test_spoiler = df_spoiler

In [26]:
# merge and shuffle

df_train = pd.concat([df_train_non, df_train_spoiler])
df_train = df_train.sample(frac=1).reset_index(drop=True)

df_dev = pd.concat([df_dev_non, df_dev_spoiler])
df_dev = df_dev.sample(frac=1).reset_index(drop=True)

df_test = pd.concat([df_test_non, df_test_spoiler])
df_test = df_test.sample(frac=1).reset_index(drop=True)

print("train:",len(df_train))
print("dev:",len(df_dev))
print("test:",len(df_test))
print("all:",len(df_train)+len(df_dev)+len(df_test))

train: 107143
dev: 15290
test: 30628
all: 153061


In [27]:
df_train.to_csv('IMDB/final/train.csv', index=False)
df_dev.to_csv('IMDB/final/dev.csv', index=False)
df_test.to_csv('IMDB/final/test.csv', index=False)

### Sample

In [28]:
df_train_sample = df_train.sample(35000)
df_dev_sample = df_dev.sample(5000)
df_test_sample = df_test.sample(10000)

In [29]:
df_train_sample.to_csv('IMDB/final_sample/train.csv', index=False)
df_dev_sample.to_csv('IMDB/final_sample/dev.csv', index=False)
df_test_sample.to_csv('IMDB/final_sample/test.csv', index=False)