In [1]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
from nltk.tokenize import sent_tokenize
import random
pd.options.display.float_format = '{:,}'.format
random.seed(2021)

### Data Load

In [2]:
df_raw = pd.read_json('IMDB_reviews.json', lines=True)
df_raw.shape

(573913, 7)

### Review Filtering

In [3]:
# review filtering
df_review = df_raw.groupby('movie_id').count().sort_values(by='user_id', ascending = False)
df_review = df_review[df_review['review_date'] > 100]

df_list = df_review.index.values.tolist()

df_raw = df_raw[df_raw['movie_id'].isin(df_list)]
df_raw.shape

(562076, 7)

In [4]:
# null replace
df_raw['review_text'].replace('', np.nan, inplace=True)
df_raw.dropna(subset=['review_text'], inplace=True)
df_raw.shape

(562076, 7)

In [5]:
df_raw['Key'] = df_raw['movie_id'].map(str) + "_" + df_raw['user_id'].map(str)
#df_raw.drop(columns = ['review_date','review_summary','text_count'], inplace = True)

### Similarity Combine

In [6]:
# Similarity Data
df1 = pd.read_csv('IMDB/dataset/sim1-spacy-pre-unq.csv')
df2 = pd.read_csv('IMDB/dataset/sim2-spacy-pre-unq.csv')

df = pd.concat([df1,df2], ignore_index=True)
df.shape

(257277, 8)

In [7]:
df.drop(columns = ['review','summary','synopsis'], inplace = True)

In [8]:
df['Key'] = df['movie_id'].map(str) + "_" + df['user_id'].map(str)

In [9]:
df.drop(columns = ['movie_id','user_id','is_spoiler'], inplace = True)
df

Unnamed: 0,summary_sim,synopsis_sim,Key
0,0.8182528627370624,0.9379999551535706,tt0111161_ur6574726
1,0.8248598156132173,0.9410306515057832,tt0111161_ur31182745
2,0.8286599404831738,0.9451207633340328,tt0111161_ur9871443
3,0.8536536626202199,0.929594141329498,tt0111161_ur23169472
4,0.7727762898072235,0.8943991748293878,tt0111161_ur34426359
...,...,...,...
257272,0.7507024106067787,0.7507024106067787,tt0139239_ur0415521
257273,0.7608688119019157,0.7608688119019157,tt0139239_ur0100166
257274,0.7717965122266363,0.7717965122266363,tt0139239_ur0021767
257275,0.7526294445454874,0.7526294445454874,tt0139239_ur0349105


In [10]:
df_raw

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,Key
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.,tt0111161_ur1898687
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.,tt0111161_ur0842118
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film,tt0111161_ur1285640
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?,tt0111161_ur1003471
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted",tt0111161_ur0226855
...,...,...,...,...,...,...,...,...
573908,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties,tt0139239_ur0100166
573909,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie,tt0139239_ur0021767
573910,20 July 1999,tt0139239,ur0392750,False,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen,tt0139239_ur0392750
573911,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?,tt0139239_ur0349105


In [11]:
df_raw = pd.merge(df_raw, df, how='inner', on ='Key')

In [12]:
df_raw

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,Key,summary_sim,synopsis_sim
0,9 October 2005,tt0111161,ur6574726,True,I have been a fan of this movie for a long tim...,9,This Movie Saved My Life.,tt0111161_ur6574726,0.8182528627370624,0.9379999551535706
1,4 February 2012,tt0111161,ur31182745,True,I made my account on IMDb Just to Rate this mo...,10,Movie you can see 1000 times,tt0111161_ur31182745,0.8248598156132173,0.9410306515057832
2,24 October 2008,tt0111161,ur9871443,True,"A friend of mine listed ""The Shawshank Redempt...",10,The Shawshank Redemption,tt0111161_ur9871443,0.8286599404831738,0.9451207633340328
3,1 June 2010,tt0111161,ur23169472,True,"To tell the truth, I am speechless. I am a you...",10,"Blatantly Simple, yet Utterly Beautiful",tt0111161_ur23169472,0.8536536626202199,0.929594141329498
4,16 July 2013,tt0111161,ur34426359,True,Wow! what a film this baby is and yes this fil...,10,Great films come only once in a blue moon,tt0111161_ur34426359,0.7727762898072235,0.8943991748293878
...,...,...,...,...,...,...,...,...,...,...
257272,14 August 1999,tt0139239,ur0415521,False,This type of movie is one that I would not ord...,9,GO see it!,tt0139239_ur0415521,0.7507024106067787,0.7507024106067787
257273,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties,tt0139239_ur0100166,0.7608688119019157,0.7608688119019157
257274,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie,tt0139239_ur0021767,0.7717965122266363,0.7717965122266363
257275,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?,tt0139239_ur0349105,0.7526294445454874,0.7526294445454874


In [13]:
df_raw.summary_sim = df_raw.summary_sim.round(3)

In [14]:
df_raw.synopsis_sim = df_raw.synopsis_sim.round(3)

In [15]:
data = []
# For each of the samples...
for index, row in df_raw.iterrows():

    # Piece it together...    
    combined = ""
    
    #combined += "The ID of this item is {:}, ".format(row["Clothing ID"])
    combined += "My review is {:} similar to the summary. ".format(row["summary_sim"])
    
    combined += "And {:} similar to the ending. ".format(row["synopsis_sim"])
       
    # Finally, append the review the text!
    data.append(combined)
    
df = pd.DataFrame(data, columns=['combined'])
df_raw = pd.concat([df_raw,df], axis=1)


print('DONE.')

DONE.


In [16]:
df

Unnamed: 0,combined
0,My review is 0.818 similar to the summary. And...
1,My review is 0.825 similar to the summary. And...
2,My review is 0.829 similar to the summary. And...
3,My review is 0.854 similar to the summary. And...
4,My review is 0.773 similar to the summary. And...
...,...
257272,My review is 0.751 similar to the summary. And...
257273,My review is 0.761 similar to the summary. And...
257274,My review is 0.772 similar to the summary. And...
257275,My review is 0.753 similar to the summary. And...


In [17]:
df_raw.tail(50)

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary,Key,summary_sim,synopsis_sim,combined
257227,12 June 2015,tt0139239,ur1234929,False,Go! tells the story of the events after a drug...,7,Typical 90s,tt0139239_ur1234929,0.825,0.825,My review is 0.825 similar to the summary. And...
257228,14 November 2004,tt0139239,ur3151829,False,You know how some films mess with the studio l...,8,Permanently altered Paramount-logo perception,tt0139239_ur3151829,0.764,0.764,My review is 0.764 similar to the summary. And...
257229,31 March 2003,tt0139239,ur2326727,False,How could you tell a movie with a dynamic mont...,9,The Independent Spirit Rises,tt0139239_ur2326727,0.754,0.754,My review is 0.754 similar to the summary. And...
257230,23 April 1999,tt0139239,ur0299959,False,In short: Go went nowhere. It said nothing new...,5,Morality?,tt0139239_ur0299959,0.803,0.803,My review is 0.803 similar to the summary. And...
257231,18 April 1999,tt0139239,ur0244097,False,"I can say many bad things about this movie, bu...",1,"""Go"" more like ""Blow""...",tt0139239_ur0244097,0.817,0.817,My review is 0.817 similar to the summary. And...
257232,17 April 1999,tt0139239,ur0161832,False,This movie is a total hack. The opening scene...,2,Edited and acted well; but I can't believe thi...,tt0139239_ur0161832,0.758,0.758,My review is 0.758 similar to the summary. And...
257233,3 March 2002,tt0139239,ur1518199,False,All this and more can be seen in this dark com...,9,"Drugs, Sex, Violence, and Family Circus",tt0139239_ur1518199,0.844,0.844,My review is 0.844 similar to the summary. And...
257234,25 November 2001,tt0139239,ur0448368,False,This movie looks like a first work from a dire...,5,Pretension is its flaw,tt0139239_ur0448368,0.837,0.837,My review is 0.837 similar to the summary. And...
257235,8 October 2001,tt0139239,ur1216786,False,This movie just tried to make a very poor atte...,6,A rip-off of Pulp Fiction,tt0139239_ur1216786,0.815,0.815,My review is 0.815 similar to the summary. And...
257236,28 March 2001,tt0139239,ur0564402,False,"There are four, not three, but four reasons to...",4,"Four reasons to watch ""Go""",tt0139239_ur0564402,0.721,0.721,My review is 0.721 similar to the summary. And...


In [18]:
df_raw['text'] = df_raw.review_text.map(str) + df_raw.combined.map(str)

### Data Preprocessing

In [19]:
# column rename
df_raw['is_spoiler']= df_raw['is_spoiler'].replace([True,False],[1,0])
df_raw.rename(columns={'is_spoiler':'class'}, inplace = True)
df = df_raw[['class','text']]

In [20]:
# null replace
df['text'].replace('', np.nan, inplace=True)
df.dropna(subset=['text'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [21]:
df['text'].iloc[0]

"I have been a fan of this movie for a long time.It seems that ever time my life hits a downward spiral, I can always seem to pop this movie in, and come up with a solution to my pending problem. It somehow gives Me sense of peace and inner strength.So, It wasn't all that strange for me to pop it in when I was going through a rough patch in my marriage. I found myself identifying with many of the characters in this movie.Many of them are trapped in a world of regret and mourning, due to a mistake that had been made early on in their lives. This film gave me the strength to escape from my world of misery. And now I am able to say I, Like Andy broke out of my own personal Shawshank.My Ex isn't too happy about the divorce. But life is much better now. And I feel saved from a life of unhappiness, Due to a mistake of a marriage to early in life.Thank You Frank Durabont.Your film saved my life.My review is 0.818 similar to the summary. And 0.938 similar to the ending. "

In [22]:
# 
df['text'] = df['text'].apply(str)

df['text_count'] = df['text'].apply(sent_tokenize).tolist()
df['text_count'] = df['text_count'].apply(len)

df = df[df['text_count'] < 8]
df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


(137942, 3)

In [23]:
df.reset_index(drop=True, inplace=True)

In [24]:
#df_raw['Key'] = df_raw['movie_id'].map(str) + "_" + df_raw['user_id'].map(str)
df.drop(columns = ['text_count'], inplace = True)

### Balancing

In [25]:
df_non = df[df['class'] == 0]
df_spoiler = df[df['class'] == 1]
print("nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))

nonspoiler: 114721
spoiler: 23221


In [26]:
df_non = df_non.sample(frac =.5)
print("weight-decreased nonspoiler:", len(df_non))
print("spoiler:", len(df_spoiler))
print("balanced total:", len(df_non)+len(df_spoiler))

weight-decreased nonspoiler: 57360
spoiler: 23221
balanced total: 80581


### Holdout

In [27]:
# 70/10/20 train/val/test
df_train_non = df_non.sample(frac = .70)
df_non = df_non.drop(df_train_non.index)
df_train_spoiler = df_spoiler.sample(frac = .70)
df_spoiler = df_spoiler.drop(df_train_spoiler.index)

# use 33% of the remaining for validation (which is 10% of whole dataset)
df_dev_non = df_non.sample(frac = .333)
df_non = df_non.drop(df_dev_non.index)
df_dev_spoiler = df_spoiler.sample(frac = .333)
df_spoiler = df_spoiler.drop(df_dev_spoiler.index)

# use all of the remaining for test (which is 20% of whole dataset)
df_test_non = df_non
df_test_spoiler = df_spoiler

In [28]:
# merge and shuffle

df_train = pd.concat([df_train_non, df_train_spoiler])
df_train = df_train.sample(frac=1).reset_index(drop=True)

df_dev = pd.concat([df_dev_non, df_dev_spoiler])
df_dev = df_dev.sample(frac=1).reset_index(drop=True)

df_test = pd.concat([df_test_non, df_test_spoiler])
df_test = df_test.sample(frac=1).reset_index(drop=True)

print("train:",len(df_train))
print("dev:",len(df_dev))
print("test:",len(df_test))
print("all:",len(df_train)+len(df_dev)+len(df_test))

train: 56407
dev: 8050
test: 16124
all: 80581


In [29]:
df_train.to_csv('IMDB/final/train_sim_sd.csv', index=False)
df_dev.to_csv('IMDB/final/dev_sim_sd.csv', index=False)
df_test.to_csv('IMDB/final/test_sim_sd.csv', index=False)

### Sample

In [30]:
df_train_sample = df_train.sample(35000)
df_dev_sample = df_dev.sample(5000)
df_test_sample = df_test.sample(10000)

In [31]:
df_train_sample.to_csv('IMDB/final_sample/train_sim_sd.csv', index=False)
df_dev_sample.to_csv('IMDB/final_sample/dev_sim_sd.csv', index=False)
df_test_sample.to_csv('IMDB/final_sample/test_sim_sd.csv', index=False)