In [1]:
import re
from collections import OrderedDict
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer

### Data Load

In [2]:
# Load the movie details, keep only the ID and synopsis columns, and drop
# rows whose synopsis is an empty string.
#
# The cleaning is done by chained reassignment instead of calling
# `.replace(..., inplace=True)` on a selected column: the in-place form
# mutates a temporary object under pandas Copy-on-Write, so the empty
# synopses would silently survive.
df_detail = (
    pd.read_json('IMDB_movie_details.json', lines=True)
      [['movie_id', 'plot_synopsis']]
      .assign(plot_synopsis=lambda frame: frame['plot_synopsis'].replace('', np.nan))
      .dropna(subset=['plot_synopsis'])
)

df_detail.head()

Unnamed: 0,movie_id,plot_synopsis
0,tt0105112,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,Four boys around the age of 10 are friends in ...
3,tt0040897,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Jim McAllister (Matthew Broderick) is a much-a...
5,tt0286716,Bruce Banner (Eric Bana) is a research scienti...


In [3]:
# Read the line-delimited reviews file and keep the four columns that the
# rest of the notebook works with.
df_raw = pd.read_json('IMDB_reviews.json', lines=True).loc[
    :, ['movie_id', 'user_id', 'is_spoiler', 'review_text']
]

df_raw.head()

Unnamed: 0,movie_id,user_id,is_spoiler,review_text
0,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,ur1285640,True,I believe that this film is the best story eve...
3,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...


### Lower-case

In [4]:
# Lower-case both free-text columns in place on their respective frames.
for frame, text_column in ((df_detail, 'plot_synopsis'), (df_raw, 'review_text')):
    frame[text_column] = frame[text_column].str.lower()

### Tokenization

In [5]:
def tokenize(column):
    """Split a text string into tokens and keep only the alphabetic ones.

    The text is tokenized with ``nltk.word_tokenize``; every token for which
    ``str.isalpha()`` is false (punctuation, numbers, mixed tokens) is dropped.

    Returns a list of the remaining tokens in their original order.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(column)))

In [6]:
# Tokenize each text column with Series.apply. The previous row-wise
# DataFrame.apply(axis=1) materialised a Series for every row just to read
# one field, and on an empty frame it returns a DataFrame that cannot be
# assigned to a single column.
df_detail['p_tokenized'] = df_detail['plot_synopsis'].apply(tokenize)
df_raw['r_tokenized'] = df_raw['review_text'].apply(tokenize)

df_raw[['review_text', 'r_tokenized']].head()

Unnamed: 0,review_text,r_tokenized
0,"in its oscar year, shawshank redemption (writt...","[in, its, oscar, year, shawshank, redemption, ..."
1,the shawshank redemption is without a doubt on...,"[the, shawshank, redemption, is, without, a, d..."
2,i believe that this film is the best story eve...,"[i, believe, that, this, film, is, the, best, ..."
3,"**yes, there are spoilers here**this film has ...","[yes, there, are, spoilers, here, this, film, ..."
4,at the heart of this extraordinary movie is a ...,"[at, the, heart, of, this, extraordinary, movi..."


### Stopwords

In [20]:
# Project-specific words removed in addition to NLTK's English stop words.
# ('easily' was previously misspelled 'easilSy', which can never match a
# lower-cased token; the duplicated 'even' entry has been removed.)
_CUSTOM_STOPWORDS = (
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', 'all', 'be', 'know',
    'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
    'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'need', 'even', 'spoiler',
    'right', 'movie', 'also', 'may', 'take', 'come', 'yes', 'no', 'oscar', 'award', 'imdb', 'review',
)

# Built lazily on first use and then reused for every subsequent call.
_STOPWORD_CACHE = None


def _stopword_set():
    """Return the combined NLTK + custom stop-word set, building it once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words('english')) | frozenset(_CUSTOM_STOPWORDS)
    return _STOPWORD_CACHE


def remove_stopwords(tokenized_column):
    """Return the tokens that are not stop words, preserving their order.

    Parameters
    ----------
    tokenized_column : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The input tokens with NLTK English stop words and the custom
        stop words listed in ``_CUSTOM_STOPWORDS`` removed.
    """
    # The set is fetched once per call instead of being rebuilt from the
    # NLTK corpus for every row of the DataFrame.
    stops = _stopword_set()
    return [word for word in tokenized_column if word not in stops]

In [21]:
# Remove stop words from the token lists. Series.apply is used on the one
# column that is needed; a row-wise DataFrame.apply(axis=1) builds a Series
# per row and cannot be assigned back to a column when the frame is empty.
df_detail['p_stopwords_removed'] = df_detail['p_tokenized'].apply(remove_stopwords)
df_raw['r_stopwords_removed'] = df_raw['r_tokenized'].apply(remove_stopwords)

df_raw[['review_text', 'r_stopwords_removed']].head()

Unnamed: 0,review_text,r_stopwords_removed
0,"in its oscar year, shawshank redemption (writt...","[year, shawshank, redemption, written, directe..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redemption, without, doubt, one, b..."
2,i believe that this film is the best story eve...,"[believe, film, best, story, ever, told, film,..."
3,"**yes, there are spoilers here**this film has ...","[spoilers, film, emotional, impact, find, hard..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinary, brilliant, indelible, p..."


### Stemming

In [22]:
def apply_snowstemming(tokenized_column):
    """Stem every token with NLTK's English Snowball stemmer.

    Returns a list with one stem per input token, in the same order.
    """
    snowball = SnowballStemmer('english')
    return list(map(snowball.stem, tokenized_column))

In [23]:
def apply_porterstemming(tokenized_column):
    """Stem every token with NLTK's Porter stemmer.

    Returns a list with one stem per input token, in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokenized_column))

In [24]:
# Snowball-stem the synopsis tokens. Series.apply on the single source column
# replaces the row-wise DataFrame.apply(axis=1), which built a Series per row
# and breaks the column assignment on an empty frame.
df_detail['p_snowball_stemmed'] = df_detail['p_stopwords_removed'].apply(apply_snowstemming)

In [25]:
# Stem the review tokens with both stemmers so the results can be compared
# side by side. Series.apply on the single source column replaces the
# row-wise DataFrame.apply(axis=1), which built a Series per row and breaks
# the column assignment on an empty frame.
df_raw['r_snowball_stemmed'] = df_raw['r_stopwords_removed'].apply(apply_snowstemming)
df_raw['r_porter_stemmed'] = df_raw['r_stopwords_removed'].apply(apply_porterstemming)

df_raw[['review_text', 'r_snowball_stemmed', 'r_porter_stemmed']].head()

Unnamed: 0,review_text,r_snowball_stemmed,r_porter_stemmed
0,"in its oscar year, shawshank redemption (writt...","[year, shawshank, redempt, written, direct, fr...","[year, shawshank, redempt, written, direct, fr..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redempt, without, doubt, one, bril...","[shawshank, redempt, without, doubt, one, bril..."
2,i believe that this film is the best story eve...,"[believ, film, best, stori, ever, told, film, ...","[believ, film, best, stori, ever, told, film, ..."
3,"**yes, there are spoilers here**this film has ...","[spoiler, film, emot, impact, find, hard, writ...","[spoiler, film, emot, impact, find, hard, writ..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinari, brilliant, indel, perfo...","[heart, extraordinari, brilliant, indel, perfo..."


### Re-apply Stopword Removal (after stemming)

In [26]:
# Run stop-word removal a second time, now on the Snowball-stemmed tokens,
# so stems that coincide with a stop word are dropped as well. Series.apply
# replaces the row-wise DataFrame.apply(axis=1), which built a Series per row
# and breaks the column assignment on an empty frame.
df_detail['p_re_stopwords'] = df_detail['p_snowball_stemmed'].apply(remove_stopwords)
df_raw['r_re_stopwords'] = df_raw['r_snowball_stemmed'].apply(remove_stopwords)

### Rejoin

In [27]:
def rejoin_words(tokenized_column):
    """Concatenate the tokens into one string, separated by single spaces."""
    separator = " "
    joined = separator.join(tokenized_column)
    return joined

In [28]:
# Join the cleaned token lists back into plain strings. Series.apply on the
# single source column replaces the row-wise DataFrame.apply(axis=1), which
# built a Series per row and breaks the column assignment on an empty frame.
df_detail['synopsis'] = df_detail['p_re_stopwords'].apply(rejoin_words)
df_raw['review'] = df_raw['r_re_stopwords'].apply(rejoin_words)

df_raw[['review_text', 'review']].head()

Unnamed: 0,review_text,review
0,"in its oscar year, shawshank redemption (writt...",year shawshank redempt written direct frank da...
1,the shawshank redemption is without a doubt on...,shawshank redempt without doubt one brilliant ...
2,i believe that this film is the best story eve...,believ film best stori ever told film tell rob...
3,"**yes, there are spoilers here**this film has ...",film emot impact find hard write comment read ...
4,at the heart of this extraordinary movie is a ...,heart extraordinari brilliant indel perform mo...


In [29]:
# Show the full processed text of the first review (by position).
df_raw['review'].iat[0]

'year shawshank redempt written direct frank darabont novella rita hayworth shawshank redempt stephen king nomin seven academi walk away zero best pictur went forrest gump shawshank pulp fiction happi nomin cours hindsight histori look back gump film pulp redempt rememb best pulp howev success word huge splash cann american master two film andi dufresn success fortun failur life open screen film fell fast theatr finish mere reason failur first titl clunker icon fan today peopl knew care dvd tim robbin laugh recount fan congratul film nightmar drama tough sell women stori love two best friend spell winner men worst slow molass desson thomson write washington post wander subplot everi opportun ignor abund narrat exit point settl final weak film set open aerial shot prison total amaz piec architectur strong gothic design immedi prison becom charact cast shadow film tall stone wall stretch everi shot tower men contain blot memori outsid world andi robbin hold onto hope music sandi beach zi

### Remove Duplicate Words

In [30]:
def _drop_repeated_words(text_series):
    """Keep only the first occurrence of each whitespace-separated word.

    Each string is split on whitespace, de-duplicated while preserving the
    order of first appearance, and joined back with single spaces.
    """
    word_lists = text_series.str.split()
    unique_words = word_lists.apply(lambda words: OrderedDict.fromkeys(words).keys())
    return unique_words.str.join(' ')


df_raw['review'] = _drop_repeated_words(df_raw['review'])

df_detail['synopsis'] = _drop_repeated_words(df_detail['synopsis'])

In [31]:
# Show the full text currently stored for the first review (by position).
df_raw['review'].iat[0]

'year shawshank redempt written direct frank darabont novella rita hayworth stephen king nomin seven academi walk away zero best pictur went forrest gump pulp fiction happi cours hindsight histori look back film rememb howev success word huge splash cann american master two andi dufresn fortun failur life open screen fell fast theatr finish mere reason first titl clunker icon fan today peopl knew care dvd tim robbin laugh recount congratul nightmar drama tough sell women stori love friend spell winner men worst slow molass desson thomson write washington post wander subplot everi opportun ignor abund narrat exit point settl final weak set aerial shot prison total amaz piec architectur strong gothic design immedi becom charact cast shadow tall stone wall stretch tower contain blot memori outsid world hold onto hope music sandi beach zihuatanejo forget place made someth insid ca touch red morgan freeman much pick glass milk silver spoon ass new fish crack lose bet resent time quick one m

### Merge

In [32]:
# Keep only the movie ID and its processed synopsis text.
df_synopsis = df_detail.loc[:, ['movie_id', 'synopsis']]

In [33]:
# Display the synopsis frame (movie_id and synopsis columns).
df_synopsis

Unnamed: 0,movie_id,synopsis
0,tt0105112,jack ryan ford work vacat london famili retir ...
1,tt1204975,four boy around age friend brooklyn nicknam fl...
3,tt0040897,fred dobb humphrey bogart bob curtin tim holt ...
4,tt0126886,jim mcallist matthew broderick high school his...
5,tt0286716,bruce banner eric bana research scientist berk...
...,...,...
1563,tt0120655,film open homeless man bud cort desert new jer...
1565,tt0276751,freeman hugh grant bachelor pride cool royalti...
1567,tt0289879,year evan treborn ashton kutcher suffer sever ...
1568,tt1723811,brandon michael fassbend success handsom busin...


In [34]:
# Keep the identifying columns, the spoiler label and the processed review text.
df_review = df_raw.loc[:, ['movie_id', 'user_id', 'is_spoiler', 'review']]

In [35]:
# Display the review frame (movie_id, user_id, is_spoiler and review columns).
df_review

Unnamed: 0,movie_id,user_id,is_spoiler,review
0,tt0111161,ur1898687,True,year shawshank redempt written direct frank da...
1,tt0111161,ur0842118,True,shawshank redempt without doubt one brilliant ...
2,tt0111161,ur1285640,True,believ film best stori ever told tell robbin p...
3,tt0111161,ur1003471,True,film emot impact find hard write comment read ...
4,tt0111161,ur0226855,True,heart extraordinari brilliant indel perform mo...
...,...,...,...,...
573908,tt0139239,ur0100166,False,wise fast pure entertain assembl except cast t...
573909,tt0139239,ur0021767,False,well shall fun rate three plotlin origin one p...
573910,tt0139239,ur0392750,False,best ever seen movi read notic peopl compar pu...
573911,tt0139239,ur0349105,False,call teenag version pulp fiction whatev pleas ...


In [36]:
# Attach each movie's synopsis to its reviews; the inner join keeps only
# reviews whose movie_id also appears in df_synopsis.
df_spoiler = df_review.merge(df_synopsis, on='movie_id', how='inner')

In [37]:
# Write the merged dataset to CSV without the index column. The target
# directory is created first because DataFrame.to_csv does not create missing
# parent directories, which would make a fresh run fail at this last cell.
output_path = Path('IMDB/dataset/spacy-pre-unq.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_spoiler.to_csv(output_path, index=False)