In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict

### Data Load

In [2]:
df_detail = pd.read_json('IMDB_movie_details.json', lines=True)
df_detail = df_detail[['movie_id','plot_synopsis']]

df_detail['plot_synopsis'].replace('', np.nan, inplace=True)
df_detail.dropna(subset=['plot_synopsis'], inplace=True)

df_detail.head()

Unnamed: 0,movie_id,plot_synopsis
0,tt0105112,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,Four boys around the age of 10 are friends in ...
3,tt0040897,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Jim McAllister (Matthew Broderick) is a much-a...
5,tt0286716,Bruce Banner (Eric Bana) is a research scienti...


In [3]:
df_raw = pd.read_json('IMDB_reviews.json', lines=True)
df_raw = df_raw[['movie_id','user_id','is_spoiler','review_text']]

df_raw.head()

Unnamed: 0,movie_id,user_id,is_spoiler,review_text
0,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,ur1285640,True,I believe that this film is the best story eve...
3,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...


### Lower-case

In [4]:
df_detail['plot_synopsis'] = df_detail['plot_synopsis'].str.lower()
df_raw['review_text'] = df_raw['review_text'].str.lower()

### Tokenization

In [5]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]    

In [6]:
df_detail['p_tokenized'] = df_detail.apply(lambda x: tokenize(x['plot_synopsis']), axis=1)
df_raw['r_tokenized'] = df_raw.apply(lambda x: tokenize(x['review_text']), axis=1)

df_raw[['review_text', 'r_tokenized']].head()

Unnamed: 0,review_text,r_tokenized
0,"in its oscar year, shawshank redemption (writt...","[in, its, oscar, year, shawshank, redemption, ..."
1,the shawshank redemption is without a doubt on...,"[the, shawshank, redemption, is, without, a, d..."
2,i believe that this film is the best story eve...,"[i, believe, that, this, film, is, the, best, ..."
3,"**yes, there are spoilers here**this film has ...","[yes, there, are, spoilers, here, this, film, ..."
4,at the heart of this extraordinary movie is a ...,"[at, the, heart, of, this, extraordinary, movi..."


### Stopwords

In [7]:
def remove_stopwords(tokenized_column):
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', 'all', 'be', 'know',
                      'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                      'rather', 'easy', 'easilSy', 'lot', 'lack', 'make', 'want', 'seem', 'need', 'even', 'spoiler',
                      'right', 'movie', 'even', 'also', 'may', 'take', 'come', 'yes', 'no', 'oscar','award','imdb','review'])
    stops = set(stop_words)

    
    return [word for word in tokenized_column if not word in stops]

In [8]:
df_detail['p_stopwords_removed'] = df_detail.apply(lambda x: remove_stopwords(x['p_tokenized']), axis=1)
df_raw['r_stopwords_removed'] = df_raw.apply(lambda x: remove_stopwords(x['r_tokenized']), axis=1)

df_raw[['review_text', 'r_stopwords_removed']].head()

Unnamed: 0,review_text,r_stopwords_removed
0,"in its oscar year, shawshank redemption (writt...","[year, shawshank, redemption, written, directe..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redemption, without, doubt, one, b..."
2,i believe that this film is the best story eve...,"[believe, film, best, story, ever, told, film,..."
3,"**yes, there are spoilers here**this film has ...","[spoilers, film, emotional, impact, find, hard..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinary, brilliant, indelible, p..."


### Stemming

In [9]:
def apply_lemmatizing(tokenized_column):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokenized_column]

In [10]:
df_detail['p_lemmatized'] = df_detail.apply(lambda x: apply_lemmatizing(x['p_stopwords_removed']), axis=1)

In [11]:
df_raw['r_lemmatized'] = df_raw.apply(lambda x: apply_lemmatizing(x['r_stopwords_removed']), axis=1)
df_raw[['review_text', 'r_lemmatized', 'r_lemmatized']].head()

Unnamed: 0,review_text,r_lemmatized,r_lemmatized.1
0,"in its oscar year, shawshank redemption (writt...","[year, shawshank, redemption, written, directe...","[year, shawshank, redemption, written, directe..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redemption, without, doubt, one, b...","[shawshank, redemption, without, doubt, one, b..."
2,i believe that this film is the best story eve...,"[believe, film, best, story, ever, told, film,...","[believe, film, best, story, ever, told, film,..."
3,"**yes, there are spoilers here**this film has ...","[spoiler, film, emotional, impact, find, hard,...","[spoiler, film, emotional, impact, find, hard,..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinary, brilliant, indelible, p...","[heart, extraordinary, brilliant, indelible, p..."


### re Stopword

In [12]:
df_detail['p_re_stopwords'] = df_detail.apply(lambda x: remove_stopwords(x['p_lemmatized']), axis=1)
df_raw['r_re_stopwords'] = df_raw.apply(lambda x: remove_stopwords(x['r_lemmatized']), axis=1)

### Rejoin

In [13]:
def rejoin_words(tokenized_column):
    
    return ( " ".join(tokenized_column))

In [14]:
df_detail['synopsis'] = df_detail.apply(lambda x: rejoin_words(x['p_re_stopwords']), axis=1)
df_raw['review'] = df_raw.apply(lambda x: rejoin_words(x['r_re_stopwords']), axis=1)

df_raw[['review_text', 'review']].head()

Unnamed: 0,review_text,review
0,"in its oscar year, shawshank redemption (writt...",year shawshank redemption written directed fra...
1,the shawshank redemption is without a doubt on...,shawshank redemption without doubt one brillia...
2,i believe that this film is the best story eve...,believe film best story ever told film tell ro...
3,"**yes, there are spoilers here**this film has ...",film emotional impact find hard write comment ...
4,at the heart of this extraordinary movie is a ...,heart extraordinary brilliant indelible perfor...


In [15]:
df_raw['review'].iloc[0]

'year shawshank redemption written directed frank darabont novella rita hayworth shawshank redemption stephen king nominated seven academy walked away zero best picture went forrest gump shawshank pulp fiction happy nominated course hindsight history look back gump film pulp redemption remembered best pulp however success word making huge splash cannes making american master two film andy dufresne success fortunately failure life opening screen film fell fast theatre finished mere reason failure firstly title clunker iconic fan today people knew cared dvd tim robbins laugh recounting fan congratulating film nightmare drama tough sell woman story love two best friend spell winner men worst slow molasses desson thomson writes washington post wanders subplots every opportunity ignores abundance narrative exit point settling finale weakness film setting opening aerial shot prison total amazing piece architecture strong gothic design immediately prison becomes character cast shadow film tal

### Remove Duplicate

In [16]:
df_raw['review'] = (df_raw['review'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

df_detail['synopsis'] = (df_detail['synopsis'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

In [17]:
df_raw['review'].iloc[0]

'year shawshank redemption written directed frank darabont novella rita hayworth stephen king nominated seven academy walked away zero best picture went forrest gump pulp fiction happy course hindsight history look back film remembered however success word making huge splash cannes american master two andy dufresne fortunately failure life opening screen fell fast theatre finished mere reason firstly title clunker iconic fan today people knew cared dvd tim robbins laugh recounting congratulating nightmare drama tough sell woman story love friend spell winner men worst slow molasses desson thomson writes washington post wanders subplots every opportunity ignores abundance narrative exit point settling finale weakness setting aerial shot prison total amazing piece architecture strong gothic design immediately becomes character cast shadow tall stone wall stretching tower contains blotting memory outside world hold onto hope music sandy beach zihuatanejo forget place made something inside

### Merge

In [18]:
df_synopsis = df_detail[['movie_id','synopsis']]

In [19]:
df_synopsis

Unnamed: 0,movie_id,synopsis
0,tt0105112,jack ryan ford working vacation london family ...
1,tt1204975,four boy around age friend brooklyn nickname f...
3,tt0040897,fred dobbs humphrey bogart bob curtin tim holt...
4,tt0126886,jim mcallister matthew broderick high school h...
5,tt0286716,bruce banner eric bana research scientist berk...
...,...,...
1563,tt0120655,film open homeless man bud cort deserted new j...
1565,tt0276751,freeman hugh grant bachelor pride cool thanks ...
1567,tt0289879,year evan treborn ashton kutcher suffered seve...
1568,tt1723811,brandon michael fassbender successful handsome...


In [20]:
df_review = df_raw[['movie_id','user_id','is_spoiler','review']]

In [21]:
df_review

Unnamed: 0,movie_id,user_id,is_spoiler,review
0,tt0111161,ur1898687,True,year shawshank redemption written directed fra...
1,tt0111161,ur0842118,True,shawshank redemption without doubt one brillia...
2,tt0111161,ur1285640,True,believe film best story ever told tell robbins...
3,tt0111161,ur1003471,True,film emotional impact find hard write comment ...
4,tt0111161,ur0226855,True,heart extraordinary brilliant indelible perfor...
...,...,...,...,...
573908,tt0139239,ur0100166,False,wise fast pure entertainment assembling except...
573909,tt0139239,ur0021767,False,well shall fun rate three plotlines origining ...
573910,tt0139239,ur0392750,False,best ever seen read noticed people compared pu...
573911,tt0139239,ur0349105,False,call teenage version pulp fiction whatever ple...


In [22]:
df_spoiler = pd.merge(df_review, df_synopsis, how='inner', on ='movie_id')

In [23]:
df_spoiler.to_csv('IMDB/dataset/spacy-pre-unq-lem.csv', index= False)