In [1]:
import re
import pandas as pd
import numpy as np
import nltk
#from nltk import sent_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
from collections import OrderedDict

pd.set_option('mode.chained_assignment',  None)

### Data Load

#### Summary, Synopsis Data

In [2]:
df_detail = pd.read_json('IMDB_movie_details.json', lines=True)

# null (1572 to 1339)
df_detail['plot_synopsis'].replace('', np.nan, inplace=True)
df_detail.dropna(subset=['plot_synopsis'], inplace=True)

df_detail

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...
3,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...
5,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Bruce Banner (Eric Bana) is a research scienti...
...,...,...,...,...,...,...,...
1563,tt0120655,An abortion clinic worker with a special herit...,2h 10min,"[Adventure, Comedy, Drama]",7.3,1999-11-12,The film opens with a homeless man (Bud Cort) ...
1565,tt0276751,Twelve year old Marcus Brewer lives with his c...,1h 41min,"[Comedy, Drama, Romance]",7.1,2002-05-17,Will Freeman (Hugh Grant) is a 38-year-old bac...
1567,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher..."
1568,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ..."


#### Movie Title Data

In [3]:
# movie title
df_id = pd.read_csv('title.akas.tsv',delimiter ='\t', dtype={'isOriginalTitle':object})

# original title filtering
df_title = df_id[df_id['isOriginalTitle'] == '1']

# add title to movie detail
df_title = df_title[['titleId', 'title']]
df_title.rename(columns = {'titleId':'movie_id'}, inplace = True)
df_title = df_title.drop_duplicates(subset = "movie_id")

df_detail = df_detail.merge(df_title, how ='left', on ='movie_id')

In [4]:
df_detail.head()

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,title
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Patriot Games
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Last Vegas
2,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,The Treasure of the Sierra Madre
3,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Election
4,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Bruce Banner (Eric Bana) is a research scienti...,Hulk


#### Review Data

In [5]:
df_raw = pd.read_json('IMDB_reviews.json', lines=True)

df_raw

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"
...,...,...,...,...,...,...,...
573908,8 August 1999,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem...",10,The best teen movie of the nineties
573909,31 July 1999,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ...",9,Go - see the movie
573910,20 July 1999,tt0139239,ur0392750,False,"Go is the best movie I have ever seen, and I'v...",10,It's the best movie I've ever seen
573911,11 June 1999,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...,3,Haven't we seen this before?


### Synopsis filtering

In [6]:
# synopsis sentence split
synopsis = df_detail[['movie_id','plot_synopsis']]

sentences = []
for row in synopsis.itertuples():
    for sentence in sent_tokenize(row[2]):
        sentences.append((row[1], sentence))
new_df = pd.DataFrame(sentences, columns=['ID', 'SENTENCE'])

In [7]:
# create count dummy
new_df['count'] = new_df.groupby(new_df['ID']).transform('count')

# group by
gr = new_df.groupby(['ID','SENTENCE']).count()
ggr = gr.reset_index()

# cumulative sum
ggr['cumsum'] = ggr.groupby('ID')['count'].transform(pd.Series.cumsum)

# sentence count
ggr['total'] = ggr.groupby(ggr['ID']).transform('count').iloc[:,0]

# criterion (50%)
ggr['criterion'] = ggr['total']*0.5

# extract synopsis
df_sentence = ggr[ggr['cumsum'] > ggr['criterion']]

# sentence to document
df_sentence['plot_synopsis'] = df_sentence.groupby(['ID'])['SENTENCE'].transform(lambda x : ' '.join(x))
df_sentence = df_sentence.iloc[:,[0,6]]

# to dataframe
df_doc = df_sentence.drop_duplicates()
df_doc.rename(columns = {'ID':'movie_id'}, inplace = True)

In [8]:
# filtered movie syopsis (50%)
df_detail.drop(columns = ['plot_synopsis'], inplace = True)

df_detail = df_detail.merge(df_doc, how ='left', on ='movie_id',)

In [9]:
df_detail

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,title,plot_synopsis
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,Patriot Games,Ryan and Miller fight hand to hand; Miller is ...
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Last Vegas,"Just then Lisa shows up for the wedding, but P..."
2,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,The Treasure of the Sierra Madre,"One day, Curtin sees a gila monster (a venomou..."
3,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Election,"On one cold winter day, Jim is at a conference..."
4,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Hulk,He tells her as well about Bruce's power befor...
...,...,...,...,...,...,...,...,...
1334,tt0120655,An abortion clinic worker with a special herit...,2h 10min,"[Adventure, Comedy, Drama]",7.3,1999-11-12,Dogma,Loki also figures that he can get back on God'...
1335,tt0276751,Twelve year old Marcus Brewer lives with his c...,1h 41min,"[Comedy, Drama, Romance]",7.1,2002-05-17,About a Boy,Marcus decides that the only way to help his m...
1336,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,The Butterfly Effect,"In the year 1998, Evan Treborn (Ashton Kutcher..."
1337,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,Shame,He tries to look away but she keeps staring at...


### Review Filtering

In [10]:
# review filtering
df_review = df_raw.groupby('movie_id').count().sort_values(by='user_id', ascending = False)
df_review = df_review[df_review['review_date'] > 100]

df_list = df_review.index.values.tolist()

df_raw = df_raw[df_raw['movie_id'].isin(df_list)]
df_raw.shape

(562076, 7)

In [11]:
# null replace
df_raw['review_text'].replace('', np.nan, inplace=True)
df_raw.dropna(subset=['review_text'], inplace=True)
df_raw.shape

(562076, 7)

In [12]:
# 
df_raw['review_text'] = df_raw['review_text'].apply(str)

df_raw['text_count'] = df_raw['review_text'].apply(sent_tokenize).tolist()
df_raw['text_count'] = df_raw['text_count'].apply(len)

df_raw = df_raw[df_raw['text_count'] < 10]
df_raw.shape

(272977, 8)

In [13]:
df_raw = df_raw[['movie_id','user_id','is_spoiler','review_text']]
df_raw

Unnamed: 0,movie_id,user_id,is_spoiler,review_text
1,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...
6,tt0111161,ur6574726,True,I have been a fan of this movie for a long tim...
7,tt0111161,ur31182745,True,I made my account on IMDb Just to Rate this mo...
8,tt0111161,ur9871443,True,"A friend of mine listed ""The Shawshank Redempt..."
11,tt0111161,ur23169472,True,"To tell the truth, I am speechless. I am a you..."
...,...,...,...,...
573907,tt0139239,ur0415521,False,This type of movie is one that I would not ord...
573908,tt0139239,ur0100166,False,"Go is wise, fast and pure entertainment. Assem..."
573909,tt0139239,ur0021767,False,"Well, what shall I say. this one´s fun at any ..."
573911,tt0139239,ur0349105,False,Call this 1999 teenage version of Pulp Fiction...


### Lower-case

In [14]:
df_detail['plot_summary'] = df_detail['plot_summary'].str.lower()
df_detail['plot_synopsis'] = df_detail['plot_synopsis'].str.lower()
df_raw['review_text'] = df_raw['review_text'].str.lower()

### Tokenization

In [15]:
def tokenize(column):
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]    

In [16]:
df_detail['m_tokenized'] = df_detail.apply(lambda x: tokenize(x['plot_summary']), axis=1)
df_detail['p_tokenized'] = df_detail.apply(lambda x: tokenize(x['plot_synopsis']), axis=1)
df_raw['r_tokenized'] = df_raw.apply(lambda x: tokenize(x['review_text']), axis=1)

df_raw[['review_text', 'r_tokenized']].head()

Unnamed: 0,review_text,r_tokenized
1,the shawshank redemption is without a doubt on...,"[the, shawshank, redemption, is, without, a, d..."
6,i have been a fan of this movie for a long tim...,"[i, have, been, a, fan, of, this, movie, for, ..."
7,i made my account on imdb just to rate this mo...,"[i, made, my, account, on, imdb, just, to, rat..."
8,"a friend of mine listed ""the shawshank redempt...","[a, friend, of, mine, listed, the, shawshank, ..."
11,"to tell the truth, i am speechless. i am a you...","[to, tell, the, truth, i, am, speechless, i, a..."


### Stopwords

In [17]:
def remove_stopwords(tokenized_column):
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 're', 'not', 'would', 'could', 'all', 'be',
                      'good', 'many', 'some', 'nice', 'thank',
                      'rather', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'need', 'even', 'spoiler',
                      'right', 'movie', 'even', 'also', 'may', 'take', 'come', 'yes', 'no', 'oscar','award','imdb','review'])
    stops = set(stop_words)

    
    return [word for word in tokenized_column if not word in stops]

In [18]:
df_detail['m_stopwords_removed'] = df_detail.apply(lambda x: remove_stopwords(x['m_tokenized']), axis=1)
df_detail['p_stopwords_removed'] = df_detail.apply(lambda x: remove_stopwords(x['p_tokenized']), axis=1)
df_raw['r_stopwords_removed'] = df_raw.apply(lambda x: remove_stopwords(x['r_tokenized']), axis=1)

df_raw[['review_text', 'r_stopwords_removed']].head()

Unnamed: 0,review_text,r_stopwords_removed
1,the shawshank redemption is without a doubt on...,"[shawshank, redemption, without, doubt, one, b..."
6,i have been a fan of this movie for a long tim...,"[fan, long, seems, ever, time, life, hits, dow..."
7,i made my account on imdb just to rate this mo...,"[made, account, rate, heard, someone, tim, rob..."
8,"a friend of mine listed ""the shawshank redempt...","[friend, mine, listed, shawshank, redemption, ..."
11,"to tell the truth, i am speechless. i am a you...","[tell, truth, speechless, young, fanatic, fact..."


### Stemming

In [19]:
def apply_snowstemming(tokenized_column):
    stemmer = SnowballStemmer('english') 
    return [stemmer.stem(word) for word in tokenized_column]

In [20]:
def apply_porterstemming(tokenized_column):
    stemmer = PorterStemmer() 
    return [stemmer.stem(word) for word in tokenized_column]
#df_raw['r_porter_stemmed'] = df_raw.apply(lambda x: apply_porterstemming(x['r_stopwords_removed']), axis=1)

In [21]:
df_detail['m_snowball_stemmed'] = df_detail.apply(lambda x: apply_snowstemming(x['m_stopwords_removed']), axis=1)
df_detail['p_snowball_stemmed'] = df_detail.apply(lambda x: apply_snowstemming(x['p_stopwords_removed']), axis=1)
df_raw['r_snowball_stemmed'] = df_raw.apply(lambda x: apply_snowstemming(x['r_stopwords_removed']), axis=1)

df_raw[['review_text', 'r_snowball_stemmed']].head()

Unnamed: 0,review_text,r_snowball_stemmed
1,the shawshank redemption is without a doubt on...,"[shawshank, redempt, without, doubt, one, bril..."
6,i have been a fan of this movie for a long tim...,"[fan, long, seem, ever, time, life, hit, downw..."
7,i made my account on imdb just to rate this mo...,"[made, account, rate, heard, someon, tim, robi..."
8,"a friend of mine listed ""the shawshank redempt...","[friend, mine, list, shawshank, redempt, one, ..."
11,"to tell the truth, i am speechless. i am a you...","[tell, truth, speechless, young, fanat, fact, ..."


### re Stopword

In [22]:
df_detail['m_re_stopwords'] = df_detail.apply(lambda x: remove_stopwords(x['m_snowball_stemmed']), axis=1)
df_detail['p_re_stopwords'] = df_detail.apply(lambda x: remove_stopwords(x['p_snowball_stemmed']), axis=1)
df_raw['r_re_stopwords'] = df_raw.apply(lambda x: remove_stopwords(x['r_snowball_stemmed']), axis=1)

### Rejoin

In [23]:
def rejoin_words(tokenized_column):
    
    return ( " ".join(tokenized_column))

In [24]:
df_detail['summary'] = df_detail.apply(lambda x: rejoin_words(x['m_re_stopwords']), axis=1)
df_detail['synopsis'] = df_detail.apply(lambda x: rejoin_words(x['p_re_stopwords']), axis=1)
df_raw['review'] = df_raw.apply(lambda x: rejoin_words(x['r_re_stopwords']), axis=1)

df_raw[['review_text', 'review']].head()

Unnamed: 0,review_text,review
1,the shawshank redemption is without a doubt on...,shawshank redempt without doubt one brilliant ...
6,i have been a fan of this movie for a long tim...,fan long ever time life hit downward spiral al...
7,i made my account on imdb just to rate this mo...,made account rate heard someon tim robin done ...
8,"a friend of mine listed ""the shawshank redempt...",friend mine list shawshank redempt one time fa...
11,"to tell the truth, i am speechless. i am a you...",tell truth speechless young fanat fact film ca...


### Remove Duplicate

In [25]:
df_raw['review'] = (df_raw['review'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

df_detail['summary'] = (df_detail['summary'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

df_detail['synopsis'] = (df_detail['synopsis'].str.split()
                              .apply(lambda x: OrderedDict.fromkeys(x).keys())
                              .str.join(' '))

### Merge

In [26]:
df_sum_syn = df_detail[['movie_id','summary','synopsis']]

In [27]:
df_sum_syn

Unnamed: 0,movie_id,summary,synopsis
0,tt0105112,former cia analyst jack ryan england famili va...,ryan miller fight hand kill impal backward boa...
1,tt1204975,billi michael dougla paddi robert de niro arch...,lisa show wed paddi push billi pool approach s...
2,tt0040897,fred dobb bob curtin luck tampico mexico meet ...,one day curtin see gila monster venom lizard r...
3,tt0126886,traci flick run unoppos year high school stude...,one cold winter day jim confer washington walk...
4,tt0286716,bruce banner brilliant scientist cloudi past f...,tell well bruce power curt show door call done...
...,...,...,...
1334,tt0120655,abort clinic worker special heritag enlist pre...,loki figur get back god side resum posit angel...
1335,tt0276751,twelv year old marcus brewer live chronic depr...,marcus decid way help mother sing school varie...
1336,tt0289879,evan treborn grow small town singl work mother...,year evan treborn ashton kutcher suffer sever ...
1337,tt1723811,brandon man live new york unabl manag sex life...,tri look away keep stare smile clear flirt bat...


In [28]:
df_review = df_raw[['movie_id','user_id','is_spoiler','review']]

In [29]:
df_review

Unnamed: 0,movie_id,user_id,is_spoiler,review
1,tt0111161,ur0842118,True,shawshank redempt without doubt one brilliant ...
6,tt0111161,ur6574726,True,fan long ever time life hit downward spiral al...
7,tt0111161,ur31182745,True,made account rate heard someon tim robin done ...
8,tt0111161,ur9871443,True,friend mine list shawshank redempt one time fa...
11,tt0111161,ur23169472,True,tell truth speechless young fanat fact film ca...
...,...,...,...,...
573907,tt0139239,ur0415521,False,type one ordinarili see howev becom acquaint a...
573908,tt0139239,ur0100166,False,go wise fast pure entertain assembl except cas...
573909,tt0139239,ur0021767,False,well shall say fun rate three plotlin origin o...
573911,tt0139239,ur0349105,False,call teenag version pulp fiction whatev pleas ...


In [30]:
df_spoiler = pd.merge(df_review, df_sum_syn, how='inner', on ='movie_id')

In [31]:
df_spoiler.to_csv('IMDB/dataset/spacy-pre-sum_syn.csv', index= False)

In [32]:
df_spoiler

Unnamed: 0,movie_id,user_id,is_spoiler,review,summary,synopsis
0,tt0111161,ur0842118,True,shawshank redempt without doubt one brilliant ...,chronicl experi former success banker prison g...,andi said larg black stone meanwhil remain ste...
1,tt0111161,ur6574726,True,fan long ever time life hit downward spiral al...,chronicl experi former success banker prison g...,andi said larg black stone meanwhil remain ste...
2,tt0111161,ur31182745,True,made account rate heard someon tim robin done ...,chronicl experi former success banker prison g...,andi said larg black stone meanwhil remain ste...
3,tt0111161,ur9871443,True,friend mine list shawshank redempt one time fa...,chronicl experi former success banker prison g...,andi said larg black stone meanwhil remain ste...
4,tt0111161,ur23169472,True,tell truth speechless young fanat fact film ca...,chronicl experi former success banker prison g...,andi said larg black stone meanwhil remain ste...
...,...,...,...,...,...,...
257274,tt0139239,ur0415521,False,type one ordinarili see howev becom acquaint a...,told three perspect stori bunch young californ...,ronna lie motionless storyearli recruit cover ...
257275,tt0139239,ur0100166,False,go wise fast pure entertain assembl except cas...,told three perspect stori bunch young californ...,ronna lie motionless storyearli recruit cover ...
257276,tt0139239,ur0021767,False,well shall say fun rate three plotlin origin o...,told three perspect stori bunch young californ...,ronna lie motionless storyearli recruit cover ...
257277,tt0139239,ur0349105,False,call teenag version pulp fiction whatev pleas ...,told three perspect stori bunch young californ...,ronna lie motionless storyearli recruit cover ...
