In [2]:
import os
import re
from collections import OrderedDict

import nltk
import numpy as np
import pandas as pd
from gensim.summarization import summarize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer

ImportError: DLL load failed: 지정된 모듈을 찾을 수 없습니다.

### Data Load

In [2]:
# Load the movie details (line-delimited JSON) and keep only the id and the
# synopsis text.
df_detail = pd.read_json('IMDB_movie_details.json', lines=True)
df_detail = df_detail[['movie_id', 'plot_synopsis']]

# Treat empty synopses as missing and drop those rows.
# The result is assigned back instead of calling `.replace(..., inplace=True)`
# on `df_detail['plot_synopsis']`: that form mutates a temporary Series and,
# with pandas Copy-on-Write, never reaches the DataFrame.
df_detail['plot_synopsis'] = df_detail['plot_synopsis'].replace('', np.nan)
df_detail = df_detail.dropna(subset=['plot_synopsis'])

df_detail.head()

Unnamed: 0,movie_id,plot_synopsis
0,tt0105112,"Jack Ryan (Ford) is on a ""working vacation"" in..."
1,tt1204975,Four boys around the age of 10 are friends in ...
3,tt0040897,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...
4,tt0126886,Jim McAllister (Matthew Broderick) is a much-a...
5,tt0286716,Bruce Banner (Eric Bana) is a research scienti...


In [3]:
# Read the reviews file (line-delimited JSON) and narrow it to the four
# columns used by the rest of the notebook.
df_raw = (
    pd.read_json('IMDB_reviews.json', lines=True)
    [['movie_id', 'user_id', 'is_spoiler', 'review_text']]
)

df_raw.head()

Unnamed: 0,movie_id,user_id,is_spoiler,review_text
0,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt..."
1,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...
2,tt0111161,ur1285640,True,I believe that this film is the best story eve...
3,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ..."
4,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...


In [4]:
def _summarize_sentences(text, ratio=0.4):
    """Return an extractive summary of ``text`` as a list of sentences.

    gensim's ``summarize`` raises ``ValueError`` when the input has a single
    sentence and may return nothing for very short inputs. In both cases the
    original text is kept as the only "sentence", so no synthetic filler text
    has to be appended to the synopsis (filler could be picked as the summary
    and leak into the tokens).

    Parameters
    ----------
    text : str
        Full plot synopsis.
    ratio : float
        Fraction of sentences to keep, forwarded to ``summarize``.

    Returns
    -------
    list of str
        Summary sentences, or ``[text]`` when no summary could be produced.
    """
    try:
        sentences = summarize(text, ratio=ratio, split=True)
    except ValueError:
        # Too short to summarise; fall back to the text itself below.
        sentences = []
    return sentences or [text]


df_detail['summary'] = df_detail['plot_synopsis'].map(_summarize_sentences)

In [5]:
# Flatten each list of summary sentences into a single string.
# Sentences are joined with a space, not a comma: a comma leaves no whitespace
# after each sentence-final period ("...end.,Next ..."), which can keep the
# tokenizer from splitting the period off, so the word before it would fail
# the alphabetic-only filter applied after tokenization.
df_detail['summary'] = [' '.join(map(str, sentences)) for sentences in df_detail['summary']]

### Lower-case

In [6]:
# Lower-case both text columns.
# NOTE: 'plot_synopsis' is overwritten with the lower-cased *summary* column,
# so from here on it no longer holds the full synopsis.
df_raw['review_text'] = df_raw['review_text'].str.lower()
df_detail['plot_synopsis'] = df_detail['summary'].str.lower()

### Tokenization

In [7]:
def tokenize(column):
    """Split one text string into word tokens and keep the alphabetic ones.

    ``column`` is a single string (one cell value), which is passed to
    ``nltk.word_tokenize``. Tokens for which ``str.isalpha()`` is false
    (punctuation, numbers, tokens containing apostrophes, ...) are discarded.

    Returns a list of the remaining tokens in their original order.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(column)))

In [8]:
# Tokenize every text cell. Mapping over the single source column gives the
# same result as a row-wise `DataFrame.apply(axis=1)` without building a row
# object for every record.
df_detail['p_tokenized'] = df_detail['plot_synopsis'].map(tokenize)
df_raw['r_tokenized'] = df_raw['review_text'].map(tokenize)

df_raw[['review_text', 'r_tokenized']].head()

Unnamed: 0,review_text,r_tokenized
0,"in its oscar year, shawshank redemption (writt...","[in, its, oscar, year, shawshank, redemption, ..."
1,the shawshank redemption is without a doubt on...,"[the, shawshank, redemption, is, without, a, d..."
2,i believe that this film is the best story eve...,"[i, believe, that, this, film, is, the, best, ..."
3,"**yes, there are spoilers here**this film has ...","[yes, there, are, spoilers, here, this, film, ..."
4,at the heart of this extraordinary movie is a ...,"[at, the, heart, of, this, extraordinary, movi..."


### Stopwords

In [9]:
# Holds the English stop-word set once it has been loaded, so the corpus is
# read a single time instead of once per call (i.e. once per row).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def remove_stopwords(tokenized_column):
    """Return the tokens of ``tokenized_column`` that are not English stop words.

    Parameters
    ----------
    tokenized_column : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The non-stop-word tokens, in their original order.
    """
    stops = _english_stopwords()
    return [word for word in tokenized_column if word not in stops]

In [10]:
# Drop stop words from every token list. Mapping over the single source column
# gives the same result as a row-wise `DataFrame.apply(axis=1)` without
# building a row object for every record.
df_detail['p_stopwords_removed'] = df_detail['p_tokenized'].map(remove_stopwords)
df_raw['r_stopwords_removed'] = df_raw['r_tokenized'].map(remove_stopwords)

df_raw[['review_text', 'r_stopwords_removed']].head()

Unnamed: 0,review_text,r_stopwords_removed
0,"in its oscar year, shawshank redemption (writt...","[oscar, year, shawshank, redemption, written, ..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redemption, without, doubt, one, b..."
2,i believe that this film is the best story eve...,"[believe, film, best, story, ever, told, film,..."
3,"**yes, there are spoilers here**this film has ...","[yes, spoilers, film, emotional, impact, find,..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinary, movie, brilliant, indel..."


### Stemming

In [11]:
def apply_snowstemming(tokenized_column):
    """Stem each token with NLTK's English Snowball stemmer.

    Takes an iterable of tokens and returns a new list with the stemmed
    tokens in the same order.
    """
    stem = SnowballStemmer('english').stem
    return list(map(stem, tokenized_column))

In [12]:
def apply_porterstemming(tokenized_column):
    """Stem each token with NLTK's Porter stemmer.

    Takes an iterable of tokens and returns a new list with the stemmed
    tokens in the same order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokenized_column))

In [13]:
# Snowball-stem the synopsis tokens. Mapping over the single source column
# gives the same result as a row-wise `DataFrame.apply(axis=1)` without
# building a row object for every record.
df_detail['p_snowball_stemmed'] = df_detail['p_stopwords_removed'].map(apply_snowstemming)

In [14]:
# Stem the review tokens with both stemmers so their output can be shown side
# by side. Mapping over the single source column gives the same result as a
# row-wise `DataFrame.apply(axis=1)` without building a row object per record.
df_raw['r_snowball_stemmed'] = df_raw['r_stopwords_removed'].map(apply_snowstemming)
df_raw['r_porter_stemmed'] = df_raw['r_stopwords_removed'].map(apply_porterstemming)

df_raw[['review_text', 'r_snowball_stemmed', 'r_porter_stemmed']].head()

Unnamed: 0,review_text,r_snowball_stemmed,r_porter_stemmed
0,"in its oscar year, shawshank redemption (writt...","[oscar, year, shawshank, redempt, written, dir...","[oscar, year, shawshank, redempt, written, dir..."
1,the shawshank redemption is without a doubt on...,"[shawshank, redempt, without, doubt, one, bril...","[shawshank, redempt, without, doubt, one, bril..."
2,i believe that this film is the best story eve...,"[believ, film, best, stori, ever, told, film, ...","[believ, film, best, stori, ever, told, film, ..."
3,"**yes, there are spoilers here**this film has ...","[yes, spoiler, film, emot, impact, find, hard,...","[ye, spoiler, film, emot, impact, find, hard, ..."
4,at the heart of this extraordinary movie is a ...,"[heart, extraordinari, movi, brilliant, indel,...","[heart, extraordinari, movi, brilliant, indel,..."


### Rejoin

In [15]:
def rejoin_words(tokenized_column):
    """Concatenate a sequence of tokens into one space-separated string."""
    separator = " "
    return separator.join(tokenized_column)

In [16]:
# Turn the Snowball-stemmed token lists back into plain strings. Mapping over
# the single source column gives the same result as a row-wise
# `DataFrame.apply(axis=1)` without building a row object for every record.
df_detail['synopsis'] = df_detail['p_snowball_stemmed'].map(rejoin_words)
df_raw['review'] = df_raw['r_snowball_stemmed'].map(rejoin_words)

df_raw[['review_text', 'review']].head()

Unnamed: 0,review_text,review
0,"in its oscar year, shawshank redemption (writt...",oscar year shawshank redempt written direct fr...
1,the shawshank redemption is without a doubt on...,shawshank redempt without doubt one brilliant ...
2,i believe that this film is the best story eve...,believ film best stori ever told film tell rob...
3,"**yes, there are spoilers here**this film has ...",yes spoiler film emot impact find hard write c...
4,at the heart of this extraordinary movie is a ...,heart extraordinari movi brilliant indel perfo...


In [17]:
# Show the first rejoined (stemmed) review in full as a sanity check.
df_raw['review'].iloc[0]

'oscar year shawshank redempt written direct frank darabont novella rita hayworth shawshank redempt stephen king nomin seven academi award walk away zero best pictur went forrest gump shawshank pulp fiction happi nomin cours hindsight histori look back gump good film pulp redempt rememb best pulp howev success word go make huge splash cann make american master two film andi dufresn success come easi fortun failur life open screen take film fell fast theatr finish mere reason failur mani first titl clunker icon fan today peopl knew care dvd tim robbin laugh recount fan congratul movi film nightmar drama tough sell women stori love two best friend spell winner men worst movi slow molass desson thomson write washington post wander subplot everi opportun ignor abund narrat exit point settl final weak make film set open aerial shot prison total amaz piec architectur strong gothic design immedi prison becom charact cast shadow film tall stone wall stretch everi shot tower men contain blot me

### Remove Duplicate Words

In [18]:
def _unique_words(text):
    """Return ``text`` with repeated words removed.

    The text is split on whitespace; only the first occurrence of each word
    is kept, in original order, and the words are rejoined with single spaces.
    """
    return ' '.join(OrderedDict.fromkeys(text.split()))


df_raw['review'] = df_raw['review'].map(_unique_words)

df_detail['synopsis'] = df_detail['synopsis'].map(_unique_words)

In [19]:
# Show the first review again, now that repeated words have been removed.
df_raw['review'].iloc[0]

'oscar year shawshank redempt written direct frank darabont novella rita hayworth stephen king nomin seven academi award walk away zero best pictur went forrest gump pulp fiction happi cours hindsight histori look back good film rememb howev success word go make huge splash cann american master two andi dufresn come easi fortun failur life open screen take fell fast theatr finish mere reason mani first titl clunker icon fan today peopl knew care dvd tim robbin laugh recount congratul movi nightmar drama tough sell women stori love friend spell winner men worst slow molass desson thomson write washington post wander subplot everi opportun ignor abund narrat exit point settl final weak set aerial shot prison total amaz piec architectur strong gothic design immedi becom charact cast shadow tall stone wall stretch tower contain blot memori outsid world hold onto hope music sandi beach zihuatanejo need say forget place made someth insid ca touch red morgan freeman think much pick glass milk

### Merge

In [20]:
# Keep only the join key and the processed synopsis text.
df_synopsis = df_detail.loc[:, ['movie_id', 'synopsis']]

In [21]:
# Display the synopsis table (movie_id + processed synopsis).
df_synopsis

Unnamed: 0,movie_id,synopsis
0,tt0105112,seen deliv lectur royal naval academi ryan wif...
1,tt1204975,one day get scuffl store young thug steal bott...
3,tt0040897,intrigu local pub stori howard walter huston o...
4,tt0126886,jim mcallist matthew broderick high school his...
5,tt0286716,work focus use nanom gamma radiat cure cancer ...
...,...,...
1563,tt0120655,film open homeless man bud cort desert new jer...
1565,tt0276751,spend free time smoke watch televis read pop s...
1567,tt0289879,year evan treborn ashton kutcher suffer sever ...
1568,tt1723811,wed engag ring exit train disappear work brand...


In [22]:
# Keep the join key, the reviewer id, the spoiler label and the processed review text.
df_review = df_raw.loc[:, ['movie_id', 'user_id', 'is_spoiler', 'review']]

In [23]:
# Display the review table (movie_id, user_id, is_spoiler, processed review).
df_review

Unnamed: 0,movie_id,user_id,is_spoiler,review
0,tt0111161,ur1898687,True,oscar year shawshank redempt written direct fr...
1,tt0111161,ur0842118,True,shawshank redempt without doubt one brilliant ...
2,tt0111161,ur1285640,True,believ film best stori ever told tell robbin p...
3,tt0111161,ur1003471,True,yes spoiler film emot impact find hard write c...
4,tt0111161,ur0226855,True,heart extraordinari movi brilliant indel perfo...
...,...,...,...,...
573908,tt0139239,ur0100166,False,go wise fast pure entertain assembl except cas...
573909,tt0139239,ur0021767,False,well shall say fun rate three plotlin origin o...
573910,tt0139239,ur0392750,False,go best movi ever seen lot read review notic m...
573911,tt0139239,ur0349105,False,call teenag version pulp fiction whatev want p...


In [24]:
# Attach the processed synopsis to every review of the same movie. The inner
# join keeps only reviews whose movie_id also appears in df_synopsis.
df_spoiler = df_review.merge(df_synopsis, on='movie_id', how='inner')

In [25]:
# Write the merged dataset to disk without the index column. The target
# folder is created first because `to_csv` does not create missing
# directories and would fail on a fresh checkout.
output_path = 'IMDB/dataset/gensim-spacy-pre-unq.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_spoiler.to_csv(output_path, index=False)