In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

In [2]:
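# Alternative loader for Google Colab: mount Drive and read the dataset from there (kept commented out for local runs)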

#from google.colab import drive
#drive.mount("/content/drive")
#df = pd.read_json('/content/drive/My Drive/Colab Notebooks/IMDB_reviews.json', lines=True)
#df.head()

In [3]:
# Load the IMDB reviews from the local Raw_Data folder. The path is relative,
# so it resolves against the directory the notebook kernel runs in.
# lines=True reads the file as JSON Lines (one JSON record per line).

df = pd.read_json('../../Raw_Data/IMDB_reviews.json', lines=True)
df.head()

Unnamed: 0,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
0,True,tt0111161,10,10 February 2006,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",ur1898687
1,True,tt0111161,10,6 September 2000,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...,ur0842118
2,True,tt0111161,8,3 August 2001,The best story ever told on film,I believe that this film is the best story eve...,ur1285640
3,True,tt0111161,10,1 September 2002,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ...",ur1003471
4,True,tt0111161,8,20 May 2004,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...,ur0226855


In [4]:
# Work on a small subset of the data. .loc slices by label and includes the end
# label, so with the default 0-based index this keeps rows 0..1000 (1001 rows).
# NOTE(review): switch to .iloc[:1000] if exactly 1000 rows were intended.
df = df.loc[:1000, :].copy()

In [5]:
# Keep only the spoiler label and the two free-text fields; the id, rating and
# date columns are discarded from here on.
df = df[['is_spoiler', 'review_summary', 'review_text']]
df.head()

Unnamed: 0,is_spoiler,review_summary,review_text
0,True,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt..."
1,True,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...
2,True,The best story ever told on film,I believe that this film is the best story eve...
3,True,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ..."
4,True,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...


In [6]:
# Merge the summary and the review body into one text field, separated by a space.
df['review'] = df['review_summary'] + ' ' + df['review_text']

In [7]:
# The two source text columns are now duplicated inside 'review', so drop them.
df = df.drop(columns=['review_summary', 'review_text'])

In [8]:
# Encode the boolean label as an integer: True -> 1, False -> 0.
df['is_spoiler'] = df['is_spoiler'].map({True:1, False:0})

In [9]:
# Preview the frame after merging the text columns and encoding the label.
df.head()

Unnamed: 0,is_spoiler,review
0,1,A classic piece of unforgettable film-making. ...
1,1,Simply amazing. The best film of the 90's. The...
2,1,The best story ever told on film I believe tha...
3,1,"Busy dying or busy living? **Yes, there are SP..."
4,1,"Great story, wondrously told and acted At the ..."


In [10]:
# (rows, columns) of the working subset.
df.shape

(1001, 2)

In [11]:
# Distinct values per column. A 'review' count lower than the row count means
# some review texts appear more than once.
df.nunique()

is_spoiler      2
review        999
dtype: int64

In [12]:
# Class balance of the label. In this subset spoilers (1) heavily outnumber
# non-spoilers (0), so the sample is strongly imbalanced.
df['is_spoiler'].value_counts()

1    956
0     45
Name: is_spoiler, dtype: int64

In [13]:
# Count missing values per column. clean() below calls string methods on each
# review, so a NaN entry would make the .apply() fail.
df.isna().sum()

is_spoiler    0
review        0
dtype: int64

In [14]:
# Translation table that maps every character in string.punctuation to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# English stop words, loaded on first use and then reused for every row.
# Loading is deferred because the corpus is only downloaded in a later cell.
_STOP_WORDS_CACHE = {}


def clean(text, tokenizer=None, stop_words=None):
    """Normalise a raw review into a space-separated string of content words.

    Steps, in order: replace punctuation with spaces, lowercase, tokenize,
    keep purely alphabetic tokens, drop stop words.

    Parameters
    ----------
    text : str
        Raw review text.
    tokenizer : callable, optional
        Function turning a string into an iterable of tokens. Defaults to
        NLTK's ``word_tokenize``.
    stop_words : collection of str, optional
        Lowercase words to remove. Defaults to NLTK's English stop-word list,
        which is read once and cached.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if tokenizer is None:
        tokenizer = word_tokenize
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']

    # One pass over the text instead of one str.replace per punctuation mark.
    lowercased = text.translate(_PUNCTUATION_TO_SPACE).lower()

    # isalpha() rejects every token containing a non-letter, not only numbers.
    kept = [
        word for word in tokenizer(lowercased)
        if word.isalpha() and word not in stop_words
    ]

    return " ".join(kept)

In [15]:
# clean() needs two NLTK resources: the stop-word corpus and the Punkt models
# used by word_tokenize. Without Punkt a fresh environment raises LookupError.
# Newer NLTK releases look for 'punkt_tab' and older ones for 'punkt', so both
# are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

df['clean_reviews'] = df['review'].apply(clean)

df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juanchimdo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,is_spoiler,review,clean_reviews
0,1,A classic piece of unforgettable film-making. ...,classic piece unforgettable film making oscar ...
1,1,Simply amazing. The best film of the 90's. The...,simply amazing best film shawshank redemption ...
2,1,The best story ever told on film I believe tha...,best story ever told film believe film best st...
3,1,"Busy dying or busy living? **Yes, there are SP...",busy dying busy living yes spoilers film emoti...
4,1,"Great story, wondrously told and acted At the ...",great story wondrously told acted heart extrao...


In [16]:
# The raw text is no longer needed once 'clean_reviews' exists. Rebind df
# instead of using inplace=True, matching the earlier drop of the source columns.
df = df.drop(columns='review')

In [17]:
# Confirm that only the label and the cleaned text remain.
df.head()

Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piece unforgettable film making oscar ...
1,1,simply amazing best film shawshank redemption ...
2,1,best story ever told film believe film best st...
3,1,busy dying busy living yes spoilers film emoti...
4,1,great story wondrously told acted heart extrao...


In [18]:
# NOTE(review): nothing in the visible cells reads the WordNet corpus (the
# stemming below uses PorterStemmer). Presumably this is for a later
# WordNet-based step such as lemmatization; confirm, otherwise it can be removed.
nltk.download('wordnet')

# Single PorterStemmer shared by all calls; created on first use so that it is
# not rebuilt for every row passed through .apply().
_STEMMER_CACHE = {}


def stemm(text, stemmer=None, tokenizer=None):
    """Stem every token of ``text`` and return them joined by single spaces.

    Parameters
    ----------
    text : str
        Text to stem.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to a shared NLTK ``PorterStemmer`` instance.
    tokenizer : callable, optional
        Function turning a string into an iterable of tokens. Defaults to
        NLTK's ``word_tokenize``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if there are none).
    """
    if stemmer is None:
        if 'porter' not in _STEMMER_CACHE:
            _STEMMER_CACHE['porter'] = PorterStemmer()
        stemmer = _STEMMER_CACHE['porter']
    if tokenizer is None:
        tokenizer = word_tokenize

    return " ".join(stemmer.stem(word) for word in tokenizer(text))

# Replace the cleaned text with its stemmed form. The column is overwritten
# with a transformation of itself, so re-running this cell stems the
# already-stemmed text again; re-run the cleaning cell first.
df['clean_reviews'] = df['clean_reviews'].apply(stemm)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juanchimdo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piec unforgett film make oscar year sh...
1,1,simpli amaz best film shawshank redempt withou...
2,1,best stori ever told film believ film best sto...
3,1,busi die busi live ye spoiler film emot impact...
4,1,great stori wondrous told act heart extraordin...
