In [1]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_json('./../../raw_data/IMDB_reviews.json', lines=True)
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [3]:
df_reviews = df[['review_date','review_text','review_summary','is_spoiler']]

In [12]:
df_reviews['reviews'] = df_reviews['review_text'] + ' ' + df_reviews['review_summary']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
df_reviews.head()

Unnamed: 0,review_date,review_text,review_summary,is_spoiler,reviews
0,10 February 2006,"In its Oscar year, Shawshank Redemption (writt...",A classic piece of unforgettable film-making.,True,"In its Oscar year, Shawshank Redemption (writt..."
1,6 September 2000,The Shawshank Redemption is without a doubt on...,Simply amazing. The best film of the 90's.,True,The Shawshank Redemption is without a doubt on...
2,3 August 2001,I believe that this film is the best story eve...,The best story ever told on film,True,I believe that this film is the best story eve...
3,1 September 2002,"**Yes, there are SPOILERS here**This film has ...",Busy dying or busy living?,True,"**Yes, there are SPOILERS here**This film has ..."
4,20 May 2004,At the heart of this extraordinary movie is a ...,"Great story, wondrously told and acted",True,At the heart of this extraordinary movie is a ...


In [21]:
df_reviews_test = df_reviews.iloc[:10000, [3,4]].copy()

In [22]:
df_reviews_test.shape

(10000, 2)

In [23]:
df_reviews_test.head()

Unnamed: 0,is_spoiler,reviews
0,True,"In its Oscar year, Shawshank Redemption (writt..."
1,True,The Shawshank Redemption is without a doubt on...
2,True,I believe that this film is the best story eve...
3,True,"**Yes, there are SPOILERS here**This film has ..."
4,True,At the heart of this extraordinary movie is a ...


# Clean Data

In [24]:
def clean_data(text):
    
    for punctuation in string.punctuation:
        
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    
    tokenized = word_tokenize(lowercased) # Tokenize
    
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    
    stop_words = set(stopwords.words('english')) # Make stopword list
    
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    
    return ' '.join(without_stopwords)

In [25]:
df_reviews_test['reviews'] = df_reviews_test['reviews'].apply(clean_data) 

df_reviews_test.head()

Unnamed: 0,is_spoiler,reviews
0,True,oscar year shawshank redemption written direct...
1,True,shawshank redemption without doubt one brillia...
2,True,believe film best story ever told film tell ti...
3,True,yes spoilers film emotional impact find hard w...
4,True,heart extraordinary movie brilliant indelible ...


In [26]:
df_reviews_test.to_csv(r'test_data.csv', index=False)

In [2]:
df_test = pd.read_csv(r'test_data.csv')
df_test.head()

Unnamed: 0,is_spoiler,reviews
0,True,oscar year shawshank redemption written direct...
1,True,shawshank redemption without doubt one brillia...
2,True,believe film best story ever told film tell ti...
3,True,yes spoilers film emotional impact find hard w...
4,True,heart extraordinary movie brilliant indelible ...


In [3]:
df_test.shape

(10000, 2)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(min_df=0.2, max_df = 0.8)

X = tf_idf_vectorizer.fit_transform(df_test['reviews'])

y = df_test['is_spoiler']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

nb_model.fit(X_train, y_train)
nb_model.score(X_test, y_test)

cv = cross_validate(nb_model, X_train, y_train, scoring='precision', cv=10)

cv['test_score'].mean()

0.5929817615609536