# IMDB Review Classification

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

## Data Ingestion and Exploration

In [2]:
data = pd.read_csv("data/labeledTrainData.tsv", sep="\t")

In [None]:
data.head(10)

## Build Bar Graph of Positive and Negative Reviews

In [None]:
pos_data = data[data["sentiment"] == 'pos']
neg_data = data[data["sentiment"] == 'neg']

In [None]:
pos_data.shape, neg_data.shape

In [None]:
plt.bar(10,len(pos_data),3, label="Positive")
plt.bar(15,len(neg_data),3, label="Negative")
plt.legend()
plt.ylabel('Number of Reviews')
plt.title('Proportion of Reviews')
plt.show()

## Data Cleansing and Feature Extraction

In [3]:
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub('<br\s?/>', ' ', text)
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = re.sub('\s{2,}', ' ', text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token, "v") for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

data['processed_reviews'] = data['review'].apply(lambda x: clean_text(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joshuarobison/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
cv_fit=cv.fit_transform(data["processed_reviews"])

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(cv_fit)
X_train_tfidf.shape

(25000, 93331)

## Create Word Count Histogram

In [None]:
words = cv.get_feature_names()
word_counts = cv_fit.toarray().sum(axis=0)
word_count_df = pd.DataFrame({'words':words, 'count':word_counts})

In [None]:
word_count_df.tail(10)

In [None]:
plt.hist(word_counts, bins=[0, 100, 200, 300, 400, 500, 600, 1000])
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Word Count Frequency')
plt.show()

In [None]:
word_counts.max()

In [None]:
word_count_df[word_count_df["count"]>=100000]

## Create and Test Naive Bayes Model

In [None]:
X_train = data["review"]
y_target = data["sentiment"]

In [None]:
y_target

In [None]:
data.head(5)

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, data["sentiment"])



### Evaluate Naive Bayes Model on Test Data

In [6]:
test_data=pd.read_csv("data/testData.tsv", sep="\t")

test_data["review"]=test_data.review.apply(lambda x: clean_text(x))
test_data["sentiment"] = test_data["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)

In [7]:
test_data.head(3)

Unnamed: 0,id,review,sentiment
0,12311_10,naturally film whos main theme mortality nosta...,1
1,8348_2,movie disaster within disaster film full great...,0
2,5828_4,movie kid saw tonight child love one point kid...,0


In [9]:
cv_fit_test = cv.transform(test_data["review"])
X_test_tfidf = tfidf_transformer.transform(cv_fit_test)

In [15]:
predicted_test=clf.predict(X_test_tfidf)
np.mean(predicted_test == test_data['sentiment'])

0.83056

## Create and Test Neural Network

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data['processed_reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(data['processed_reviews'])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = data['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

In [None]:
y_test = df_test["sentiment"]
list_sentences_test = df_test["review"]
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [None]:
prediction = model.predict(X_te)

In [None]:
y_pred = (prediction > 0.5)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix
print('F1-score: {0}'.format(f1_score(y_pred, y_test)))
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)