In [1]:
import os, re, string
import warnings
from time import time

import pandas as pd
import tensorflow as tf
import nltk
#nltk.download(['punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'], quiet=True)
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from rouge import Rouge

from clean_text import clean_text

pd.set_option('display.max_colwidth', None) # will display full text in row
warnings.simplefilter(action='ignore', category=FutureWarning)

### Get IMDB Movie Review dataset

In [2]:
# The data is read from a local CSV file; alternatively, obtain it with the
# helper function under .utils.download_imdb_data()
df = pd.read_csv('../datasets/IMDB Dataset.csv')

In [3]:
# Report the size of the loaded frame as (rows, columns)
print(df.shape)

(50000, 2)


### Restrict to data subset (n=5000)

In [4]:
# Draw a fixed-seed random subset of 5000 rows...
df = df.sample(n=5000, random_state=999)
# ...then discard the old row labels so the index runs from 0 again
df = df.reset_index(drop=True)

### Clean data

We'll create two versions of the cleaned reviews. The first only converts numbers to words, removes sentence breaks, and corrects small grammatical errors; it leaves punctuation in place. The second applies the same modifications, but also removes stop words and punctuation and lemmatizes words. Both versions lowercase all words for consistency.

In [5]:
# Rename the raw review column to 'orig'. The result is reassigned instead of
# using inplace=True: in-place mutation gives no performance benefit, blocks
# method chaining, and makes it harder to see which cell produced the frame's
# current state.
df = df.rename(columns={'review': 'orig'})

In [6]:
# Light cleaning pass: stop-word removal, punctuation removal and
# lemmatization are all switched off
df['clean'] = clean_text().run(
    df['orig'],
    no_stop_words=False,
    remove_punctuation=False,
    lemmatize=False,
)

In [7]:
# Show the raw text of the review at row label 0
print(df.loc[0, 'orig'])

MST 3000 should do this movie. It is the worst acted movie I have ever seen. First of all, you find out that the shooter has no bank account and no history since leaving the army in 1993 and pays his rent in cash. There is no way in hell that a person like that would ever be allowed to be that close to a president not to mention a high profile job. Also, the head of security for the POTHUS would not be so emotional that he would start drinking into a haze if the president was shot. This movie sucked. I cannot express the extremite that this movie was. Every single actor was terrible. Even the chick at the trailer park. I crap on this garbage. What a waste of time.


In [8]:
# Show the lightly cleaned text of the review at row label 0
print(df.loc[0, 'clean'])

mst three thousand should do this movie. it is the worst acted movie i have ever seen. first of all, you find out that the shooter has no bank account and no history since leaving the army in one thousand, nine hundred and ninety-three and pays his rent in cash. there is no way in hell that a person like that would ever be allowed to be that close to a president not to mention a high profile job. also, the head of security for the pothus would not be so emotional that he would start drinking into a haze if the president was shot. this movie sucked. i cannot express the extremite that this movie was. every single actor was terrible. even the chick at the trailer park. i crap on this garbage. what a waste of time.


In [9]:
# Heavier cleaning pass: stop-word removal, punctuation removal and
# lemmatization are all switched on.
# Lemmatization takes a while, so the elapsed wall-clock time is printed.
start = time()
df['cleaner'] = clean_text().run(
    df['orig'],
    no_stop_words=True,
    remove_punctuation=True,
    lemmatize=True,
)
print('Total time:', round(time() - start, 0), 'seconds')

Total time: 103.0 seconds


In [10]:
# Display the fully cleaned text of the review at row label 0
df.loc[0, 'cleaner']

'mst three thousand movie bad act movie ever see first find shooter bank account history since leave army one thousand nine hundred ninety-three pay rent cash way hell person like would ever allow close president mention high profile job also head security pothus would emotional would start drinking haze president shot movie suck can not express extremite movie every single actor terrible even chick trailer park crap garbage waste time'

### TF-IDF

In [11]:
# TF-IDF vectorizer constructed with scikit-learn's default settings
vectorizer = TfidfVectorizer()

In [12]:
# Fit the vectorizer on the fully cleaned reviews; the result is a sparse
# document-term matrix (one row per review, one column per vocabulary term)
vectors = vectorizer.fit_transform(df['cleaner'].tolist()) # scipy.sparse.csr.csr_matrix

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps feature_names a plain list of str, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Dense copies of the matrix; denselist is a list of per-document rows of
# tf-idf values, aligned with feature_names by column position
dense = vectors.todense()
denselist = dense.tolist()

In [13]:
# Create lookup dict for each document's words
tfidf_dict = {}
for doc in range(len(denselist)):
    positions = [idx for idx, val in enumerate(denselist[doc]) if val > 0] # get index if word in document, i.e. > 0 tf-idf
    values = [val for val in denselist[doc] if val > 0] # tf-idf values in doc
    words = [feature_names[i] for i in positions] # words themselves
    tfidf_dict[doc] = dict(zip(words, values))

In [14]:
# Inspect the word -> tf-idf value mapping built for the first document
tfidf_dict[0]

{'account': 0.14912465763153843,
 'act': 0.05969006877501036,
 'actor': 0.06806123278147211,
 'allow': 0.11958192703788867,
 'also': 0.06226235135818268,
 'army': 0.1406273709624444,
 'bad': 0.056766053164434956,
 'bank': 0.1452845414326214,
 'can': 0.07285073684364074,
 'cash': 0.1406273709624444,
 'chick': 0.1423914844101084,
 'close': 0.1043565458430218,
 'crap': 0.11162874481253006,
 'drinking': 0.15507698793293076,
 'emotional': 0.12829586249867486,
 'even': 0.053984862609809316,
 'ever': 0.1365432679149389,
 'every': 0.07768137046790471,
 'express': 0.14740904448424874,
 'extremite': 0.22952484656021518,
 'find': 0.07244455544104945,
 'first': 0.06087478486823326,
 'garbage': 0.13139268021743922,
 'haze': 0.19693960036262706,
 'head': 0.09624693846802221,
 'hell': 0.10961140057074911,
 'high': 0.09217110148804472,
 'history': 0.10961140057074911,
 'hundred': 0.08268280087294669,
 'job': 0.09203906717633571,
 'leave': 0.10261844570241098,
 'like': 0.044348638821792984,
 'mention':