In [47]:
import os
import glob
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
import operator

from sklearn import tree
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from collections import Counter
from sklearn.preprocessing import LabelEncoder, scale, PolynomialFeatures
from sklearn.datasets import load_boston
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from datetime import datetime
from scipy import stats
import torch.utils.data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the libraries
# imported above are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Allow pandas to display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

In [48]:
# Load the raw train/test CSV files.
# keep_default_na=False stops pandas from converting empty cells or strings such as "NA"
# into NaN, so those values are kept as the literal text found in the file.
train_csv = pd.read_csv("Dataset/train.csv", keep_default_na=False)
test_csv = pd.read_csv("Dataset/test.csv", keep_default_na=False)

# NOTE(review): `data` and `labels` are not read by any cell visible in this section —
# possibly leftovers; confirm before removing.
data = {}
labels = {}

In [49]:
# Shape of the raw training frame as (rows, columns).
train_csv.shape

(5959, 3)

In [50]:
def preprocess_data(dataset):
    """Merge the review title into the review text and drop the title column.

    The result is built on a new DataFrame: the caller's `dataset` is left untouched,
    so re-running the calling cell does not prepend the title a second time.

    NOTE(review): a later cell in this notebook defines another function with the same
    name (the cache-aware tokenizer); once that cell has run, this definition is shadowed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns "Review Title" and "Review Text".

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset` where "Review Text" is "<title> <text>" and
        "Review Title" has been removed. All other columns are kept as-is.
    """
    # Titles are coerced to str so numeric-looking titles still concatenate.
    combined_text = dataset["Review Title"].map(str) + " " + dataset['Review Text']
    # assign() returns a new frame, so the input's "Review Text" column is never overwritten.
    merged = dataset.assign(**{'Review Text': combined_text})
    return merged.drop(['Review Title'], axis=1)

In [51]:
# Merge title + text for both splits (uses the single-argument preprocess_data defined above).
train = preprocess_data(train_csv)
test = preprocess_data(test_csv)

# Model input is the merged review text; the target is the "topic" column.
# Only the training split is read for a target here.
train_X = train["Review Text"]
train_y = train["topic"]
test_X = test["Review Text"]

# Disabled alternative: carve a validation split out of the training data.
#train_X, test_X, train_y, test_y = train_test_split(train["Review Text"], train["topic"], random_state=33)
# NOTE(review): the "IMDb" wording in this message looks like a leftover from another
# notebook — the target here is a "topic" column; confirm and reword if so.
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))
# Disabled debugging aid: print the first ten merged reviews.
#s = " ".join(train["Review Text"][0:10])
#print(s)

IMDb reviews (combined): train = 5959, test = 2553


In [52]:
import numpy as np
import re
from collections import Counter

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Parameters
    ----------
    data : iterable of iterables of words
        Tokenized sentences; empty sentences and an empty `data` are allowed.
    vocab_size : int
        Total vocabulary size including the two reserved ids (0 = 'no word',
        1 = 'infrequent word'), so at most `vocab_size - 2` words are kept.

    Returns
    -------
    dict
        Maps each kept word to an integer id starting at 2, where id 2 is the most
        frequent word. Words with equal counts keep their order of first appearance.
    """
    # Count by plain iteration instead of np.concatenate: concatenate raises on empty
    # input and converts every key to numpy.str_, which then leaks into the pickled dict.
    word_counts = Counter(word for sentence in data for word in sentence)

    # Most frequent first; sorted() is stable, so ties stay in first-seen order.
    sorted_words = sorted(word_counts, key=word_counts.get, reverse=True)

    # Reserve ids 0 and 1. Clamp at 0 so vocab_size < 2 yields an empty vocabulary
    # rather than a negative slice bound (which would keep almost every word).
    n_words = max(vocab_size - 2, 0)

    # This is what we are building: a dictionary that translates words into integers.
    return {word: idx + 2 for idx, word in enumerate(sorted_words[:n_words])}

In [53]:
#word_dict = build_dict(train_X)
# print(word_dict)
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once.

    The corpus is downloaded only if it is not already installed, instead of calling
    nltk.download() for every review.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        try:
            cached = frozenset(stopwords.words("english"))
        except LookupError:
            # Corpus not installed yet: fetch it once, then retry.
            nltk.download("stopwords", quiet=True)
            cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def review_to_words(review):
    """Turn one raw review string into a list of stemmed, stopword-free tokens.

    Steps: strip HTML tags, lower-case, replace every character that is not a letter
    or digit with a space, split on whitespace, drop English stopwords, then apply
    Porter stemming.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Lower-case and keep only alphanumerics

    # Set membership is O(1); the previous version rebuilt the stopword list for every word.
    stop_words = _english_stopwords()
    # One stemmer per call instead of one per word.
    stemmer = PorterStemmer()

    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

In [54]:
# Spot-check the tokenizer on the training review at index 0.
review_to_words(train_X[0])

['useless',
 'noth',
 'help',
 'lost',
 'even',
 'work',
 'eat',
 'healthi',
 'curb',
 'appetit',
 'anyth']

In [55]:
import pickle

cache_dir = os.path.join("cache", "key_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists


def _lengths_differ(cached_items, current_items):
    """Return True when both arguments have a length and the lengths differ."""
    try:
        return len(cached_items) != len(current_items)
    except TypeError:
        # At least one side has no len() (e.g. a generator): nothing to compare.
        return False


def _read_word_cache(cache_path, data_train, data_test):
    """Load the cache at `cache_path`; return None if it is missing, unreadable or stale."""
    try:
        with open(cache_path, "rb") as f:
            # NOTE(review): pickle.load can execute arbitrary code — only use trusted cache files.
            cached = pickle.load(f)
    except FileNotFoundError:
        print("File not found")
        return None
    except Exception as exc:
        # Best effort: any other read/unpickle failure means "recompute".
        # Unlike a bare `except`, this does not swallow KeyboardInterrupt/SystemExit.
        print("Unable to read cache file ({}: {})".format(type(exc).__name__, exc))
        return None

    required_keys = ("words_train", "words_test", "labels_train")
    if not isinstance(cached, dict) or any(key not in cached for key in required_keys):
        print("Cache file is malformed; ignoring it")
        return None

    # A cache built from a different dataset would silently yield wrong features.
    if (_lengths_differ(cached["words_train"], data_train)
            or _lengths_differ(cached["words_test"], data_test)):
        print("Cache file does not match the supplied data; ignoring it")
        return None

    return cached


def preprocess_data(data_train, data_test, labels_train, cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available.

    NOTE(review): this definition shadows the earlier single-argument `preprocess_data`
    (title/text merge) defined above in this notebook; re-running that earlier cell's
    call after this cell has executed will fail. Consider renaming one of them.

    Parameters
    ----------
    data_train, data_test : iterable of str
        Raw reviews; each item is passed to `review_to_words`.
    labels_train :
        Training labels; stored in the cache next to the words and returned unchanged
        when the words are computed here.
    cache_dir : str
        Directory holding the cache file (created on write if missing).
    cache_file : str or None
        Cache file name inside `cache_dir`; None disables both reading and writing.

    Returns
    -------
    tuple
        (words_train, words_test, labels_train). On a cache hit all three values come
        from the cache file, including the labels.
    """
    print(cache_file)

    cache_path = None
    cache_data = None
    if cache_file is not None:
        cache_path = os.path.join(cache_dir, cache_file)
        cache_data = _read_word_cache(cache_path, data_train, data_test)
        if cache_data is not None:
            print("Read preprocessed data from cache file:", cache_file)

    if cache_data is not None:
        print('Unpack data loaded from cache file')
        return (cache_data['words_train'], cache_data['words_test'],
                cache_data['labels_train'])

    # Cache missing or unusable: do the heavy lifting.
    print('Preprocess training and test data to obtain words for each review')
    words_train = [review_to_words(review) for review in data_train]
    words_test = [review_to_words(review) for review in data_test]

    # Write to cache file for future runs.
    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train)
        # The module-level makedirs only covers the default directory.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train

In [56]:
# Tokenize and stem every review, or load the result from the pickle cache.
# NOTE(review): this rebinds train_X / test_X / train_y, so the cell is not idempotent —
# on a re-run without a usable cache, the token lists produced here would be passed
# back into review_to_words in place of the raw review strings.
train_X, test_X, train_y = preprocess_data(train_X, test_X, train_y)

preprocessed_data.pkl
Read preprocessed data from cache file: preprocessed_data.pkl
Unpack data loaded from cache file


In [57]:
# Build the word -> integer-id vocabulary from the tokenized training reviews
# (build_dict's default vocab_size of 5000 applies).
word_dict = build_dict(train_X)

In [58]:
# Show the five most frequent tokens in the tokenized training set.
# np.concatenate flattens the per-review token lists into one array before counting.
word_counts = Counter(np.concatenate( train_X, axis=0 ))
# Most frequent first.
sorted_words = sorted(word_counts, key=word_counts.get, reverse=True)
print(sorted_words[0:5])
# NOTE(review): 'movi' (the stem of "movie") looks like a probe left over from a
# movie-review version of this notebook — confirm whether it is still wanted.
print(word_counts['movi'])

['tast', 'product', 'like', 'flavor', 'brand']
1


In [59]:
# Persist the vocabulary next to the preprocessing cache as a pickle file.
with open(os.path.join(cache_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [60]:
def convert_and_pad(word_dict, sentence, pad=500):
    """Encode one tokenized sentence as exactly `pad` integer ids.

    Words found in `word_dict` are replaced by their id, all other words by 1
    ('infrequent'). Sentences longer than `pad` are truncated; shorter ones are
    right-padded with 0 ('no word').

    Returns a tuple (ids, length) where `length` is the number of real tokens kept,
    i.e. min(len(sentence), pad).
    """
    NOWORD = 0  # filler id for positions past the end of the sentence
    INFREQ = 1  # id for words that are not in word_dict

    kept_tokens = sentence[:pad]
    encoded = [word_dict.get(token, INFREQ) for token in kept_tokens]
    filler = [NOWORD] * (pad - len(encoded))

    return encoded + filler, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=500):
    """Encode every sentence in `data` with convert_and_pad.

    Returns a tuple of two numpy arrays: the padded id sequences (one row per
    sentence) and the corresponding real-token lengths.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]

    id_rows = [ids for ids, _ in encoded_pairs]
    token_counts = [count for _, count in encoded_pairs]

    return np.array(id_rows), np.array(token_counts)

In [61]:
# Replace the token lists with fixed-length integer id arrays (default pad=500) and keep
# the real-token length of every review alongside.
# NOTE(review): train_X / test_X are rebound from token lists to id arrays here, so this
# cell should not be re-run without re-running the preprocessing cell first.
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

In [62]:
# Every encoded review has the padded length (500 by default).
print(len(train_X[0]))
# Number of real tokens in the first review; the remaining positions are zero padding.
print(train_X_len[0:1])

500
[11]


In [63]:
class LSTMClassifier(nn.Module):
    """
    Embedding -> LSTM -> linear -> sigmoid classifier that emits one score in (0, 1)
    per input row.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """
        Create the layers.

        embedding_dim: size of each word embedding vector.
        hidden_dim:    size of the LSTM hidden state.
        vocab_size:    number of distinct token ids (id 0 is the padding id).
        """
        super().__init__()

        # Id 0 is padding: its embedding is fixed at zero and receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Default nn.LSTM layout is (seq_len, batch, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()

        # Placeholder attribute; it is not read anywhere inside this class.
        self.word_dict = None

    def forward(self, x):
        """
        Run the model on a 2-D integer tensor with one row per sample, where the first
        column holds the sequence length and the remaining columns hold the token ids.

        Returns the sigmoid of the linear output taken at each sample's last real
        time step (index length - 1), with size-1 dimensions squeezed away.
        """
        # Transpose so that time is the leading dimension, as the LSTM expects.
        time_major = x.t()
        lengths = time_major[0, :]
        token_ids = time_major[1:, :]

        hidden_states, _ = self.lstm(self.embedding(token_ids))
        scores = self.dense(hidden_states)

        # For every sample in the batch, pick the score at its final real token.
        batch_positions = range(len(lengths))
        final_scores = scores[lengths - 1, batch_positions]
        return self.sig(final_scores.squeeze())

In [64]:
# Write one row per training review to sample_train.csv, without header or index column:
# label, real-token length, then the padded token ids.
# NOTE(review): pd.concat(axis=1) aligns rows on the index; this assumes train_y carries
# a default 0..n-1 index matching the freshly built frames — TODO confirm.
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1).to_csv('sample_train.csv', header=False, index=False)