# IMDB Sentiment Analysis

This notebook uses the Kaggle dataset "IMDB Dataset of 50K Movie Reviews" (https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews).

1. Import dataset
2. Data preparation
3. Creation of training and test dataset
4. Model training
5. Save model file

In [None]:
import pandas as pd
import numpy as np
import pickle
import nltk
import keras

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

## 1.0 - Import dataset

In [None]:
# NLTK resources needed below: the English stop-word list and the 'punkt'
# tokenizer models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()

# Only the first MAX_REVIEWS rows are used. Passing `nrows` makes pandas stop
# parsing after that many rows instead of loading the whole file and then
# discarding the rest with .head().
MAX_REVIEWS = 30000
df = pd.read_csv('../datasets/imdb-dataset.csv', delimiter=',', nrows=MAX_REVIEWS)

## 2.0 - Data preparation
Data preparation using the following Text Feature Engineering techniques:

1. Tokenization
2. Stop-word removal
3. Stemming text (porter)
4. Joining words (tokens) into a single string

In [None]:
def identify_tokens(row):
    """Split the raw text of a row into alphabetic word tokens.

    Args:
        row (pandas.Series or list): one row of the dataframe; the raw text
            is read from the first field (position 0).

    Returns:
        list: tokens returned by ``word_tokenize`` that consist only of
            alphabetic characters (``str.isalpha``); other tokens, such as
            numbers and punctuation, are dropped.
    """
    # Rows produced by ``DataFrame.apply(axis=1)`` are Series indexed by column
    # label. Integer keys on such a Series rely on a positional fallback that
    # pandas has deprecated, so ask for position 0 explicitly. Plain
    # lists/tuples are still supported.
    source = row.iloc[0] if hasattr(row, "iloc") else row[0]
    return [token for token in word_tokenize(source) if token.isalpha()]

In [None]:
def remove_stops(row):
    """Remove stop words from the token list of a row.

    Args:
        row (pandas.Series or list): one row of the dataframe; the token list
            is read from the third field (position 2).

    Returns:
        list: the tokens that are not stop words, in their original order and
            original casing.
    """
    # Explicit positional access: integer keys on a label-indexed Series rely
    # on a deprecated pandas fallback. Plain lists/tuples are still supported.
    tokens = row.iloc[2] if hasattr(row, "iloc") else row[2]

    # Set membership is O(1) per token; the module-level list would be
    # scanned linearly for every token.
    stop_set = set(stop_words)

    # Tokens have not been lowercased at this stage, so compare the lowercase
    # form; otherwise capitalised stop words ("The", "This") slip through.
    return [w for w in tokens if w.lower() not in stop_set]

In [None]:
def stem_porter(row):
    """Apply Porter stemming to every token of a row.

    Args:
        row (pandas.Series or list): one row of the dataframe; the token list
            is read from the third field (position 2).

    Returns:
        list: the stemmed tokens, in the same order as the input tokens.
    """
    # Explicit positional access: integer keys on a label-indexed Series rely
    # on a deprecated pandas fallback. Plain lists/tuples are still supported.
    tokens = row.iloc[2] if hasattr(row, "iloc") else row[2]
    return [porter_stemmer.stem(word) for word in tokens]

In [None]:
def rejoin_words(row):
    """Join the tokens of a row into a single space-separated string.

    Args:
        row (pandas.Series or list): one row of the dataframe; the token list
            is read from the third field (position 2).

    Returns:
        str: the tokens joined by single spaces (empty string for no tokens).
    """
    # Explicit positional access: integer keys on a label-indexed Series rely
    # on a deprecated pandas fallback. Plain lists/tuples are still supported.
    tokens = row.iloc[2] if hasattr(row, "iloc") else row[2]
    return " ".join(tokens)

In [None]:
def pre_processing(df):
    """Run the text feature engineering (TFE) steps on a dataframe.

    The steps run in order, each applied row by row: tokenization, stop-word
    removal and Porter stemming (each stored in the 'text1' column), then
    joining the tokens back into a string (stored in 'clean_text').

    Args:
        df (dataframe): dataframe to process; the columns are added to it.

    Returns:
        df: the same dataframe, with the 'text1' and 'clean_text' columns set.
    """
    # (progress message, destination column, row-wise step), in run order.
    stages = (
        ('Tokenization ...', 'text1', identify_tokens),
        ('Removing stop words ...', 'text1', remove_stops),
        ('Stemming (porter) ...', 'text1', stem_porter),
        ('Joining words ...', 'clean_text', rejoin_words),
    )

    for message, column, step in stages:
        print(message)
        df[column] = df.apply(step, axis=1)

    return df

In [None]:
# Run the whole text-preparation pipeline, then normalise the case of the
# cleaned text.
df = pre_processing(df)
df['clean_text'] = df['clean_text'].str.lower()

# Model input (cleaned review text) and target (sentiment label).
X, Y = df['clean_text'], df['sentiment']

## 3.0 - Creation of training and test dataset
NOTE: The data is split into a test dataset (20%) and a training dataset (80%), stratified on the sentiment label so both splits keep the same class balance.

In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class
# proportions equal in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    test_size=0.2,
    random_state=48,
)

# TF-IDF over 2- and 3-word n-grams, with sublinear term-frequency scaling,
# capped at 10000 features.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(2, 3),
    sublinear_tf=True,
)

NOTE: Machine learning and deep learning models require numeric input, so the TF-IDF text feature engineering (TFE) process is used to transform the texts into vectors.

In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Encode the sentiment labels as integers; the encoder is fitted on the
# training labels and then applied unchanged to the test labels.
le = preprocessing.LabelEncoder()
Y_train_le = le.fit_transform(list(Y_train))
Y_test_le = le.transform(list(Y_test))

# Both are shape tuples: num_class[0] is the number of distinct labels in Y,
# input_shape[1] is the number of TF-IDF features.
input_shape = X_train_tf.shape
num_class = Y.value_counts().shape

## 4.0 - Model training

In [None]:
from keras import layers
from keras import models
from keras.utils import to_categorical

# One-hot encode the integer labels for the softmax output layer.
Y_test_label_keras = to_categorical(Y_test_le)
Y_train_label_keras = to_categorical(Y_train_le)

In [None]:
# Feed-forward classifier over the TF-IDF vectors: three small dense layers,
# each followed by 40% dropout, then a softmax over the classes.
network = models.Sequential()

# Input width is the number of TF-IDF features.
network.add(layers.Dense(2, activation='relu', input_shape=(input_shape[1], )))
network.add(layers.Dropout(0.4))

network.add(layers.Dense(5, activation='relu'))
network.add(layers.Dropout(0.4))

network.add(layers.Dense(5, activation='sigmoid'))
network.add(layers.Dropout(0.4))

# One output unit per class; softmax yields a probability distribution.
network.add(layers.Dense(num_class[0], activation='softmax'))

# The targets are one-hot vectors and the head is a softmax over
# num_class[0] classes, so categorical cross-entropy is the matching loss.
# 'binary_crossentropy' treats each output unit as an independent binary
# problem, which reports misleading accuracy if there are more than two
# classes.
network.compile(optimizer='adamax',
                loss="categorical_crossentropy",
                metrics=['accuracy'])

network.summary()

In [None]:
# Train on the densified TF-IDF matrix for 50 epochs; Keras holds out 30% of
# the training data for validation.
network.fit(
    x=X_train_tf.toarray(),
    y=Y_train_label_keras,
    epochs=50,
    validation_split=0.3,
    verbose=1,
)

## 5.0 - Save model file

In [None]:
# Persist the trained network together with the fitted vectorizer.
network.save('../models/neural_network.h5')

# Use a context manager so the file is flushed and closed even if pickling
# fails; a bare open() passed to pickle.dump leaves the handle open.
with open('../models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)