In [27]:
import numpy as np
import pandas as pd 
import re
from collections import Counter
import string
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
    
#text preprocessing functions:
def text_cleaning(text):
    #to lower case
    text = text.lower()
    #remove numbers
    text = re.sub(r'\d+', '', text)
    #remove puctuations
    text = text.translate(str.maketrans("", "", string.punctuation))
    #remove extra spaces
    text = re.sub(' +', ' ', text)
    return text

def text_preprocessing(df):
    #text_cleaning
    df['review'] = df['review'].apply(text_cleaning)
    #tokenization
    df['review'] = df.apply(lambda x: word_tokenize(x['review']), axis = 1)
    #remove stopwords
    df['review'] = df.apply(lambda x: [i for i in x['review'] if not i in stopwords], axis = 1)
    return df

def text_stemming(df):
    df = text_preprocessing(df)
    #stemming
    stemmer = SnowballStemmer(language = 'english')
    df['review'] = df.apply(lambda x: [stemmer.stem(i) for i in x['review']], axis = 1)
    return df

def text_lemmatize(df):
    df = text_preprocessing(df)
    #lemmatize
    lemmatizer = WordNetLemmatizer()
    df['review'] = df.apply(lambda x: [lemmatizer.lemmatize(i) for i in x['review']], axis = 1)
    return df


#naive bayes classifier functions:
def add_k_word_score(word, k, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg):
    if word in V_dict:
        P_word_pos = (pos_dict.get(word, 0) + k / (N_word_pos + V_size * k))
        P_word_neg = (neg_dict.get(word, 0) + k / (N_word_neg + V_size * k))
        return np.log(P_word_pos / P_word_neg)
    else:
        P_word_pos = (k / (N_word_pos + V_size * k))
        P_word_neg = (k / (N_word_neg + V_size * k))
        return np.log(P_word_pos / P_word_neg) 
                      
def add_k_score(text, k, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg):
    score = 0
    for word in text:
        score += add_k_word_score(word, k, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg)
    return(score)

def add_k_smoothing_naive_bayes_classifier(train_set, test_set, k):
    #Parameters for the classifier
    #all unique words and its counts from all document
    V_dict = Counter([item for sublist in train_set['review'].values.tolist() for item in sublist]) 
    #number of unique words across all document
    V_size = len(V_dict.keys())
    #number of document from each class
    N_doc_pos = sum(train_set.label == 1)
    N_doc_neg = sum(train_set.label == 0)
    #probability of document from each class
    P_c_pos = N_doc_pos / len(train_set)
    P_c_neg = N_doc_neg / len(train_set)
    #unique words and its counts from each class
    pos_dict = Counter([item for sublist in train_set.loc[train_set.label == 1]['review'].values.tolist() for item in sublist])
    neg_dict = Counter([item for sublist in train_set.loc[train_set.label == 0]['review'].values.tolist() for item in sublist])
    #number of unique words from each class
    N_word_pos = len(pos_dict.keys())
    N_word_neg = len(neg_dict.keys())
    
    test_set['score'] = test_set.review.apply(lambda x: add_k_score(x, k, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg))
    test_set['prediction'] = test_set.score.apply(lambda x: int(x > 0))
    
    return test_set

def interpolated_word_score(word, l, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg):
    if word in V_dict:
        P_word_pos = (1 - l) * (pos_dict.get(word, 0)/ N_word_pos) + l * (V_dict.get(word, 0)/ V_size)
        P_word_neg = (1 - l) * (neg_dict.get(word, 0)/ N_word_neg) + l * (V_dict.get(word, 0)/ V_size)
        return np.log(P_word_pos / P_word_neg)
    else:
        P_word_pos = N_word_pos / V_size
        P_word_neg = N_word_neg / V_size
        return np.log(P_word_pos / P_word_neg)
                      
def interpolated_score(text, l, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg):
    score = 0
    for word in text:
        score += interpolated_word_score(word, l, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg)
    return(score)

def interpolated_smoothing_naive_bayes_classifier(train_set, test_set, l):
    #Parameters for the classifier
    #all unique words and its counts from all document
    V_dict = Counter([item for sublist in train_set['review'].values.tolist() for item in sublist]) 
    #number of unique words across all document
    V_size = len(V_dict.keys())
    #number of document from each class
    N_doc_pos = sum(train_set.label == 1)
    N_doc_neg = sum(train_set.label == 0)
    #probability of document from each class
    P_c_pos = N_doc_pos / len(train_set)
    P_c_neg = N_doc_neg / len(train_set)
    #unique words and its counts from each class
    pos_dict = Counter([item for sublist in train_set.loc[train_set.label == 1]['review'].values.tolist() for item in sublist])
    neg_dict = Counter([item for sublist in train_set.loc[train_set.label == 0]['review'].values.tolist() for item in sublist])
    #number of unique words from each class
    N_word_pos = len(pos_dict.keys())
    N_word_neg = len(neg_dict.keys())
    
    test_set['score'] = test_set.review.apply(lambda x: interpolated_score(x, l, V_dict, V_size, pos_dict, neg_dict, N_word_pos, N_word_neg))
    test_set['prediction'] = test_set.score.apply(lambda x: int(x > 0))
    
    return test_set


#F1 measure functions:
def recall(df):
    return len(df[(df['label'] == 1) & (df['prediction'] == 1)]) / len(df[(df['label'] == 1)])

def precision(df):
    return len(df[(df['label'] == 1) & (df['prediction'] == 1)]) / len(df[(df['prediction'] == 1)])
    
def f1score(df):
    return (2 * recall(df) * precision(df)) / (recall(df) + precision(df))

In [28]:
if __name__ == '__main__':

    #reading the csv file as datafarme object with relative path
    fname = 'imdb_master.csv'
    df = pd.read_csv(fname, encoding='latin1')

    #exclude all entries without being labelled pos or neg
    df = df[df['label'] != 'unsup']
    
    #mapping the label pos as 1 and neg as 0
    df['label'] = df.label.map({'pos': 1, 'neg': 0})
    
    #the below line inspected the number of entries with each label, which is 25,000 entries for both label
    #print(df['label'].value_counts())

    #splitting the training and testing data
    train_set = df[df['type'] == 'train']
    test_set = df[df['type'] == 'test']

    #dropping unhelpful columns
    train_set = train_set.drop(columns= ['type', 'Unnamed: 0', 'file', 'file'])
    test_set = test_set.drop(columns= ['type', 'Unnamed: 0', 'file', 'file'])

    stopwords = stopwords.words('english')
    stopwords.append('<br />')
    
    #preprocess both dataset without stemming or lemmatization
    train_set_cleaned = text_preprocessing(train_set.copy())
    test_set_cleaned = text_preprocessing(test_set.copy())
    
    #preprocess both dataset with stemming
    train_set_stemming = text_stemming(train_set.copy())  
    test_set_stemming = text_stemming(test_set.copy())
    
    #preprocess both dataset with stemming
    train_set_lemmatize = text_lemmatize(train_set.copy())  
    test_set_lemmatize = text_lemmatize(test_set.copy())
    
    #Without stemming or lemmatization
    f1score_clean = []
    #add 1 & add 10 smoothing
    clean_result_add_1 = add_k_smoothing_naive_bayes_classifier(train_set_cleaned, test_set_cleaned, 1)
    f1score_clean.append(f1score(clean_result_add_1))
    clean_result_add_10 = add_k_smoothing_naive_bayes_classifier(train_set_cleaned, test_set_cleaned, 10)
    f1score_clean.append(f1score(clean_result_add_10))

    #interpolation smoothing with lambda = 0.1, 0.5, 0.9
    clean_result_inter_1 = interpolated_smoothing_naive_bayes_classifier(train_set_cleaned, test_set_cleaned, 0.1)
    f1score_clean.append(f1score(clean_result_inter_1))
    clean_result_inter_2 = interpolated_smoothing_naive_bayes_classifier(train_set_cleaned, test_set_cleaned, 0.6)
    f1score_clean.append(f1score(clean_result_inter_2))
    clean_result_inter_3 = interpolated_smoothing_naive_bayes_classifier(train_set_cleaned, test_set_cleaned, 0.9)
    f1score_clean.append(f1score(clean_result_inter_3))


    #Stemming
    f1score_stemming = []
    #add 1 & add 10 smoothing
    stem_result_add_1 = add_k_smoothing_naive_bayes_classifier(train_set_stemming, test_set_stemming, 1)
    f1score_stemming.append(f1score(stem_result_add_1))
    stem_result_add_10 = add_k_smoothing_naive_bayes_classifier(train_set_stemming, test_set_stemming, 10)
    f1score_stemming.append(f1score(stem_result_add_10))

    #interpolation smoothing with lambda = 0.1, 0.5, 0.9
    stem_result_inter_1 = interpolated_smoothing_naive_bayes_classifier(train_set_stemming, test_set_stemming, 0.1)
    f1score_stemming.append(f1score(stem_result_inter_1))
    stem_result_inter_2 = interpolated_smoothing_naive_bayes_classifier(train_set_stemming, test_set_stemming, 0.6)
    f1score_stemming.append(f1score(stem_result_inter_2))
    stem_result_inter_3 = interpolated_smoothing_naive_bayes_classifier(train_set_stemming, test_set_stemming, 0.9)
    f1score_stemming.append(f1score(stem_result_inter_3))


    #Lemmatization
    f1score_lemma = []
    #add 1 & add 10 smoothing
    lemma_result_add_1 = add_k_smoothing_naive_bayes_classifier(train_set_lemmatize, test_set_lemmatize, 1)
    f1score_lemma.append(f1score(lemma_result_add_1))
    lemma_result_add_10 = add_k_smoothing_naive_bayes_classifier(train_set_lemmatize, test_set_lemmatize, 10)
    f1score_lemma.append(f1score(lemma_result_add_10))

    #interpolation smoothing with lambda = 0.1, 0.5, 0.9
    lemma_result_inter_1 = interpolated_smoothing_naive_bayes_classifier(train_set_lemmatize, test_set_lemmatize, 0.1)
    f1score_lemma.append(f1score(lemma_result_inter_1))
    lemma_result_inter_2 = interpolated_smoothing_naive_bayes_classifier(train_set_lemmatize, test_set_lemmatize, 0.6)
    f1score_lemma.append(f1score(lemma_result_inter_2))
    lemma_result_inter_3 = interpolated_smoothing_naive_bayes_classifier(train_set_lemmatize, test_set_lemmatize, 0.9)
    f1score_lemma.append(f1score(lemma_result_inter_3))    

    results = {'Add 1 smoothing':[f1score_clean[0], f1score_stemming[0], f1score_lemma[0]], 
               'Add 10 smoothing':[f1score_clean[1], f1score_stemming[1], f1score_lemma[1]], 
               'Interpolation lambda = 0.1': [f1score_clean[2], f1score_stemming[2], f1score_lemma[2]], 
               'Interpolation lambda = 0.6': [f1score_clean[3], f1score_stemming[3], f1score_lemma[3]], 
               'Interpolation lambda = 0.9': [f1score_clean[4], f1score_stemming[4], f1score_lemma[4]]}

    results_df = pd.DataFrame(results, index = ['Without stemming or lemmatization', 'Stemming', 'Lemmatization'])

In [29]:
results_df

Unnamed: 0,Add 1 smoothing,Add 10 smoothing,Interpolation lambda = 0.1,Interpolation lambda = 0.6,Interpolation lambda = 0.9
Without stemming or lemmatization,0.742427,0.745061,0.803281,0.809406,0.814431
Stemming,0.747816,0.75065,0.799134,0.807304,0.809858
Lemmatization,0.744938,0.748013,0.802502,0.809726,0.813995
