In [1]:
import pandas as pd
import math
csv_loaded = False

In [2]:
def log_likely(thetas_given_negative, thetas_given_positive, phis, string):
    lst = string.split(' ')

    log_y_0 = math.log(phis[4])
    for i in range(len(lst)):
        lst[i] = lst[i].lower()
        if(lst[i] in thetas_given_positive):
            log_y_0 += math.log(thetas_given_positive[lst[i]])


    log_y_1 = math.log(phis[0])
    for i in range(len(lst)):
        lst[i] = lst[i].lower()
        if(lst[i] in thetas_given_negative):
            log_y_1 += math.log(thetas_given_negative[lst[i]])
    
    ss = log_y_0 + log_y_1
    log_y_0 /= ss
    log_y_1 /= ss
    if(log_y_0>log_y_1):
        return 0
    return 4

In [3]:
def calculate_phi(df0):
    phis = {}
    df_positive = df0[df0['Y']==4]
    df_negative = df0[df0['Y']==0]

    phis[0] = df_negative.size/6
    phis[4] = df_positive.size/6
    
    return phis

In [4]:
def calculate_theta_given_y(df1, df2, df3):
    thetas_given_positive = {}
    thetas_given_negative = {}

    for i in range(1,df2['frequency'].size):
        thetas_given_positive[df2['word'][i]] = (1+df2['frequency'][i])/(df1['frequency'].size + df2['frequency'][1:].sum())

    for i in range(1,df3['frequency'].size):
        thetas_given_negative[df3['word'][i]] = (1+df3['frequency'][i])/(df1['frequency'].size + df3['frequency'][1:].sum())
    return thetas_given_positive, thetas_given_negative

In [5]:
def train(df0, df1, df2, df3):
    phis = calculate_phi(df0)
    thetas_given_positive, thetas_given_negative = calculate_theta_given_y(df1, df2, df3)
    return phis, thetas_given_positive, thetas_given_negative

In [6]:
def test(phis, thetas_given_positive, thetas_given_negative, test_file):
    test_result = []
    df_test = pd.read_csv(test_file)
    for i in range(df_test['Y'].size):
        result = log_likely(thetas_given_negative, thetas_given_positive, phis, df_test['text'][i])
        test_result.append(result)
    df_test['Y_test'] = test_result
    
    accuracy = 0
    total = 0
    confusion_martix = [[0,0],[0,0]]
    for i in range(df_test['Y'].size):
        if(df_test['Y'][i]!=2):
            if(df_test['Y'][i]==df_test['Y_test'][i]):
                accuracy += 1
                if(df_test['Y'][i]==0):
                    confusion_martix[0][0] += 1
                else:
                    confusion_martix[1][1] += 1
            
            if(df_test['Y'][i] == 0 and df_test['Y_test'][i] == 4):
                confusion_martix[1][0] += 1
            elif(df_test['Y'][i] == 4 and df_test['Y_test'][i] == 0):
                confusion_martix[0][1] += 1
            total += 1
    return (accuracy/total), confusion_martix

In [10]:
from nltk.tokenize import word_tokenize
def main(train_file, test_file, wordbag_dest, positive_wordbag_dest, negative_wordbag_dest):
    if(csv_loaded==False):
        df0 = pd.read_csv(train_file)
        df1 = pd.read_csv(wordbag_dest)
        df2 = pd.read_csv(positive_wordbag_dest)
        df3 = pd.read_csv(negative_wordbag_dest)
        csv_loaded = True
    phis, thetas_given_positive, thetas_given_negative = train(df0, df1, df2, df3)
    train_accuracy, confusion_matrix_train = test(phis, thetas_given_positive, thetas_given_negative, train_file)
    test_accuracy, confusion_matrix_test = test(phis, thetas_given_positive, thetas_given_negative, test_file)
    print('Training Accuracy is ' + str(train_accuracy*100) + '%')
    print('Testing Accuracy is ' + str(test_accuracy*100) + '%')
    print('Confusion Matrix for Testing Data: \n', confusion_matrix_test)
    print('-ve tweets accuracy: %s' % str(confusion_matrix_test[0][0]/(confusion_matrix_test[0][0]+confusion_matrix_test[1][0])))
    print('+ve tweets accuracy: %s' % str(confusion_matrix_test[1][1]/(confusion_matrix_test[1][1]+confusion_matrix_test[0][1])))

In [11]:
if __name__ == '__main__':
    training_size = 1600000
    data_folder = 'trainingandtestdata'
    train_file = './'+data_folder+'/'+'training.csv'
    test_file = './' + data_folder+'/'+'testing.csv'
    wordbag_dest = './' + data_folder + '/wordbag.csv'
    positive_wordbag_dest = './' + data_folder + '/positive_wordbag.csv'
    negative_wordbag_dest = './' + data_folder + '/negative_wordbag.csv'
    main(train_file, test_file, wordbag_dest, positive_wordbag_dest, negative_wordbag_dest)
    
# Training Accuracy is 64.6699375%
# Testing Accuracy is 66.01671309192201%

Training Accuracy is 68.49875%
Testing Accuracy is 69.91643454038997%
Confusion Matrix for Testing Data: 
 [[136, 67], [41, 115]]
