In [1]:
import numpy as np
import pandas as pd
import math

In [75]:
def calculate(data, attributes):
    """
    Calculate all the model parameters
    """
    probabilities = {}

    prob_false = len(data[np.where(data[:,-1] == False)])/len(data)
    prob_true = len(data[np.where(data[:,-1] == True)])/len(data)
    true_num = len(data[np.where(data[:,-1] == True)])
    false_num = len(data[np.where(data[:,-1] == False)])
    
    for index, att in enumerate(attributes[:-1]):
        if att in [' # sentences', ' # words']:
            mu_T = np.mean(data[np.where(data[:,-1]==True),index])
            sigma_T = np.var(data[np.where(data[:,-1]==True),index])
            mu_F = np.mean(data[np.where(data[:,-1]==False),index])
            sigma_F = np.var(data[np.where(data[:,-1]==False),index])
            probabilities[att] = np.array([[mu_T, sigma_T], [mu_F, sigma_F]])
        else:
            TT = float(len(data[np.where((data[:,index]==True) & (data[:,-1]==True))]) / true_num)
            TF = float(len(data[np.where((data[:,index]==True) & (data[:,-1]==False))]) / false_num)
            FT = float(len(data[np.where((data[:,index]==False) & (data[:,-1]==True))]) / true_num)
            FF = float(len(data[np.where((data[:,index]==False) & (data[:,-1]==False))]) / false_num)
            probabilities[att] = np.array([[TT, TF], [FT, FF]])
    return prob_true, prob_false, probabilities

In [76]:
def test(prob_true, prob_false, probabilities, test_data, attributes):
    """
    Test the model on given test data
    """
    correct = 0
    total = len(test_data)

    for row in range(len(test_data)):
        output = None
        pred_T = prob_true
        pred_F = prob_false
        for index,col in enumerate(attributes[:-1]):
            if index == 6 or index == 7:
                denom = (2*math.pi*probabilities[col][0,1])**.5
                num = math.exp(-(float(test_data[row,index])-float(probabilities[col][0,0]))**2/(2*probabilities[col][0,1]))
                pred_T *= (num/denom)
                denom = (2*math.pi*probabilities[col][1,1])**.5
                3num = math.exp(-(float(test_data[row,index])-float(probabilities[col][1,0]))**2/(2*probabilities[col][1,1]))
                pred_F *= (num/denom)
            else:
                if test_data[row, index]:
                    pred_T *= probabilities[col][0,0]
                    pred_F *= probabilities[col][0,1]
                else:
                    pred_T *= probabilities[col][1,0]
                    pred_F *= probabilities[col][1,1]
        pred_T /= (pred_T+pred_F)
        pred_F /= (pred_T+pred_F)
        if pred_T > pred_F:
            output = True
        else:
            output = False
        if output == data[row,-1]:
            correct += 1

    return round(correct/total, 2)

In [77]:
'''
Read the train dataset to calculate model parameters
'''
data = pd.read_csv('q3.csv', header=0, index_col=False)
attributes = data.columns
data = data.to_numpy()

# Calculate model parameters by calling calculate()
prob_true, prob_false, probabilities = calculate(data, attributes)

In [79]:
print('Parameters:')
print('True and False probability:', prob_true, prob_false)
print('For each attribute:')
for key,val in probabilities.items():
    print(key,':\n',val)

Parameters:
True and False probability: 0.172 0.828
For each attribute:
in html :
 [[0.75581395 0.58695652]
 [0.24418605 0.41304348]]
 has emoji :
 [[0.19767442 0.147343  ]
 [0.80232558 0.852657  ]]
 sent to list :
 [[0.06976744 0.3115942 ]
 [0.93023256 0.6884058 ]]
 from .com :
 [[0.74418605 0.27536232]
 [0.25581395 0.72463768]]
 has my name :
 [[0.34883721 0.60144928]
 [0.65116279 0.39855072]]
 has sig :
 [[0.6627907 0.3236715]
 [0.3372093 0.6763285]]
 # sentences :
 [[3.97674419 3.7203894 ]
 [6.19082126 6.40078532]]
 # words :
 [[ 68.8372093   79.34559221]
 [ 70.7705314  912.76618474]]


In [81]:
'''
Read the test dataset
'''
test_data = pd.read_csv('q3b.csv', header=0, index_col=False)
attributes = test_data.columns
test_data = test_data.to_numpy()

'''
Run the model on test dataset by calling test()
'''
test_accuracy = test(prob_true, prob_false, probabilities, test_data, attributes)
print('Test Accuracy =', test_accuracy)
print('Loss =', 1-test_accuracy)

Test Accuracy = 0.43
Loss = 0.5700000000000001


In [82]:
"""
Trying different combinations of attributes for getting better performance
"""
def subset(prob_true, prob_false, probabilities, test_data, attributes):
    choices = np.array([[0,1,2],
                       [2,4,5,6],
                       [1,3,5],
                       [3,7]])

    exp_accuracies = []
    for choice in choices:
        print('choice:', choice)
        temp = choice.append(8)
        prob_true, prob_false, probabilities = calculate(data[:, choice], attributes[temp])
        exp_accuracies.append(test(prob_true, prob_false, probabilities, test_data, attributes[temp]))

    max_acc = max(exp_accuracies)
    print('Maximum accuracy,', max_acc, 'for attributes:', choices[exp_accuracies.index(max_acc)])


In [83]:
# calling subset function to get which combination/subset of attributes give better results
subset(prob_true, prob_false, probabilities, test_data, attributes)

choice: [0, 1, 2]
choice: [2, 4, 5, 6]
choice: [1, 3, 5]
choice: [3, 7]
Maximum accuracy, 0.81 for attributes: [0, 1, 2, 8]
