In [6]:
import numpy as np
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

# Input files holding the class means and the shared covariance for dataset 1,
# and the output locations for the generated train/test splits.
NEG_MEAN_1 = 'data/DS1_m_0.txt'
POS_MEAN_1 = 'data/DS1_m_1.txt'
COV_1 = 'data/DS1_Cov.txt'
TRAIN_DATA = 'generated_data/DS1_train.csv'
TEST_DATA = 'generated_data/DS1_test.csv'

''' Part 1 '''
print('Part 1 - Generating Data Using Dataset 1 Mean and Covariances ... \n')
# Prepare Data
# Column 20 is discarded from every parsed file.
# NOTE(review): presumably an empty column produced by a trailing delimiter
# in the text files -- confirm against the data.
mean0 = pd.read_csv(NEG_MEAN_1, header=None).drop([20], axis=1)
mean1 = pd.read_csv(POS_MEAN_1, header=None).drop([20], axis=1)
cov = pd.read_csv(COV_1, header=None).drop([20], axis=1)

# Generate samples using gaussian distribution
# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0;
# row 0 of each mean frame is used as the mean vector.
# NOTE: no random seed is set in this cell, so each run draws different samples.
data0 = pd.DataFrame(np.random.multivariate_normal(mean0.values[0], cov.values, 2000))
data1 = pd.DataFrame(np.random.multivariate_normal(mean1.values[0], cov.values, 2000))
# Column 20 now carries the class label: 0 for the mean0 class, 1 for the mean1 class.
data0[20] = 0
data1[20] = 1

# Per-class random split: each row is drawn independently and lands in the
# training set when its uniform draw is below 0.7, otherwise in the test set.
msk = np.random.rand(len(data0)) < 0.7
train_data0, test_data0 = data0.loc[msk], data0.loc[~msk]
msk = np.random.rand(len(data1)) < 0.7
train_data1, test_data1 = data1.loc[msk], data1.loc[~msk]

# Stack the two classes (class 0 rows first) and renumber rows from 0.
train_data = pd.concat([train_data0, train_data1], ignore_index=True)
test_data = pd.concat([test_data0, test_data1], ignore_index=True)

# Persist both splits as CSV.
train_data.to_csv(TRAIN_DATA)
test_data.to_csv(TEST_DATA)

print('Training Data Generated. See: {}'.format(TRAIN_DATA))
print('Testing Data Generated. See: {} \n'.format(TEST_DATA))

# Keep the label column (20) aside before it is removed from the feature frames.
train_output = train_data[20]
test_output = test_data[20]

# Per-class training features: select rows by label, then strip the label column.
train_data0 = train_data[train_data[20] == 0].drop([20], axis=1)
train_data1 = train_data[train_data[20] == 1].drop([20], axis=1)

# Feature-only views of the full train and test sets.
test_data = test_data.drop([20], axis=1)
train_data = train_data.drop([20], axis=1)


Part 1 - Generating Data Using Dataset 1 Mean and Covariances ... 

Training Data Generated. See: generated_data/DS1_train.csv
Testing Data Generated. See: generated_data/DS1_test.csv 



In [7]:
''' Part 2 '''
print('Part 2 - LDA Model Using Maximum Likelihood Approach \n')

# Class priors: fraction of training rows belonging to each class.
prob0 = float(len(train_data0)) / float(len(train_data0) + len(train_data1))
prob1 = 1.0 - prob0

# Per-class sample means (one entry per feature column).
mean0 = np.array(train_data0.mean())
mean1 = np.array(train_data1.mean())

# Shared covariance: within-class scatter of both classes summed and divided
# by the total number of training rows.
diff0 = np.array(train_data0 - mean0)
diff1 = np.array(train_data1 - mean1)
cov = (np.matmul(diff0.T, diff0) + np.matmul(diff1.T, diff1)) / float(len(train_data0) + len(train_data1))

# Pseudo-inverse of the covariance, computed once and reused for both coefficients.
cov_inv = np.linalg.pinv(cov)

# Linear discriminant score(x) = w1 . x + w0.
# w1 is built from (mean0 - mean1) and w0 from log(prob0) - log(prob1),
# so a positive score favours class 0 and a negative score favours class 1.
w0 = math.log(prob0) - math.log(prob1) - 0.5 * (
    np.matmul(np.matmul(mean0, cov_inv), mean0)
    - np.matmul(np.matmul(mean1, cov_inv), mean1)
)
w1 = np.matmul(cov_inv, mean0 - mean1)
print("w0: ", w0, '\n')
print("w1: " + str(list(w1)) + "\n")

# Discriminant score for every test row.
pred_output = np.matmul(test_data, w1) + w0

# Turn scores into labels in place: strictly positive scores become 0,
# strictly negative scores become 1. Both masks are taken from the raw
# scores before any value is overwritten.
scored_positive = pred_output > 0
scored_negative = pred_output < 0
pred_output[scored_positive] = 0
pred_output[scored_negative] = 1

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Compute confusion matrix
# Compare by position on plain arrays so the result does not depend on the
# index labels of `test_output`.
actual = np.asarray(test_output)
predicted = np.asarray(pred_output)
said_one = predicted == 1
said_zero = predicted == 0
correct = predicted == actual

# Class 1 is treated as the positive class.
tp = int(np.sum(said_one & correct))
fp = int(np.sum(said_one & ~correct))
fn = int(np.sum(said_zero & ~correct))
tn = int(np.sum(said_zero & correct))

# Layout: row 0 = predicted 1 -> [tp, fp]; row 1 = predicted 0 -> [fn, tn].
confusion = [[tp, fp], [fn, tn]]

# Compute result
# Each ratio falls back to 0.0 instead of raising ZeroDivisionError when its
# denominator is zero (e.g. class 1 is never predicted).
accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f = _safe_ratio(2 * precision * recall, precision + recall)

print('F Measure: ', f)
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall, '\n')

Part 2 - LDA Model Using Maximum Likelihood Approach 

w0:  29.259797254602912 

w1: [15.394931289213986, -9.185646705696398, -5.928089971809898, -3.555251944730623, -10.344634497147682, -4.871682722537692, 18.2891256442551, -25.555524497556632, -31.21481606951889, 9.897539968625427, -14.158893375852987, -13.094090326307377, 16.553401531941205, 13.964014770335815, -5.956016064445741, 13.748578337520332, 31.633298731145825, -7.240627737286969, -0.832813528318292, -5.227616321886086]

F Measure:  0.9468267581475129
Accuracy:  0.948376353039134
Precision:  0.9419795221843004
Recall:  0.9517241379310345 

