In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import csv
import sys

from sklearn.metrics import mean_squared_error, accuracy_score
from generative_model import GenerativeModel as MyGM

In [2]:
# Locations of the pre-formatted CSV inputs.
train_x_csv_path, train_y_csv_path, test_x_csv_path = (
    'data/format/X_train',
    'data/format/Y_train',
    'data/format/X_test',
)

# Parse each file into a DataFrame using pandas' default CSV settings.
train_x_df = pd.read_csv(train_x_csv_path)
train_y_df = pd.read_csv(train_y_csv_path)
test_x_df = pd.read_csv(test_x_csv_path)

# Underlying NumPy arrays of the three frames.
train_x, train_y, test_x = train_x_df.values, train_y_df.values, test_x_df.values

In [3]:
# Number of feature columns the formatted dataset is assumed to have.
dim = 106

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), kept strictly inside (0, 1).

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s); evaluated elementwise.

    Returns
    -------
    numpy scalar or ndarray
        Logistic value(s), clipped to the closed interval
        [0.00000000000001, 0.99999999999999].
    """
    # Bound the exponent before calling np.exp so it can never overflow.
    # For |z| >= 40 the logistic value already lies outside the clip range
    # applied below, so the returned numbers are exactly what the unbounded
    # formula produced -- only the overflow RuntimeWarning disappears.
    z = np.clip(z, -40.0, 40.0)
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 0.00000000000001, 0.99999999999999)


def predict(X_test, mu1, mu2, shared_sigma, N1, N2):
    """Score samples with the linear rule derived from two Gaussians sharing a covariance.

    Evaluates ``sigmoid(w . x + b)`` for the samples in ``X_test`` where,
    with ``P`` the pseudo-inverse of ``shared_sigma``::

        w = (mu1 - mu2) P
        b = -0.5 * mu1 P mu1 + 0.5 * mu2 P mu2 + log(N1 / N2)

    Parameters
    ----------
    X_test : ndarray
        Samples to score; assumed to be laid out as (n_samples, n_features)
        -- it is transposed before the dot product with ``w``.
    mu1, mu2 : ndarray
        Mean vectors of the two classes.
    shared_sigma : ndarray
        Covariance matrix shared by both classes.
    N1, N2 : int
        Sample counts of the two classes; their ratio gives the prior term.

    Returns
    -------
    ndarray
        ``sigmoid`` of the linear score, one value per sample.
    """
    precision = np.linalg.pinv(shared_sigma)
    weights = np.dot((mu1 - mu2), precision)

    # The bias is assembled from the two quadratic terms and the log prior
    # ratio, summed in this order.
    quad_term1 = (-0.5) * np.dot(np.dot(mu1, precision), mu1)
    quad_term2 = (0.5) * np.dot(np.dot(mu2, precision), mu2)
    log_prior_ratio = np.log(float(N1) / N2)
    bias = quad_term1 + quad_term2 + log_prior_ratio

    return sigmoid(np.dot(weights, X_test.T) + bias)


def train(X_train, Y_train):
    """Estimate two class-conditional Gaussians that share one covariance matrix.

    Rows whose label equals 1 form the first class; all other rows form the
    second class.

    Parameters
    ----------
    X_train : ndarray, shape (n_samples, n_features)
        Training samples, one per row. The feature count is read from this
        array rather than from a module-level constant.
    Y_train : ndarray
        One label per row of ``X_train``; either shape (n_samples,) or
        (n_samples, 1) works because each entry is compared with 1.

    Returns
    -------
    tuple
        ``(mu1, mu2, shared_sigma, cnt1, cnt2)``: the two class means, the
        class-frequency-weighted average of the two per-class covariance
        matrices, and the two class sizes.

    Raises
    ------
    ValueError
        If either class has no samples (its mean and covariance would be
        undefined and every downstream value would become NaN).
    """
    train_data_size = X_train.shape[0]
    n_features = X_train.shape[1]

    # Pass 1: per-class sums and counts. The explicit row-by-row accumulation
    # is kept on purpose so the floating-point summation order is stable.
    cnt1 = 0
    cnt2 = 0
    mu1 = np.zeros((n_features,))
    mu2 = np.zeros((n_features,))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            mu1 += X_train[i]
            cnt1 += 1
        else:
            mu2 += X_train[i]
            cnt2 += 1

    if cnt1 == 0 or cnt2 == 0:
        raise ValueError(
            "train() needs at least one sample of each class "
            f"(got {cnt1} with label 1 and {cnt2} with any other label)"
        )

    mu1 /= cnt1
    mu2 /= cnt2

    # Pass 2: per-class covariance as the mean outer product of the centred rows.
    sigma1 = np.zeros((n_features, n_features))
    sigma2 = np.zeros((n_features, n_features))
    for i in range(train_data_size):
        if Y_train[i] == 1:
            centred = X_train[i] - mu1
            sigma1 += np.outer(centred, centred)
        else:
            centred = X_train[i] - mu2
            sigma2 += np.outer(centred, centred)
    sigma1 /= cnt1
    sigma2 /= cnt2

    # Weight each class covariance by that class's share of the training set.
    shared_sigma = (float(cnt1) / train_data_size) * sigma1 + (float(cnt2) / train_data_size) * sigma2
    return (mu1, mu2, shared_sigma, cnt1, cnt2)

def ta_predict():
    """Run the reference pipeline on the notebook's global data.

    Fits the model with ``train`` on ``train_x`` / ``train_y``, prints the
    accuracy of the rounded predictions on the training set, and returns
    the rounded predictions for ``test_x``.

    Returns
    -------
    ndarray
        ``predict`` output for ``test_x`` after ``np.around``.
    """
    # (mu1, mu2, shared_sigma, N1, N2), forwarded unchanged to predict().
    params = train(train_x, train_y)

    train_labels = np.around(predict(train_x, *params))
    acc = round(accuracy_score(train_y, train_labels), 6)
    print(f"ta train acc = {acc}")

    return np.around(predict(test_x, *params))

In [4]:
# Run the reference pipeline (it prints its training accuracy) and keep the
# rounded test-set predictions it returns.
ta_pred = ta_predict()

ta train acc = 0.842173


In [5]:
# Show the reference predictions.
ta_pred

array([0., 0., 0., ..., 1., 0., 1.])

In [6]:
# Fit the project's GenerativeModel on the same training arrays and predict
# the same test set, then show the predictions.
my_gm = MyGM()
my_gm.fit(train_x, train_y)
my_pred = my_gm.predict(test_x)
my_pred

array([0., 0., 0., ..., 1., 0., 1.])

In [7]:
# True only when both prediction arrays have the same shape and elements.
np.array_equal(my_pred, ta_pred)

True

In [8]:
# Number of positions at which the two prediction sequences disagree
# (pairs are formed with zip, so comparison stops at the shorter sequence).
count = sum(1 for mine, reference in zip(my_pred, ta_pred) if mine != reference)
count

0

In [9]:
# Accuracy of my_gm's predictions on the training set, rounded to 6 decimals.
round(accuracy_score(train_y, my_gm.predict(train_x)), 6)

0.842173