In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [754]:
def getData():  
    data = pd.read_csv('data/train.csv')
    X_train = pd.get_dummies(data.iloc[:,:-1], dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)
    Y_train = pd.get_dummies(data.iloc[:,-1], dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)
    
    res = get_dummy_id(X_train)
    dummy_columns = res['dummy_columns']
    numeric_columns = res['numeric_columns']

    scaler = Scaler()
    scaler.fit(X_train, dummy_columns, numeric_columns)
    X_train = scaler.transform(X_train)
    
    X_test = pd.read_csv('data/test.csv')
    X_test = pd.get_dummies(X_test, dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)
#     missing_cols = set(X_train.columns) - set(X_test.columns)

    # Add a missing column in test set with default value equal to 0
#     for c in missing_cols:
#         X_test[c] = 0

    # Ensure the order of column in the test set is in the same order than in train set
    X_test = X_test[X_train.columns]
    X_test = scaler.transform(X_test)

    return ({'X_train' : X_train, 'Y_train': Y_train, 'X_test': X_test} )

In [728]:
class Scaler():
    def fit(self, xss, dummy_columns, numeric_columns):
        
        self.numeric_columns = numeric_columns
        self.dummy_columns = dummy_columns
        
        ## process numeric matrix
        numeric_vector = xss.iloc[:,numeric_columns]
        self.mean = np.mean(numeric_vector)
        self.std = np.std(numeric_vector)
        
        ## process dummy matrix
        dummy_vector = xss.iloc[:,dummy_columns]
        self.proportion = np.mean(dummy_vector)
        
    def transform_dummy(self,xs,proportion):
        trans_xs = [1 - proportion if x == 1 else proportion for x in xs]
        return trans_xs
        
    def transform_numeric(self,xs, mean, std):
        xs = (xs-mean)/std
        return(xs)

    def transform(self, xss):
        row, col = xss.shape
        df = []
        for c in range(col):
            xs = xss.iloc[:,c]
            if c in self.dummy_columns:
                dff = xs
            else:
                idx = self.numeric_columns.index(c)
                mean = self.mean[idx]
                std = self.std[idx]
                dff = self.transform_numeric(xs,mean,std)
                
            df.append(dff)
        df = (np.column_stack(df))
        return(pd.DataFrame(df))
        

In [440]:
def sigmoid(predict):
    return(1/(1 + np.exp(-predict)))

In [725]:
def mean_var(xss):
    mu = np.mean(xss,0)
    var = np.cov(xss.T)     
    return mu, var

def gaussian(xs, mu, cov):
    col = len(xs)
    coeff = 1 / np.sqrt((2 * np.pi) ** col)
    determinant = 1 / np.sqrt(np.linalg.det(cov))
    term = (-1/2) * np.dot((xs-mu).T, np.linalg.inv(cov))
    term = np.dot(term,(xs-mu))
    
    prob = coeff * determinant * np.exp(term)
    return(prob)

In [651]:
def get_dummy_id(X_train):
    df = X_train.describe().T
    row, col = df.shape
    dummy_columns = []
    numeric_columns = []
    for r in range(row):
        if df['max'][r] <= 1 and df['min'][r] >= 0:
            dummy_columns.append(r)
        else:
            numeric_columns.append(r)
    
    return({'dummy_columns': dummy_columns,
            'numeric_columns': numeric_columns})

In [None]:
res = getData()
X_train = res['X_train']
Y_train = res['Y_train']
X_test = res['X_test']

In [731]:
col_0 = np.where(Y == 0)
col_1 = np.where(Y == 1)

X_train = np.array(X_train)
x_0 = X_train[col_0]
x_1 = X_train[col_1]

res = mean_var(x_0)
mu_0 = res[0]
cov_0 = res[1]

res = mean_var(x_1)
mu_1 = res[0]
cov_1 = res[1]

num_0 = len(col_0)
num_1 = len(col_1)
num = num_0 + num_1
cov = (num_0 / num) * cov_0 + (num_1 / num) * cov_1

In [732]:
ans = []
for i in range(len(X_test)):
    x = X_test.iloc[i,:]
    c0 = gaussian(x, mu_0, cov)
    c1 = gaussian(x, mu_1, cov)
    value = 0 if c0 > c1 else 1
    ans.append(value)
np.sum(ans)

16281

In [487]:
class GBN:
    def fit(self,X,Y):
        col_0 = np.where(Y == 0)
        col_1 = np.where(Y == 1)

        self.num_0 = len(col_0)
        self.num_1 = len(col_1)
        self.num = num_0 + num_1

        x_0 = X[col_0]
        x_1 = X[col_1]

        self.mu_0 = np.mean(x_0,axis=0)
        self.mu_1 = np.mean(x_1,axis=0)

        self.var_0 = np.cov(x_0.T)
        self.var_1 = np.cov(x_1.T)
        self.cov = (num_0 / num * self.var_0) + (num_1 / num * self.var_1)

    def predict(self,x):

        w = np.dot((self.mu_0 - self.mu_1).T, np.linalg.inv(self.cov))
        res = np.dot(w,x.T)

        b0 = np.dot(mu_0.T,np.linalg.inv(self.cov))
        aab0 = np.dot(b0,mu_0)
        
        b1 = np.dot(mu_1.T,np.linalg.inv(self.cov))
        b1 = np.dot(b1,mu_1)
        
        z_0 = res - (1/2 * b0) + (1/2 * b1) + np.log(self.num_0 / self.num_1)
        prob_0 = [sigmoid(i) for i in z_0]
        
        return(prob_0) 