In [277]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
class standardScaler():
    """Column-wise standardiser: subtract the mean and divide by the standard deviation.

    Attributes set by ``fit``:
        mean: per-column mean of the fitting data (``np.mean(..., axis=0)``).
        sd:   per-column population standard deviation (``np.std(..., axis=0)``).
    """

    def fit(self, xss):
        """Learn per-column mean and standard deviation from ``xss``.

        Returns ``self`` so calls can be chained; the statistics are stored
        on the instance as ``mean`` and ``sd``.
        """
        self.mean = np.mean(xss, axis=0)
        self.sd = np.std(xss, axis=0)
        return self

    def transform(self, xss):
        """Return ``(xss - mean) / sd`` using the statistics learned in ``fit``.

        Columns whose fitted standard deviation is exactly zero (constant
        features) are divided by 1 instead, so they come out as
        ``xss - mean`` rather than NaN/inf.
        """
        # Keep self.sd untouched (callers may inspect it); only the divisor is patched.
        scale = np.where(np.asarray(self.sd) == 0, 1.0, self.sd)
        return (xss - self.mean) / scale

In [519]:
def sigmoid(predict):
    """Logistic function ``1 / (1 + e^(-predict))``, applied element-wise.

    ``predict`` may be a scalar or a numpy array of raw scores.
    """
    denominator = 1 + np.exp(-predict)
    return 1 / denominator

def loss(y, yhat):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``yhat``.

    Computes ``-mean(y*log(yhat) + (1-y)*log(1-yhat))`` and returns a scalar.

    Args:
        y: array-like of 0/1 labels.
        yhat: array-like of predicted probabilities, same shape as ``y``.
    """
    y = np.asarray(y)
    # Clip so that log() never sees exactly 0 (which would give -inf / NaN).
    eps = 1e-15
    yhat = np.clip(yhat, eps, 1 - eps)
    # Both terms belong inside the mean; the negation applies to the whole sum.
    return -np.mean(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))

def accuarcy(xss,yss,w):
    """Classification accuracy of logistic-regression weights ``w`` on ``(xss, yss)``.

    Args:
        xss: feature matrix without a bias column; a column of ones is
            prepended here, so ``w[0]`` is the bias weight.
        yss: 0/1 labels, one per row of ``xss``.
        w: weight vector of length ``xss.shape[1] + 1``.

    Returns:
        Fraction of rows whose predicted label equals ``yss``.
    """
    xss = np.column_stack(([1] * len(xss) ,xss))
    logits = np.dot(xss, w)
    # sigmoid(z) >= 0.5 exactly when z >= 0, so the 0.5 probability threshold
    # used elsewhere in this notebook is a threshold of 0 on the raw score.
    predict = np.where(logits < 0, 0, 1)
    accur = np.mean(predict == np.ravel(yss))
    return accur

In [896]:
class Scaler():
    """Scale a DataFrame that mixes 0/1 dummy columns and numeric columns.

    Columns are addressed by integer *position* throughout.

    * Numeric columns are standardised: ``(x - mean) / std`` with the
      population standard deviation (``ddof=0``).
    * Dummy columns are re-coded from the training proportion ``p`` of ones:
      a value equal to 1 becomes ``1 - p``, anything else becomes ``p``.

    Attributes set by ``fit``:
        numeric_columns / dummy_columns: lists of column positions.
        mean, std: pandas Series of per-column statistics, in the order of
            ``numeric_columns``.
        proportion: pandas Series of per-column means, in the order of
            ``dummy_columns``.
    """

    def fit(self, xss, dummy_columns, numeric_columns):
        """Learn the statistics from the DataFrame ``xss``. Returns ``self``."""
        self.numeric_columns = list(numeric_columns)
        self.dummy_columns = list(dummy_columns)

        ## process numeric matrix
        # Explicit axis=0 / ddof=0: np.mean(df) / np.std(df) pass axis=None to
        # pandas, whose meaning differs between pandas versions.
        numeric_vector = xss.iloc[:, self.numeric_columns]
        self.mean = numeric_vector.mean(axis=0)
        self.std = numeric_vector.std(axis=0, ddof=0)

        ## process dummy matrix
        dummy_vector = xss.iloc[:, self.dummy_columns]
        self.proportion = dummy_vector.mean(axis=0)
        return self

    def transform_dummy(self,xs,proportion):
        """Map each value of ``xs`` to ``1 - proportion`` if it equals 1, else ``proportion``."""
        trans_xs = [1 - proportion if x == 1 else proportion for x in xs]
        return trans_xs

    def transform_numeric(self,xs, mean, std):
        """Standardise ``xs`` with the given ``mean`` and ``std``."""
        xs = (xs-mean)/std
        return(xs)

    def transform(self, xss):
        """Apply the fitted scaling column by column.

        Returns a new DataFrame with default integer row and column labels.

        Raises:
            ValueError: if a column position was listed in neither
                ``dummy_columns`` nor ``numeric_columns`` at fit time.
        """
        row, col = xss.shape
        # position-in-frame -> position-in-statistics lookup tables
        dummy_pos = {c: i for i, c in enumerate(self.dummy_columns)}
        numeric_pos = {c: i for i, c in enumerate(self.numeric_columns)}

        df = []
        for c in range(col):
            xs = xss.iloc[:,c]
            if c in dummy_pos:
                # .iloc: the statistics Series are labelled by the original
                # column labels, so plain [] would do a label lookup.
                proportion = self.proportion.iloc[dummy_pos[c]]
                dff = self.transform_dummy(xs,proportion)
            elif c in numeric_pos:
                idx = numeric_pos[c]
                mean = self.mean.iloc[idx]
                std = self.std.iloc[idx]
                if std == 0:
                    # constant column at fit time: centre only, avoid NaN/inf
                    std = 1.0
                dff = self.transform_numeric(xs,mean,std)
            else:
                raise ValueError(
                    'column position %d was not registered in fit()' % c)

            df.append(dff)
        df = (np.column_stack(df))
        return(pd.DataFrame(df))
        

In [None]:
def get_dummy_id(X_train):
    """Split columns into 'dummy' and 'numeric' by their observed value range.

    A column whose ``describe()`` minimum is >= 0 and maximum is <= 1 is
    classified as a dummy column; every other column is numeric.

    Args:
        X_train: pandas DataFrame.

    Returns:
        dict with keys ``'dummy_columns'`` and ``'numeric_columns'``, each a
        list of integer positions (row positions in ``X_train.describe().T``).
    """
    # NOTE(review): describe() only summarises numeric columns by default, so
    # positions match X_train's column positions only if all columns are numeric.
    df = X_train.describe().T
    row, col = df.shape
    # Plain arrays give positional access; df['max'][r] would do a label
    # lookup whenever the column labels are integers.
    col_max = df['max'].values
    col_min = df['min'].values
    dummy_columns = []
    numeric_columns = []
    for r in range(row):
        if col_max[r] <= 1 and col_min[r] >= 0:
            dummy_columns.append(r)
        else:
            numeric_columns.append(r)

    return({'dummy_columns': dummy_columns,
            'numeric_columns': numeric_columns})

In [897]:
# Load the unscaled data. getData is not defined in this part of the notebook;
# presumably it returns a dict of DataFrames -- TODO confirm against its definition.
res = getData(scale=False)
X_train = res['X_train']
Y_train = res['Y_train']
X_test = res['X_test']

# Classify every column as dummy (values within [0, 1]) or numeric, by position.
res = get_dummy_id(X_train)
dummy_columns = res['dummy_columns']
numeric_columns = res['numeric_columns']

# Fit the scaling statistics on the training set only, then apply the same
# transform to both sets. X_train / X_test are rebound to the scaled frames.
scaler = Scaler()
scaler.fit(X_train, dummy_columns, numeric_columns)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# transformer = FunctionTransformer(np.expm1, validate=True)
# X_train = np.nan_to_num(X_train.values, copy=True)
# X_train = transformer.fit_transform(X_train)

## feature selection

In [526]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

def selectFeature(k):
    """Select the ``k`` training features with the highest chi-squared score.

    Loads the data through ``getData()``, rescales the features to [0, 1]
    (chi2 requires non-negative inputs), scores them against the labels and
    keeps the top ``k``.

    Args:
        k: number of features to keep.

    Returns:
        dict with ``'X_train'`` (the training frame restricted to the
        selected columns, unscaled by this function) and ``'feature'``
        (the list of selected column labels).
    """
    data = getData()
    X_train = pd.DataFrame(data['X_train'])
    Y_train = data['Y_train']
    # Min-max scaling is used only for scoring; the returned frame keeps the
    # values as loaded.
    X_norm = MinMaxScaler().fit_transform(X_train)
    # k must be passed by keyword in current scikit-learn releases.
    chi_selector = SelectKBest(chi2, k=k)
    chi_selector.fit(X_norm, np.ravel(Y_train))
    chi_support = chi_selector.get_support()
    chi_feature = X_train.loc[:,chi_support].columns.tolist()
    print(str(len(chi_feature)), 'selected features')
    X_train = X_train[chi_feature]

    return({'X_train': X_train, 'feature': chi_feature})

In [495]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the training data for validation; random_state=0 fixes the
# shuffle. Note the lower-case x_test/y_test are this hold-out split and are
# distinct from the upper-case X_test loaded earlier.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0)

In [900]:
from IPython.display import clear_output

def gradientDescent(xss, yss , lr = 1, alpha = 0, epochs = 50, max_iter = 10 ** 3):
    """Fit logistic-regression weights with full-batch gradient descent.

    Args:
        xss: feature matrix without a bias column (one is prepended here).
        yss: 1-D array of 0/1 labels, one per row of ``xss``.
        lr: learning rate.
        alpha: unused; kept so existing calls keep working.
        epochs: number of outer rounds; training accuracy is printed after each.
        max_iter: number of gradient steps per epoch.

    Returns:
        dict with ``'w'`` (weights, bias first) and ``'accuracy'`` (training
        accuracy of those weights; ``None`` if ``epochs`` is 0).

    Side effects: clears the cell output and prints the epoch index, the
    predicted labels and the accuracy after every epoch.
    """
    ## get bias
    xss = np.column_stack(([1] * len(xss) ,xss))

    w = np.zeros(xss.shape[1])
    accur = None

    for t in range(epochs):
        for _ in range(max_iter):
            prob = sigmoid(np.dot(xss, w))
            w_grad = np.dot(xss.T, prob - yss)
            # NOTE(review): the summed gradient is scaled by the number of
            # weights, not the number of samples. Left as is because the
            # learning rates used in this notebook are tuned to this scale.
            w -= lr * w_grad / len(w)

        # Evaluate with the weights *after* the last step so the reported
        # accuracy matches the returned w.
        prob = sigmoid(np.dot(xss, w))
        predict = np.where(prob < 0.5, 0, 1)
        accur = np.mean(predict == yss)

        clear_output()
        print(t)
        print(predict)
        print(accur)
    res = {'w': w, 'accuracy': accur}
    return (res)

In [887]:
from IPython.display import clear_output

def stochasticGradientDescent(xss, yss , lr = 1, alpha = 0, epochs = 30, max_iter = 10 ** 5):
    """Fit logistic-regression weights with single-sample stochastic gradient descent.

    Each step draws one row uniformly at random (``np.random.randint``) and
    updates the weights with that row's gradient.

    Args:
        xss: feature matrix without a bias column (one is prepended here).
        yss: 1-D array of 0/1 labels, indexable by row position.
        lr: learning rate.
        alpha: unused; kept so existing calls keep working.
        epochs: number of outer rounds; training accuracy is printed after each.
        max_iter: number of single-sample steps per epoch.

    Returns:
        dict with ``'w'`` (weights, bias first) and ``'accuracy'`` (training
        accuracy of those weights; ``None`` if ``epochs`` is 0).

    Side effects: clears the cell output and prints the epoch index, the
    predicted labels and the accuracy after every epoch.
    """
    ## get bias
    xss = np.column_stack(([1] * len(xss) ,xss))

    w = np.zeros(xss.shape[1])
    l = len(yss)
    accur = None

    for t in range(epochs):
        for _ in range(max_iter):
            ran = np.random.randint(0,l)
            xs = xss[ran]
            ys = yss[ran]
            residual = sigmoid(np.dot(xs,w)) - ys
            # xs is 1-D and residual is a scalar, so the gradient is xs * residual.
            w_grad = xs * residual
            w -= lr * w_grad

        # Threshold the *probability* at 0.5: the raw score has to go through
        # sigmoid first, as in training and in gradientDescent.
        prob = sigmoid(np.dot(xss,w))
        predict = np.where(prob < 0.5, 0, 1)
        accur = np.mean(predict == yss)
        clear_output()
        print(t)
        print(predict)
        print(accur)

    res = {'w': w, 'accuracy': accur}
    return (res)

In [None]:
# Learning-rate sweep: train on a 75% split for each rate in lr_list and
# record training accuracy, validation accuracy and the fitted weights.
his_train = []
his_test = []
features = []
w_train = []
lr_list = np.logspace(-6,0,7)

x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0)

# The bias-augmented validation matrix and labels do not depend on lr.
x_test_bias = np.column_stack(([1] * len(x_test.values) , x_test.values))
y_test_flat = y_test.values.ravel()

for lr in lr_list:
    res = gradientDescent(x_train.values,y_train.values.ravel(),lr)
    w = res['w']
    accur = res['accuracy']
    his_train.append(accur)
    # was `w_g.append(w)`: w_g is never defined; w_train is the list created above
    w_train.append(w)

    predict = sigmoid(np.dot(x_test_bias, w))
    predict = np.where(predict < 0.5, 0, 1)
    accur = np.mean(predict == y_test_flat)
    his_test.append(accur)
    print(his_test)

# was `print(his)`: his is never defined; report validation then training accuracy
print(his_test)
print(his_train)

In [923]:
# Final fit: train on the full training set with learning rate 1e-2.
X_train = pd.DataFrame(X_train)
res = gradientDescent(X_train.values,Y_train.values.ravel(),10**-2)
# w is the weight vector (bias first) used by the prediction cell below.
w = res['w']
# Training accuracy of the final weights (note the spelling: `acurr`).
acurr = res['accuracy']

49
[0 0 0 ... 0 0 1]
0.8533521697736556


In [795]:
# Base file name (no extension) used by the commented-out model / CSV save
# lines in the next cell.
outputFile = '0311-2'

In [925]:
## save model
# np.save('model/'+ outputFile + '.npy',{'w': w, 'scaler': scaler})

## output testing data
row , col = X_test.shape

# Score every test row at once: prepend the bias column, apply the fitted
# weights, squash to a probability and threshold at 0.5.
x_test_bias = np.column_stack((np.ones(row), X_test.values))
predict = sigmoid(np.dot(x_test_bias, w))
label = np.where(predict < 0.5, 0, 1)

# ids are 1-based row numbers, matching the original row-by-row loop.
ans = pd.DataFrame({'id': np.arange(1, row + 1), 'label': label})
print(np.sum(ans['label']))
# ans.to_csv('data/'+outputFile+'.csv', index=False)

3157


In [926]:
# Compare the labels just predicted against a previously saved submission file.
lab4 = pd.read_csv('data/0311-1.csv')['label']
lab3 = ((ans['label']))
# Number of rows labelled 1 in each set of predictions.
print(np.sum(lab3))
print(np.sum(lab4))
# Number of rows on which the two sets of predictions disagree (cell output).
np.sum(lab3 != lab4)

3157
3158


3