# P2.py

In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('./X_train') as X_f, open('./Y_train') as y_f:
    # Walk the feature file line by line and pull the label line at the same
    # position from the label file; each merged row is the feature fields
    # followed by the label fields.
    rows = []
    for feature_line in X_f:
        label_line = y_f.readline()
        feature_fields = feature_line.strip().split(',')  # drop '\n', split on ','
        label_fields = label_line.strip().split(',')
        rows.append(feature_fields + label_fields)
    # The first line of both files holds the attribute names, so skip it.
    data = np.array(rows[1:])

In [3]:
# Sanity check on the merged table: one row per example, feature columns
# followed by the label column(s).
print(data.shape) 

(32561, 107)


In [4]:
# Standardization
# Split the merged table into the feature matrix and the label column
# (the label is the last column of `data`).
train_X = np.array(data[:, :-1], dtype='float')
train_y = np.array(data[:, -1:], dtype='float')

# Per-feature statistics.  They are written to disk below so the same
# transform can be applied to the test set at prediction time.
std = train_X.std(axis=0)
mean = train_X.mean(axis=0)
# A constant column has std == 0 and would become NaN/inf after the division;
# scale it by 1 instead so that it simply turns into a column of zeros.
std[std == 0] = 1.0
train_X = (train_X - mean) / std

# Re-attach the labels so downstream code keeps the "label in last column" layout.
data = np.hstack((train_X, train_y))
std.tofile('./stdandardize_std.model')
mean.tofile('./stdandardize_mean.model')

In [5]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), kept away from 0 and 1.

    z may be a scalar or a NumPy array; the computation is element-wise.
    The result is limited to [1e-13, 1 - 1e-13] so that log(y) and
    log(1 - y) stay finite for callers that compute a cross-entropy.
    """
    # For |z| > ~30 the result is already pinned to the output bounds, so
    # clamping z first does not change any returned value, but it keeps
    # np.exp from overflowing (and warning) on large negative inputs.
    z = np.clip(z, -50, 50)
    y = 1 / (1.0 + np.exp(-z))
    return np.clip(y, 1e-13, 1 - (1e-13))

In [6]:
def cal_accurancy(data, w, b):
    """Print the accuracy and cross-entropy of the model (w, b) on data.

    data holds one example per row with the label in the last column;
    w and b are the weights and bias of the logistic model.  Nothing is
    returned: the fraction of correctly classified rows and the summed
    cross-entropy are printed on a single line.
    """
    features = np.array(data[:, :-1], dtype='float')
    labels = np.array(data[:, -1:], dtype='float').ravel()
    prob = sigmoid(np.dot(features, w) + b)

    # Cross-entropy summed (not averaged) over every row.
    log_likelihood = np.dot(labels, np.log(prob)) + np.dot((1 - labels), np.log(1 - prob))
    cross_entropy = -log_likelihood

    # Threshold the probabilities at 0.5, then count the rows whose predicted
    # class matches the label.
    predicted = np.where(prob >= 0.5, 1, 0)
    hits = int(np.count_nonzero(predicted == labels))
    print(hits / labels.shape[0], cross_entropy)

In [7]:
def _safe_divide(numerator, denominator):
    """Return numerator / denominator element-wise, with 0 where denominator is 0."""
    return np.divide(numerator, denominator,
                     out=np.zeros_like(denominator), where=denominator > 0)


def logistic_regression_SGD(data, learning_rate=1, epochs=500, batch_size=25):
    """Train a logistic-regression model with mini-batch SGD and Adagrad.

    Args:
        data: 2-D array, one example per row, label in the last column.
            The caller's array is not modified.
        learning_rate: base step size applied before the Adagrad scaling.
        epochs: number of full passes over the data.
        batch_size: number of rows per mini-batch (the last batch of an
            epoch may be smaller).

    Returns:
        Tuple (w, b): the weight vector (one entry per feature) and the
        bias (array of shape (1,)).

    Raises:
        ValueError: if batch_size is smaller than 1.

    Side effects:
        Prints progress, calls cal_accurancy after every epoch, and writes
        the final parameters to './w.model' and './b.model'.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # Work on a float copy: the per-epoch shuffle then leaves the caller's
    # array untouched and the dtype conversion happens only once.
    data = np.array(data, dtype='float')
    n_examples = data.shape[0]
    n_features = data.shape[1] - 1  # every column except the label

    # Weights and bias start at zero.
    w = np.zeros(n_features, dtype='float')
    b = np.zeros(1, dtype='float')
    # Adagrad state: running sums of squared gradients.
    pre_w = np.zeros(n_features, dtype='float')
    pre_b = np.zeros(1, dtype='float')

    print('training start..')
    for _ in range(epochs):
        # Reshuffle every epoch so the mini-batches differ between passes.
        np.random.shuffle(data)
        for start in range(0, n_examples, batch_size):
            batch = data[start:start + batch_size]
            train_X_batch = batch[:, :-1]
            train_y_batch = batch[:, -1]

            # Predicted probabilities for the batch.
            y = sigmoid(np.dot(train_X_batch, w) + b)

            # Gradient of the cross-entropy; averaging over the batch (rather
            # than summing) keeps the step size independent of the batch size.
            error = y - train_y_batch
            w_gradient = np.mean(train_X_batch * error.reshape(-1, 1), axis=0)
            b_gradient = np.mean(error)

            pre_w += w_gradient ** 2
            pre_b += b_gradient ** 2

            # A parameter whose gradient has been exactly zero so far has a
            # zero accumulator; _safe_divide turns the 0/0 into "no update"
            # instead of NaN.
            w = w - _safe_divide(learning_rate * w_gradient, np.sqrt(pre_w))
            b = b - _safe_divide(learning_rate * b_gradient, np.sqrt(pre_b))

        cal_accurancy(data, w, b)

    # save model
    w.tofile('./w.model')
    b.tofile('./b.model')
    print('training finish..')
    return w, b

In [8]:
# Train on the standardized data; the accuracy and cross-entropy on the
# training set are printed after every epoch.
logistic_regression_SGD(data, learning_rate = 1, epochs = 500)

training start..
0.8354473142716747 12528.9049103
0.8422345751051872 11402.6980403
0.8407297073185713 11325.7762638
0.847394121802156 10850.9656495
0.8476398145020116 10854.570958
0.8436780197168392 10840.5837651
0.8470255827523725 10717.0874564
0.8469948711648905 10706.9596603
0.8493903749884831 10656.2828349
0.8487454316513621 10689.6876389
0.8480390651392771 10742.5351824
0.8491753938761094 10667.8714156
0.8496360676883388 10689.8226173
0.8484690273640244 10594.6079982
0.8500660299130862 10553.9797409
0.8498817603881945 10580.0617953
0.8487147200638802 10550.2038184
0.8491139707011456 10597.5531879
0.8499124719756764 10583.269454
0.8474248333896379 10612.8363882
0.8493596634010012 10616.9495308
0.8500967415005681 10543.8280611
0.8498817603881945 10525.5677062
0.8488989895887719 10512.5223811
0.8509566659500629 10489.7602046
0.8504038573753877 10487.1042794
0.8496053561008569 10454.3984231
0.8511409354749547 10569.711044
0.8498203372132306 10508.8409255
0.8499738951506404 10441.99936

0.8525843800866066 10323.9653222
0.8527072264365345 10324.8730604
0.8527686496114985 10319.3781672
0.8529529191363902 10323.5268773
0.8528607843739443 10323.4677309
0.8522158410368232 10332.1845074
0.8525536684991247 10329.643066
0.8512944934123645 10326.7055842
0.8524922453241608 10313.775787
0.8531064770737999 10330.5726776
0.8524308221491969 10340.5861578
0.8526765148490526 10335.1673197
0.8531064770737999 10313.5338949
0.8520315715119314 10329.1075213
0.8521237062743773 10316.6507692
0.8525536684991247 10326.3647685
0.8521544178618593 10330.5334042
0.853045053898836 10320.8051937
0.8532907465986916 10317.0356984
0.8527379380240164 10326.2331896
0.8524922453241608 10335.4357255
0.852338687386751 10327.0496417
0.8531064770737999 10330.9139377
0.8524001105617149 10332.1664186
0.8524001105617149 10320.8874472
0.8520315715119314 10316.0763902
0.853045053898836 10319.4765514
0.8528607843739443 10316.4745487
0.8527379380240164 10329.3060267
0.8533521697736556 10331.6865138
0.8534443045361

In [9]:
def testing(filename = './X_test'):
    """Predict labels for the test set in `filename` and write them to disk.

    The file is read as comma-separated text whose first line holds the
    attribute names.  The saved model ('./w.model', './b.model') and the
    saved standardization statistics ('./stdandardize_std.model',
    './stdandardize_mean.model') are loaded, the features are standardized
    the same way as the training data, and the predicted probabilities are
    handed to write_to_file.
    """
    print('Starting testing..')
    # Read the testing dataset.  Blank lines (e.g. an empty line at the end
    # of the file) are skipped so they do not end up as ragged rows that
    # break the float conversion.
    with open(filename) as f:
        rows = [line.strip().split(',') for line in f if line.strip()]
    test_data = np.array(rows[1:]).astype('float')  # ignore row of attribute names

    # Load the trained parameters and the training-set statistics.
    w = np.fromfile('./w.model')
    b = np.fromfile('./b.model')
    std = np.fromfile('./stdandardize_std.model')
    mean = np.fromfile('./stdandardize_mean.model')
    test_data = (test_data - mean) / std  # Standardization

    # Predicted probability of the positive class for each row.
    y = sigmoid(np.dot(test_data, w) + b)

    write_to_file(y)
    print('Testing finishing..')

In [10]:
def write_to_file(test_y,filename = './predictions.csv'):
    """Write thresholded predictions to a CSV file with an ``id,label`` header.

    Every value of test_y that is >= 0.5 becomes label 1, anything else
    label 0.  The id of a row is its 0-based position in test_y.
    """
    with open(filename, 'w') as f:
        labels = [1 if score >= 0.5 else 0 for score in test_y]
        f.write('id,label\n')
        f.writelines('%d,%d\n' % (row_id, label)
                     for row_id, label in enumerate(labels))

In [None]:
# Run prediction on the default test file ('./X_test'); the labels are
# written to './predictions.csv'.
testing()