# P2.py

In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('./X_train') as X_f, open('./Y_train')as y_f:
    data = []
    while True:
        train_X = X_f.readline()
        if(not train_X): #read until end of file
            break
        single_example = []
        train_X = train_X.strip().split(',') #pre-process with '\n' and ','
        train_y = y_f.readline().strip().split(',')  #pre-process with '\n' and ','
        single_example.extend(train_X)
        single_example.extend(train_y)
        data.append(single_example)
    data = np.array(data[1:]) #ignore row of attribute name

In [3]:
print(data.shape) 

(32561, 107)


In [4]:
std = train_X.std(axis=0)
mean = train_X.mean(axis=0)
train_X = (train_X - mean) / std
mean

AttributeError: 'str' object has no attribute 'std'

In [116]:
# Normalize
train_X = np.array(data[:, :-1], dtype='float')
train_y = np.array(data[:, -1:], dtype='float')

maximum = train_X.max(axis=0)
minimum =train_X.min(axis=0)
train_X = (train_X - minimum) / (maximum-minimum)

data = np.hstack((train_X, train_y))
maximum.tofile('./normalize_Max.model')
minimum.tofile('./normalize_min.model')

In [11]:
def sigmoid(z):
    y = 1 / (1.0 + np.exp(-z))
    return np.clip(y,1e-13,1-(1e-13)) #np.clip to avoid value overflow

In [107]:
def cal_accurancy( w, b):
    # count error with training set
    train_X = np.array(data[:, :-1], dtype='float')
    train_y = np.array(data[:, -1:], dtype='float').ravel()
    predict = np.dot(train_X,w)
    predict = sigmoid(predict)
    
    cross_entropy = -(np.dot(train_y, np.log(predict)) + np.dot((1-train_y), np.log(1-predict)))
    classification = lambda i: 1 if i >= 0.5 else 0
    predict = [classification(i) for i in predict]
    
    err_times = 0
    for i in range(train_y.shape[0]):
        if predict[i] != train_y[i]:
            err_times += 1
    print(err_times / train_y.shape[0], cross_entropy)

In [108]:
def logistic_regression_SGD(learning_rate = 1, epochs = 3000):
    batch_size = 25
    # assume w and b with all ones will have nuch performance than zeros
    w = np.ones((106), dtype='float')
    b = np.ones((1), dtype='float')
    z = np.ones((batch_size), dtype='float')
    y = np.ones((batch_size), dtype='float')
    
    print('training start..')
    for i in range(epochs):
        batch_iteration = 0
        #np.random.shuffle(data) # shuffle data to avoid SGD converge at local minimum
        # an epoch
        #in default, a epoch run 32561 / 25 = 1302... batch times
        while batch_iteration * batch_size < (data.shape[0]): 
            #data extraction
            batch_data = data[batch_iteration*batch_size : batch_iteration*batch_size+25]
            train_X_batch = np.array(batch_data[:, :-1],dtype='float')
            train_y_batch = np.array(batch_data[:, -1:],dtype='float').ravel() #change dimension to 1-D array
            #print('train_X_batch.shape =',train_X_batch.shape,'\n'+'w.shape =',w.shape,'\n'+'b.shape =',b.shape)
            #print('z.shape =',z.shape,'\n'+'y.shape =',y.shape,'\n'+'train_y_batch.shape =',train_y_batch.shape)

            #predict
            z = np.dot(train_X_batch,w) + b
            y = sigmoid(z)

            # update weights and bias
            # use mean instead sum will imporve performance
            w_gradient = np.mean(-1 * train_X_batch * (train_y_batch-y).reshape(batch_data.shape[0],1), axis = 0)
            b_gradient = np.mean(-1 * (train_y_batch-y))


            w = w - learning_rate * w_gradient 
            b = b - learning_rate * b_gradient 
            batch_iteration += 1
            
        learning_rate = learning_rate * 0.998 #adagrad strategy
        cal_accurancy(w, b)
    w.tofile('./w.model')
    b.tofile('./b.model')
    print('training finish..')

In [21]:
logistic_regression_SGD()

training start..
training finish..
0.7538159147446332 139241.75137664823


In [119]:
def testing(filename = './X_test'):
    #read testing dataset
    with open(filename) as f:
        test_data = []
        test_num = 0
        while True:
            test_X = f.readline()
            if(not test_X): #read until end of file
                break
            test_num += 1
            single_example = []
            test_X = test_X.strip().split(',') #pre-process with '\n' and ','
            single_example.extend(test_X)
            test_data.append(single_example)
        test_data = np.array(test_data[1:]).astype('float') #ignore row of attribute name
        
        #loading model
        w = np.fromfile('./w.model')
        b = np.fromfile('./b.model')
        maximum = np.fromfile('./normalize_Max.model')
        minimum = np.fromfile('./normalize_min.model')
        test_data = (test_data - minimum) / (maximum-minimum)

        #predict
        z = np.dot(test_data,w) + b
        y = sigmoid(z)
        
        #write to file
        write_to_file(y)

In [None]:
def write_to_file(test_y,filename = './predictions.csv'):
    with open(filename, 'w') as f:
        classification = lambda i: 1 if i >= 0.5 else 0
        predict = [classification(i) for i in test_y]
        f.write('id,label\n')
        for i in range(len(predict)):
            f.write(str(i) + ',' + str(predict[i]) + '\n')
            print(test_y[i])

In [120]:
testing()

[[0.10958904 0.14569009 1.         ... 0.         0.         0.        ]
 [0.28767123 0.05265413 1.         ... 0.         0.         0.        ]
 [0.15068493 0.22049823 1.         ... 0.         0.         0.        ]
 ...
 [0.28767123 0.24632781 1.         ... 0.         0.         0.        ]
 [0.36986301 0.0486315  1.         ... 0.         0.         0.        ]
 [0.24657534 0.11536314 1.         ... 0.         0.         0.        ]]
0.002267246946601956
0.11832687117014551
0.38296288902225556
0.7746002321786276
0.0012533907239255088
0.004527707803145329
0.010184613368349454
0.8904538359825733
0.006296617759001705
0.048477513797885015
0.7710677130510807
0.586103386350347
0.00960181403459428
0.18140722472375975
0.506213083725992
0.8257010342794889
0.0022234343740287154
0.2780124333544317
0.008712600666080506
0.783422055010969
0.6421275333570335
0.0017828237872891636
0.0026834065401549773
0.04849507986846808
0.5189292835998746
0.92483362583429
0.003829326674676372
0.026387931987992

0.5207324929362086
0.4406192722861434
0.02917006580590238
0.6561231911679155
0.25947467938568436
0.0117735378730822
0.1821528852203976
0.050585278441066917
0.9069616253485848
0.007274632525772604
0.02919452431162054
0.001999657073706485
0.12249893563484564
0.16676469940040525
0.2223588838491563
0.32628882314086416
0.21900284671253678
0.2343742170776794
0.016074024104047315
0.004370530235818031
0.0061602548640851285
0.4235424860650185
0.230354590242376
0.020241564482662153
0.15151124357506204
0.053360066512595436
0.6116181398765635
0.038484768687455206
0.015204828239706593
0.12482290642394175
0.09690345389680997
0.09591061172974871
0.00979999043354668
0.2527061931061834
0.025045617076816344
0.23431020271267108
0.1563446638535436
0.2757996885646585
0.08429697353464258
0.12860425086623678
0.036692251406213544
0.0032718938920001618
0.0007945231038045649
0.03515549079692696
0.026447175794920126
0.22563798020421275
0.08036005504316823
0.10000957395467895
0.12434599747894003
0.010706031038921

0.09501324820675479
0.019399506177306864
0.18673351964492038
0.09041639859744036
0.005423856761361827
0.3635753457408395
0.046435738623370006
0.3366426659647568
0.012568919149166755
0.00025706138588530483
0.0027543706615126206
0.1847609136481365
0.4616745772224616
0.7435382390688345
0.00425786204093839
0.36747047086179263
0.34210357498132793
0.11849557095568891
0.13797333532816106
0.033698282258770404
0.07512275347557475
0.004695798883289243
0.023942235543926254
0.01084218494014309
0.014092736494772676
0.6499604657117375
0.16878693773229897
0.010056002058780725
0.03575444283445992
0.7538203003966483
0.03444470280408471
0.17147784618186365
0.09159766192559494
0.5123013695368418
0.13784957121857805
0.003700522950095958
0.31500851284127973
0.16075167607172644
0.02453221841141063
0.09336019179860212
0.5146032552174499
0.0006754152423270564
0.002400379824467046
0.9036130897356248
0.008916438477458509
0.015013744674547429
0.7239159588559748
0.002751214763774195
0.05168348288332601
0.00092411

0.012035070047927901
0.08111544208409398
0.0523046445860835
0.04597589609960714
0.053859155877704845
0.03644197944565818
0.01933325734647762
0.15922521835663567
0.01860232017060314
0.8738860822630223
0.07590157054707067
0.26146354196153865
0.20336811364681898
0.062122029554436416
0.24737268823052244
0.41107837777397593
0.017235313950211592
0.017196015857581907
0.1597120024873604
0.0005065046954423538
0.001618070479635415
0.12617406498903058
0.5527022464587308
0.9658538605280037
0.9999999999999
0.7177289859261217
0.01812122271484711
0.37128587635958665
0.5265672034575803
0.02179985421146381
0.015761269865402067
0.9205547511935657
0.23827800265398597
0.039715333373032735
0.02385649879187484
0.009683276485705354
0.045834080794776355
0.0742549494733875
0.5185630612471152
0.014065929883507392
0.4469237891200634
0.21988519162545478
0.711190059358682
0.039985388281038876
0.05983910583895465
0.05012960934284038
0.23034559155033454
0.9368016808896764
0.048565049759328
0.04891048497817959
0.0486

0.015831966032830182
0.7005965032442983
0.05077568029227567
0.014109975070599001
0.0013351708663163978
0.11731150957882318
0.02677776333380734
0.6069099996547748
0.053990710219804984
0.27058686012756905
0.0033392099883150146
0.06201384512951037
0.6202196320336005
0.01451016928227049
0.04899775051281107
0.027429518766911604
0.07337956095958423
0.32251925218013816
0.1696415462610109
0.0004680506079176082
0.42544483190897536
0.04803330304035571
0.08022263321716545
1e-13
0.20887498719734918
0.21213479588270398
0.0454353652344132
0.11750142682519328
0.0006451532356867226
0.007995378180634559
0.027124941507492187
0.2629055212742859
0.10517216668579768
0.04006235258465778
0.05509271801882891
0.019055587927260226
0.10003986701770165
0.593702922219649
0.23907111046681556
0.883172596018664
0.024979088371694137
0.0005067050965533273
0.12893372322133598
0.04787621359228422
0.04231754800838639
0.29610036282200997
0.9099073244037639
0.009411689287859829
0.393638181690583
0.05074182135248548
0.088101

0.13508035608778313
0.010276611791887132
0.043365358936167
0.3649466616496857
0.4753395752124523
0.016590211473787364
0.03909715620094606
0.3229249285042603
0.20040355446639335
0.007308223704759635
0.07986020646261784
0.015467242044270722
0.0030202271980077767
0.21225458906159797
0.20502219222467374
0.00776915205539037
0.0010243884807874819
0.2633109558902227
0.09583214797206979
0.007644950679619274
0.05679821675226074
0.03184034861721096
0.059920649344282005
0.2327861564863339
0.00904615351929055
0.429555177128521
0.2614425854268209
0.13002684050110364
0.03914991405095123
0.2093145146226842
0.5076711795363327
0.020243432133432857
0.004743683867090961
0.0019949542125658345
0.3446339935671011
0.029817678156516255
0.34541088456973873
0.8882746651704136
0.19928653689148812
0.9984059473351058
0.027564115341244734
0.7074412138254396
0.06462819150808505
0.21048686062169528
0.11962527112422575
0.04960625840406685
0.01231611487899773
0.02253433638235894
0.33453156457186467
0.00304484564298897


0.005387782336589479
0.007091146769295296
0.2934579212487152
0.3526278706051881
0.05098175886770058
0.009588425520690847
0.15198168488410074
4.904963833563489e-05
0.44288642136487527
0.4256569217588224
0.16516287815798408
0.613764071900589
0.05102965220615265
0.0034300947717611804
0.2839822179777151
0.2590963951439373
0.0433535580496577
0.7438426840340335
0.6649131602039862
0.013011484989203733
0.032969161630697935
0.006006048320538448
0.5527804062569757
0.02972424469397885
0.029332878572613862
0.31442201445119283
0.013735166087619391
0.7903002318138735
0.2931988600892345
0.08480571726248312
0.9944297542961573
0.2607671220353211
0.5426626988745017
0.014076684520633887
0.6376364495177929
0.0005414319398592327
0.007482395157711229
0.000344404625657
0.0007237897442112272
0.16266519026237536
0.6200008656106039
0.1307009044122518
0.010403755614839335
0.008396157904718995
0.0014777795695509982
0.43408784477585816
0.5962479262673134
0.16577654622771656
0.005400452405463476
0.00732787199288358