In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame


#reads the datafiles and returns the training and the testing data
def get_data():
    # get test & test csv files as a DataFrame
    train_df = pd.read_csv("data/train.csv")
    test_df    = pd.read_csv("data/test.csv")

    #creating dummies of the data
    train_df=pd.get_dummies(train_df)
    test_df=pd.get_dummies(test_df)

    #remove unwanted columns and the columns that are created for ?
    columns_to_remove=set(list(train_df)).symmetric_difference(set(list(test_df)))
    columns_to_remove.remove('salary')
    columns_to_remove.add('race')
    columns_to_remove.add('native-country')
    for col in list(train_df):
        if (col in columns_to_remove) or ("?" in col) :
            train_df=train_df.drop([col],axis=1)
    for col in list(test_df):
        if (col in columns_to_remove) or ("?" in col) :
            test_df=test_df.drop([col],axis=1)
    
    return train_df,test_df


def process_data(percent):
    train_df,test_df=get_data()
    Y=train_df['salary'].as_matrix()
    X=train_df.drop(['salary'], axis=1).as_matrix()
    Y=Y.reshape(len(Y),1)
    end=int(X.shape[0] * percent)
    #training data
    train_X=X[:end,:]
    train_Y=Y[:end,:]
    #data for cross validation
    cross_X=X[end:,:]
    cross_Y=Y[end:,:]
    #testing data
    test_X=test_df.as_matrix()
    test_ids=test_df['id'].as_matrix()
    #replacing ids as bias term
    train_X[:0]=1
    test_X[:0]=1
    
    return train_X,train_Y,cross_X,cross_Y,test_X,test_ids




# using softmax as output layer is recommended for classification where outputs are mutually exclusive
def softmax(w):
    e = np.exp(w - np.amax(w))
    dist = e / np.sum(e)
    return dist

# using tanh over logistic sigmoid for the hidden layer is recommended   
def tanh(x):
    return np.tanh(x)
    
# derivative for tanh sigmoid
def dtanh(y):
    return 1 - y*y


#sigmoid funciton
def sigmoid(x):
    return 1/(1+np.exp(-x))

#differntial sigmoid
def dsigmoid(x):
    return x*(1-x)

#Neural Network implementation
def nn(X_train,Y_train,neurons,learning_rate):
    np.random.seed(1) #random number
    w0=np.random.random((X_train.shape[1],neurons))-1
    w1=np.random.random((neurons,1))-1
    alpha=1
    for j in range(5000):
        #forward prop
        l0=X_train
        l1=sigmoid(np.dot(l0,w0))
        l2=sigmoid(np.dot(l1,w1))
        l2_error=Y_train-l2
        l2_delta=l2_error*dsigmoid(l2)
        l1_error=np.dot(l2_delta,w1.T)
        l1_delta=l1_error*dsigmoid(l1)
        w1+=alpha*(np.dot(l1.T,l2_delta))
        w0+=alpha*(np.dot(l0.T,l1_delta))
    
    return w0,w1

def predict(X_test,w0,w1):
    hidden_layer=sigmoid(np.dot(X_test,w0))
    Y_test=sigmoid(np.dot(hidden_layer,w1))

    return Y_test

#writes the predicted values to file 
def write_result(ids,predicted,file_name):
    output=np.column_stack((ids,predicted))
    np.savetxt(file_name,output,delimiter=",",fmt="%d,%d",header="id,salary",comments ='')


In [None]:
train_X,train_Y,cross_X,cross_Y,test_X,test_ids= process_data(0.115)
w0,w1=nn(train_X,train_Y,100,1)



In [None]:
predicted=predict(test_X,w0,w1)

In [28]:

predicted[predicted>0.5]=1
predicted[predicted<=0.5]=0
print(predicted[:5,:])

[[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]


In [34]:
print(np.count_nonzero(predicted))

6878


In [3]:
print(test_ids.shape,predicted.shape)
write_result(test_ids,predicted,"10neurons")

(38973, 15) (38973,)
[[-0.92802572 -0.03272367 -0.43189954 -0.79670677 -0.74767426 -0.25617415
  -0.80457052 -0.41864107 -0.02998001 -0.1531712 ]]
(6878, 1)


In [35]:
np.savetxt('checking',predicted,delimiter=",",fmt="%f",header="salary",comments ='')



0    0
1    1
2    2
3    3
Name: id, dtype: int64

In [36]:
X= np.array([ [0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1], ])

Y=np.array([ [ 0,1,0,1 ] ]).T
print(X.shape,Y.shape)
w0,w1=nn(X,Y,100,1)
predicted=predict(X,w0,w1)
print(predicted)


(4, 3) (4, 1)
predicting
[[ 0.00431016]
 [ 0.99467133]
 [ 0.00934583]
 [ 0.99012167]]


In [7]:
predicted=nn(X,Y,X,50,1)
print(predicted)
predicted=nn(X,Y,X,50,0.1)
print(predicted)
predicted=nn(X,Y,X,50,0.01)
print(predicted)

[[-0.34389533  1.69680924  0.48157856 -0.14194807  0.2734018   0.50042163
  -0.24033495  0.68571935  1.47872693  1.94862908 -0.19434768  0.3056804
   1.04407801  1.52870164 -0.43859884  0.10100418  0.11646271  0.9265588
   0.94821002  0.2628298   0.27657514  0.92647246  0.10721155  0.42481585
   1.15574883  0.9852561   0.84884338  0.13334203 -0.29088181  0.49790059
   0.79245257  0.47721154  1.63659464 -0.35730444  0.37394008  0.89385101
   0.35819129  1.2520115   0.67314849  1.90370291  0.29206175 -0.00753055
   0.40386074  0.83959593  0.5159915   0.65617943  0.02774922  0.22203849
   0.36847031  0.6532641 ]]
[[ 0.99999593]
 [ 0.99997299]
 [ 0.99996986]
 [ 0.99986455]]
[[-0.34389533  1.69680924  0.48157856 -0.14194807  0.2734018   0.50042163
  -0.24033495  0.68571935  1.47872693  1.94862908 -0.19434768  0.3056804
   1.04407801  1.52870164 -0.43859884  0.10100418  0.11646271  0.9265588
   0.94821002  0.2628298   0.27657514  0.92647246  0.10721155  0.42481585
   1.15574883  0.9852561   

In [8]:
predicted=nn(X,Y,X,10,1)
print(predicted)
predicted=nn(X,Y,X,10,0.1)
print(predicted)
predicted=nn(X,Y,X,10,0.01)
print(predicted)

[[-1.37107119  2.6894496  -0.5949312   1.17090215 -0.94650185 -0.88984271
  -0.6618885   0.10529835 -1.72403777  1.76452551]]
[[ 0.5041566 ]
 [ 0.95311554]
 [ 0.95238837]
 [ 0.98957434]]
[[-1.37107119  2.6894496  -0.5949312   1.17090215 -0.94650185 -0.88984271
  -0.6618885   0.10529835 -1.72403777  1.76452551]]
[[ 0.5041566 ]
 [ 0.95311554]
 [ 0.95238837]
 [ 0.98957434]]
[[-1.37107119  2.6894496  -0.5949312   1.17090215 -0.94650185 -0.88984271
  -0.6618885   0.10529835 -1.72403777  1.76452551]]
[[ 0.5041566 ]
 [ 0.95311554]
 [ 0.95238837]
 [ 0.98957434]]


In [33]:
predicted=nn(X,Y,X,100,1)
print(predicted)
predicted=nn(X,Y,X,100,0.1)
print(predicted)
predicted=nn(X,Y,X,100,0.01)
print(predicted)

[[-0.1877674  -0.12463105 -0.31118023 -0.43023434 -0.8387227  -0.53287079
  -0.65465191 -0.77454642 -0.40701164 -0.68708662 -0.08348778 -0.08996284
  -0.74260828 -0.88873277 -0.80693394 -0.49996737 -0.27116707 -0.79117226
  -0.75159699 -0.14821368 -0.58394273 -0.3830378  -0.76620407 -0.89764275
  -0.48364284 -0.52224814 -0.84707527 -0.37780014 -0.4557538  -0.34536896
  -0.85521737 -0.24802188 -0.77736216 -0.48046715 -0.21423818 -0.97743172
  -0.67531522 -0.12666656 -0.15501587 -0.46097568 -0.13296656 -0.04979487
  -0.17339843 -0.14565734 -0.90105901 -0.34845366 -0.29623666 -0.38945791
  -0.20005772 -0.96520811 -0.22967045 -0.26755791 -0.74006198 -0.74262543
  -0.36741437 -0.65434829 -0.20313002 -0.55356971 -0.21683042 -0.00900907
  -0.69952495 -0.85659868 -0.09819548 -0.45798894 -0.02503698 -0.36320803
  -0.0057911  -0.45362154 -0.47312512 -0.86423448 -0.64394163 -0.97347605
  -0.83932439 -0.25401082 -0.96915495 -0.63311315 -0.13724919 -0.30710578
  -0.3088173  -0.81092727 -0.55747709 