In [1]:
import numpy as np
import pandas as pd 

In [37]:
class nn:
    def __init__(self,hidden_layer_size=100,learning_rate=0.01,neurons=20,iterations=60000,activation_function='tan'):
        self.hidden_layer_size=hidden_layer_size
        self.activation_function=activation_function
        self.learning_rate=learning_rate
        self.layer=list()
        self.layer_weights=list()
        self.output_layer=1
        self.iterations=iterations
        self.neurons=neurons
        
        
    def create_network(self,X):
        #np.random.seed(1) #to have random in between the specific range
        random_weights=2*np.random.random((X.shape[1],self.neurons))-1
        self.layer_weights.append(random_weights)
        for i in range(self.hidden_layer_size-2):
            random_weights=2*np.random.random((self.neurons,self.neurons))-1
            self.layer_weights.append(random_weights)
        random_weights=2*np.random.random((self.neurons,self.output_layer))-1
        self.layer_weights.append(random_weights)
        
        
    def activation(self,x,derivative=False):
        if derivative:
            if self.activation_function == "sigmoid":
                return x * (1 - x)
            if self.activation_function=="tan":
                return 1.0 - np.tanh(x)**2
            if self.activation_function == "ReLU":
                return (x > 0).astype(int)        
        else:
            if self.activation_function == "sigmoid":
                return 1 / (1 + np.exp(-x))
            if self.activation_function=="tan":
                    return np.tanh(x)
            if self.activation_function == "ReLU":
                return x * (x > 0)
            
        
    def fit(self,X,Y):
        end_error=0
        self.create_network(X)
        for _ in range(self.iterations):
            #feed forward throught the network
            self.layer=list()
            self.layer.append(X)
            for i in range(self.hidden_layer_size):
                hidden_layer=self.activation(np.dot(self.layer[i],self.layer_weights[i]))
                self.layer.append(hidden_layer)
            
            error=Y-self.layer[-1]
            end_error=np.mean(np.abs(error))
#             if(_%100==1):
#                 print(str(_)+" Error "+str(end_error))
            for i in range(self.hidden_layer_size,0,-1):
                delta = error*self.activation(self.layer[i],derivative=True)
                error = delta.dot(self.layer_weights[i-1].T)
                self.layer_weights[i-1] += self.layer[i-1].T.dot(delta)
       
        print("End Error"+str(end_error))

    
    def predict(self,X):
        predicted=X
        for i in range(self.hidden_layer_size):
            predicted=self.activation(np.dot(predicted,self.layer_weights[i]))
        predict=predicted
        if (self.activation_function=='sigmoid'):
            predict[predict>0.5]=1
            predict[predict<=0.5]=0
        if(self.activation_function=='tan'):
            predict[predict>0]=1
            predict[predict<=0]=0
        return predict.ravel()
    
    def score(self,X_test,Y_true):
        predict=self.predict(X_test)
        return np.sum(predict.ravel()==Y_true.ravel())/Y_true.shape[0]


In [42]:
def normalize(inputData):
    #return (inputData - inputData.mean()) / inputData.std()
    return (inputData - inputData.min()) / (inputData.max() - inputData.min())

#reads the datafiles and returns the training and the testing data
def get_data():
    # get test & test csv files as a DataFrame
    train_df = pd.read_csv("data/train.csv")
    test_df    = pd.read_csv("data/test.csv")
    
    #removing race and native country
    cols_to_drop=['race','native-country','fnlwgt']
    for col in cols_to_drop:
        train_df=train_df.drop([col],axis=1)
        test_df=test_df.drop([col],axis=1)

    numericalColumns = ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week')
    for i in numericalColumns:
        train_df[i] = normalize(train_df[i])
        test_df[i] = normalize(test_df[i])

    
    #creating dummies of the data
    train_df=pd.get_dummies(train_df)
    test_df=pd.get_dummies(test_df)

    #remove unwanted columns and the columns that are created for ?
    columns_to_remove=set(list(train_df)).symmetric_difference(set(list(test_df)))
    columns_to_remove.remove('salary')
    for col in list(train_df):
        if (col in columns_to_remove) or ("?" in col) :
            train_df=train_df.drop(col,1)
    for col in list(test_df):
        if (col in columns_to_remove) or ("?" in col) :
            test_df=test_df.drop(col,1)
    
    return train_df,test_df


def process_data(percent):
    train_df,test_df=get_data()
    test_ids=test_df['id'].as_matrix()
    train_df=train_df.drop(['id'],1)
    test_df=test_df.drop(['id'],1)
    train_df['const']=1
    test_df['const']=1
    Y=train_df['salary'].as_matrix()
    X=train_df.drop(['salary'], axis=1).as_matrix()
    Y=Y.reshape(len(Y),1)
    end=int(X.shape[0] * percent)
    #training data
    train_X=X[:end,:]
    train_Y=Y[:end,:]
    #data for cross validation
    cross_X=X[end:,:]
    cross_Y=Y[end:,:]
    #testing data
    test_X=test_df.as_matrix()
    return train_X,train_Y,cross_X,cross_Y,test_X,test_ids



#writes the predicted values to file 
def write_result(ids,predicted,file_name):
    output=np.column_stack((ids,predicted))
    np.savetxt(file_name,output,delimiter=",",fmt="%d,%d",header="id,salary",comments ='')
    


In [55]:

train_X,train_Y,cross_X,cross_Y,test_X,test_ids= process_data(0.5)
neural_network=nn(hidden_layer_size=3,neurons=20,iterations=100,learning_rate=0.00001,activation_function='tan')
neural_network.fit(train_X,train_Y)
predict=neural_network.predict(cross_X)
print("Score ",neural_network.score(cross_X,cross_Y))

End Error0.762906702248
Score  0.289064504541


In [56]:
predict=neural_network.predict(test_X)
print(test_ids.shape,predict.shape)
write_result(test_ids,predict,"hareesh.csv")

(6878,) (6878,)


    
## Classification using libraries


In [52]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


train_X,train_Y,cross_X,cross_Y,test_X,test_ids= process_data(0.50)
X_train=train_X
Y_train=train_Y.ravel()
X_test=cross_X
Y_test=cross_Y.ravel()

#-----------  Neural Netowrk------------------
neural_network=nn(hidden_layer_size=3,neurons=20,iterations=100,learning_rate=0.0001,activation_function='tan')
neural_network.fit(train_X,train_Y)
print("Neural Network : "+str(neural_network.score(cross_X, cross_Y)))


#----------- Logistic Regression------------------
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
print("Logistic Regression : "+ str(logreg.score(X_test, Y_test)))



#----------- Support Vector Machines------------------
svc = SVC()
svc.fit(X_train, Y_train)
print("Support Vector Machines : "+str(svc.score(X_test, Y_test)))



#-----------  Random Forests------------------
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
print("Random Forests : "+str(random_forest.score(X_test, Y_test)))



#----------- K NN Classification------------------
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, Y_train)
print("K NN Classification : "+str(knn.score(X_test, Y_test)))


#-----------  Gaussian Naive Bayes------------------
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
print("Gaussian Naive Bayes : "+str(gaussian.score(X_test, Y_test)))

End Error1.23709329775
Neural Network : 0.75737671268
Logistic Regression : 0.84774465028
Support Vector Machines : 0.834094524555
Random Forests : 0.844563042028
K NN Classification : 0.829578693488
Gaussian Naive Bayes : 0.644583568533


In [51]:
#playgorund
X= np.array([ [0,0,1],
              [0,1,1],
              [1,0,10],
              [10,1,1], ])

Y=np.array([ [ 0,1,1,1 ] ]).T

n=nn(hidden_layer_size=3,neurons=20,iterations=100,activation_function='sigmoid')
n.fit(X,Y)
predict=n.predict(X)
print(predict)

1 Error 0.297777800194
End Error0.0817159495915
[ 0.13131335  0.89094717  0.93470715  0.98301249]
1.0


In [None]:
#playgorund
X= np.array([ [0,0,1],
              [0,1,1],
              [1,0,10],
              [10,1,1], ])

Y=np.array([ [ 0,1,1,1 ] ]).T

n=nn(hidden_layer_size=3,neurons=20,iterations=100,activation_function='sigmoid')
n.fit(X,Y)
predict=n.predict(X)
print(predict)