In [160]:
# Third-party imports used throughout the notebook.
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype

# Widen pandas' display limits so large frames are shown without truncation.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 115)
pd.set_option('display.width', 200)

# Machine epsilon for Python floats; subtracted from medians when building bins.
eps = np.finfo(float).eps

In [161]:
def genNumBuckets(data,cats):
    """Build median-split bin edges for every column named in ``cats``.

    Returns a list aligned with ``cats``. For a column whose dtype is not
    string-like the entry is ``[-inf, median - eps, +inf]``; for a string-like
    column the entry is an empty list.
    """
    def edges_for(column):
        # String-like columns are not bucketed; an empty placeholder keeps
        # the result aligned with ``cats``.
        if is_string_dtype(column):
            return []
        # NOTE(review): eps is the machine epsilon relative to 1.0, so for
        # medians of large magnitude this subtraction may round straight back
        # to the median -- confirm that is the intended cut point.
        split = column.median()-eps
        return [float('-inf'),split,float('inf')]

    return [edges_for(data[name]) for name in cats]

def genModes(data,cats):
    """Return the most frequent value of each column in ``cats``, in order.

    When several values tie, the first entry returned by ``Series.mode()``
    is the one that is kept.
    """
    return [data[name].mode().iloc[0] for name in cats]

def cleanNumeric(data,cats,buckets):
    """Replace each non-string column in ``cats`` with its bucket category.

    ``buckets[i]`` supplies the bin edges for ``cats[i]``. String-like columns
    are left untouched. ``data`` is modified in place and the same object is
    returned.
    """
    for position, name in enumerate(cats):
        column = data[name]
        if is_string_dtype(column):
            continue
        data[name] = pd.cut(
            column,
            bins=buckets[position],
            include_lowest=True,
            duplicates='drop',
        )
    return data

def cleanUnknown(data,cats,modes):
    """Swap every "?" placeholder in the listed columns for a fallback value.

    ``modes[i]`` is the replacement used for column ``cats[i]``. ``data`` is
    modified in place and the same object is returned.
    """
    for position, name in enumerate(cats):
        data[name] = data[name].replace("?",modes[position])
    return data
        

In [162]:
#Preprocessing for Training Data
trainData = pd.read_csv('Data/train_final.csv')
labels = trainData.columns
# Bin edges and modes are derived from the training data only and are reused
# unchanged for the test data below.
buckets = genNumBuckets(trainData,labels)
trainData = cleanNumeric(trainData,labels[:-1],buckets)
modes = genModes(trainData,labels)
trainData = cleanUnknown(trainData,labels[:-1],modes)
trainData = pd.get_dummies(trainData)
# Every column except the target (the last column of the raw file) is a
# feature; select by name rather than relying on the target's position.
dummyLabels = trainData.columns.drop(labels[-1])

#Preprocessing for Test Data
testData = pd.read_csv('Data/test_final.csv')
testData = testData.drop("ID",axis=1)
testData = cleanNumeric(testData,labels[:-1],buckets)
testData = cleanUnknown(testData,labels[:-1],modes)
testData = pd.get_dummies(testData)
# get_dummies only creates columns for values present in the frame it is
# given, so a category missing from the test file would leave testData without
# that column and make every later `[dummyLabels]` selection raise KeyError.
# Align to the training feature columns: missing indicators become 0 and
# columns never seen in training are dropped.
testData = testData.reindex(columns=dummyLabels, fill_value=0)

In [171]:
#Decision Tree Classifier
from sklearn import tree

# Fixed seed so the fitted tree, the reported accuracy and the submission file
# are the same on every run.
clf = tree.DecisionTreeClassifier(max_depth = 8, random_state = 0)
clf = clf.fit(trainData[dummyLabels],trainData[labels[-1]])

# Training accuracy in one vectorised call. This replaces a per-row Python
# loop that also read the target with the positional `row[0]`, a lookup that
# is deprecated for label-indexed Series.
print(clf.score(trainData[dummyLabels],trainData[labels[-1]]))

# Predict all test rows at once. IDs are the 1-based row positions because the
# ID column was dropped during preprocessing.
testPredictions = clf.predict(testData[dummyLabels])
with open('submissions/treeSub.csv', 'w') as f:
    f.write("ID,Prediction\n")
    for rowId, prediction in enumerate(testPredictions, start=1):
        f.write(str(rowId) + "," + str(prediction) + "\n")

0.83984


In [172]:
#Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier

# NOTE(review): with the default learning_rate='optimal' scikit-learn does not
# use eta0, so the value below has no effect as written. Pick 'constant',
# 'invscaling' or 'adaptive' if this step size is meant to apply -- confirm.
# Fixed seed so shuffling, and therefore the fitted model, is reproducible.
clf = SGDClassifier(eta0 = 0.5**11,max_iter=2**30,random_state=0)
clf.fit(trainData[dummyLabels],trainData[labels[-1]])

# Training accuracy in one vectorised call. This replaces a per-row Python
# loop that also read the target with the positional `row[0]`, a lookup that
# is deprecated for label-indexed Series.
print(clf.score(trainData[dummyLabels],trainData[labels[-1]]))

# Predict all test rows at once. IDs are the 1-based row positions because the
# ID column was dropped during preprocessing.
testPredictions = clf.predict(testData[dummyLabels])
with open('submissions/SGDSub.csv', 'w') as f:
    f.write("ID,Prediction\n")
    for rowId, prediction in enumerate(testPredictions, start=1):
        f.write(str(rowId) + "," + str(prediction) + "\n")

0.83692


In [181]:
#Multi-Layer Perceptron
from sklearn.neural_network import MLPClassifier

# Fixed seed so weight initialisation, and therefore the fitted network, is
# reproducible across runs.
clf = MLPClassifier(activation = 'logistic', solver='adam', alpha=1e-5, hidden_layer_sizes=(30,30,30),max_iter = 1000, random_state = 0)
clf.fit(trainData[dummyLabels],trainData[labels[-1]])
print("Classifier Fitting Done")

# Training accuracy in one vectorised call. This replaces a per-row Python
# loop that also read the target with the positional `row[0]`, a lookup that
# is deprecated for label-indexed Series.
print(clf.score(trainData[dummyLabels],trainData[labels[-1]]))

# Predict all test rows at once. IDs are the 1-based row positions because the
# ID column was dropped during preprocessing.
testPredictions = clf.predict(testData[dummyLabels])
with open('submissions/MLPSub.csv', 'w') as f:
    f.write("ID,Prediction\n")
    for rowId, prediction in enumerate(testPredictions, start=1):
        f.write(str(rowId) + "," + str(prediction) + "\n")

Classifier Fitting Done
0.8588
