In [40]:
import numpy as np
from sklearn.linear_model import SGDClassifier 

# CONSTANTS
LOSSES = ["hinge", "log", "modified_huber"]
PENALTIES = ["l2", "l1", "elasticnet"]
RUNS = 2000

data = np.loadtxt("training_data.txt", delimiter="|", skiprows=1)
trainingX = data[0:3000, 0:-1]
trainingY = data[0:3000, -1]

testX = data[3000:, 0:-1]
testY = data[3000:, -1]

# Error function as a fraction
def erf(a1, a2):
    total = 0
    for x in range(len(a1)):
        if a1[x] == a2[x]:
            total += 1
    return float(total)/len(a1)

# Loss and penalty terms
def res(l, p, runs):
    clf = SGDClassifier(loss=l, penalty=p)
    clf.fit(trainingX, trainingY)
    error = 0
    for x in range(runs):
        error += erf(clf.predict(testX), testY)
    return error/runs

# Prints testing results
def test(losses, penalties, runs):
    for loss in losses:
        for penalty in penalties:
            val = res(loss, penalty, runs)
            print "%s, %s: %0.5f" %(loss, penalty, val)

test(LOSSES, PENALTIES, RUNS)


hinge, l2: 0.63499
hinge, l1: 0.64087
hinge, elasticnet: 0.61144
log, l2: 0.61312
log, l1: 0.62994
log, elasticnet: 0.61817
modified_huber, l2: 0.63162
modified_huber, l1: 0.61060
modified_huber, elasticnet: 0.62742


In [42]:
# For testing for submission

test_data = np.loadtxt("testing_data.txt", delimiter="|", skiprows=1)

# Stochastic Gradient Descent
l, p = "modified_huber", "l2" # Optimal choices
clf_test = SGDClassifier(loss=l, penalty=p)
clf_test.fit(trainingX, trainingY)
prediction = clf.predict(test_data)

f = open("results.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(prediction)):
    f.write(str(x+1) + "," + str(int(prediction[x])) + "\n")
f.close()