In [5]:
import numpy as np
from sklearn.linear_model import SGDClassifier 

# CONSTANTS
LOSSES = ["hinge", "log", "modified_huber"]
PENALTIES = ["l2", "l1", "elasticnet"]
TRAINING = 3351
RUNS = 3000

data = np.loadtxt("training_data.txt", delimiter="|", skiprows=1)
trainingX = data[0:TRAINING, 0:-1]
trainingY = data[0:TRAINING, -1]

testX = data[TRAINING:, 0:-1]
testY = data[TRAINING:, -1]

# Error function as a fraction
def erf(a1, a2):
    total = 0
    for x in range(len(a1)):
        if a1[x] == a2[x]:
            total += 1
    return float(total)/len(a1)

# Loss and penalty terms
def res(l, p, runs):
    clf = SGDClassifier(loss=l, penalty=p)
    clf.fit(trainingX, trainingY)
    error = 0
    for x in range(runs):
        error += erf(clf.predict(testX), testY)
    return error/runs

# Prints testing results
def test(losses, penalties, runs):
    for loss in losses:
        for penalty in penalties:
            val = res(loss, penalty, runs)
            print "%s, %s: %0.5f" %(loss, penalty, val)

test(LOSSES, PENALTIES, RUNS)


hinge, l2: 0.63126
hinge, l1: 0.62172
hinge, elasticnet: 0.64678
log, l2: 0.61337
log, l1: 0.63842
log, elasticnet: 0.64320
modified_huber, l2: 0.57637
modified_huber, l1: 0.65155
modified_huber, elasticnet: 0.60143


In [None]:
# For testing for submission

test_data = np.loadtxt("testing_data.txt", delimiter="|", skiprows=1)

# Stochastic Gradient Descent
l, p = "modified_huber", "l2" # Optimal choices
clf_test = SGDClassifier(loss=l, penalty=p)
clf_test.fit(trainingX, trainingY)
prediction = clf.predict(test_data)

f = open("results.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(prediction)):
    f.write(str(x+1) + "," + str(int(prediction[x])) + "\n")
f.close()

In [46]:
from sklearn import svm
import numpy as np

# CONSTANTS
loss = ["hinge", "squared_hinge"]
penalty = ["l1", "l2"]
TRAINING = 3351
RUNS = 3000

data = np.loadtxt("training_data.txt", delimiter="|", skiprows=1)
trainingX = data[0:TRAINING, 0:-1]
trainingY = data[0:TRAINING, -1]

testX = data[TRAINING:, 0:-1]
testY = data[TRAINING:, -1]

test_data = np.loadtxt("testing_data.txt", delimiter="|", skiprows=1)

clf = svm.LinearSVC(dual=False, tol=1e-6, max_iter=1000, random_state=0)
clf.fit(trainingX, trainingY)
clf.score(testX, testY)

prediction = clf.predict(test_data)

f = open("svm.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(prediction)):
    f.write(str(x+1) + "," + str(int(prediction[x])) + "\n")
f.close()
