In [1]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
def E(y, predictions):
    """Return the misclassification rate between labels and predictions.

    Parameters
    ----------
    y : array-like
        Reference labels.
    predictions : array-like
        Predicted labels, compared element-wise against ``y``.

    Returns
    -------
    The number of positions where ``y`` and ``predictions`` differ,
    divided by ``len(y)``.
    """
    mismatches = np.not_equal(y, predictions)
    n_wrong = np.sum(mismatches)
    n_total = len(y)
    return n_wrong / n_total

In [3]:
# Read the training text file, skipping its first line
# (presumably a header row -- confirm against the data file).
Data_train = np.loadtxt('training_data.txt', skiprows=1)

# Use at most the first 20000 rows: column 0 becomes the label vector,
# every remaining column becomes a feature. Both are views of Data_train.
y_train = Data_train[:20000, 0]
X_train = Data_train[:20000, 1:]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (20000, 1000)
y_train shape: (20000,)


In [None]:
# Normalize training data: standardize every feature column of X_train to
# zero mean and unit standard deviation. The update is done in place, so the
# array X_train refers to is modified rather than replaced.
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0)

# A constant column has std == 0; dividing by it would fill that column with
# NaN/inf. Divide by 1 instead so constant columns simply become all zeros.
feature_std[feature_std == 0] = 1.0

X_train -= feature_mean
X_train /= feature_std
    

In [None]:
# Hyper-parameter search: try an RBF kernel over (gamma, C) and a linear
# kernel over C, ranking candidates by cross-validated accuracy.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [0.1, 1, 10]},
                    {'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10]}]
clf = GridSearchCV(SVC(), tuned_parameters, scoring = 'accuracy')
clf.fit(X_train, y_train)

# The fitted search object is `clf`; the previously used name `gridsearch`
# was never defined and raised NameError.
print(clf.best_score_)
print(clf.best_params_)

In [5]:
# 5-fold cross validation of a linear SVM.
# This is meant as a quick check: enable the truncation below instead of
# running it on the full training set.
# NOTE(review): if X_train was standardized beforehand using all rows, the
# validation folds have influenced the scaling -- confirm this is acceptable.

clf = SVC(kernel="linear", C=1)
# Fixed seed so the shuffled folds, and therefore the errors printed below,
# are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold in-sample and validation error rates.
E_in = []
E_val = []

# Truncate the dataset to see faster results
# X_train = X_train[:3000]
# y_train = y_train[:3000]

for train_indices, val_indices in kf.split(X_train):
    print("TRAIN:", len(train_indices), "VALIDATION:", len(val_indices))
    
    # X_in and X_val are subsets of X_train
    X_in = X_train[train_indices]
    X_val = X_train[val_indices]
    y_in = y_train[train_indices]
    y_val = y_train[val_indices]
    
    # Fit on the training part of this fold only
    clf.fit(X_in, y_in)
    
    # Compute errors on the data the model saw (e_in) and on the held-out fold (e_val)
    e_in = E(y_in, clf.predict(X_in))
    e_val = E(y_val, clf.predict(X_val))
    E_in.append(e_in)
    E_val.append(e_val)
    print("e_in =", e_in)
    print("e_val =", e_val)

TRAIN: 2400 VALIDATION: 600
e_in = 0.0
e_val = 0.226666666667
TRAIN: 2400 VALIDATION: 600
e_in = 0.0
e_val = 0.211666666667
TRAIN: 2400 VALIDATION: 600
e_in = 0.0
e_val = 0.216666666667
TRAIN: 2400 VALIDATION: 600
e_in = 0.0
e_val = 0.233333333333
TRAIN: 2400 VALIDATION: 600
e_in = 0.0
e_val = 0.211666666667


In [None]:
# Actual training and making predictions
X_test = np.loadtxt('test_data.txt', skiprows = 1)

# The model must see training and test features on the same scale. X_train
# may already have been standardized in place, which makes the original
# per-column statistics unrecoverable from it, so re-read the raw training
# rows and derive the statistics here.
train_raw = np.loadtxt('training_data.txt', skiprows = 1)[0:20000]
y_fit = train_raw[:, 0]
X_fit = train_raw[:, 1:]

feat_mean = X_fit.mean(axis=0)
feat_std = X_fit.std(axis=0)
# Constant columns have std == 0; divide by 1 so they become zeros, not NaN.
feat_std[feat_std == 0] = 1.0

# Apply the *training* statistics to both sets (never statistics of the test set).
X_fit -= feat_mean
X_fit /= feat_std
X_test -= feat_mean
X_test /= feat_std

clf = SVC(kernel="linear", C=0.01, verbose=1)
clf.fit(X_fit, y_fit)
pred = clf.predict(X_test)

# Submission rows: a header line, then 1-based row id and integer prediction.
submission = [['Id', 'Prediction']]
submission.extend([row_id, int(label)] for row_id, label in enumerate(pred, start=1))
with open('submission.csv', 'w') as f:
    f.writelines(','.join(map(str, line)) + '\n' for line in submission)