# Imports and setup

In [1]:
import pandas as pd 
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from scipy.stats import randint as sp_randint
from sklearn.svm import SVC

# Utility functions

In [2]:
def create_submission_file(predictions, model_name):
    num_preds = len(predictions)
    
    if num_preds != 3926:
        print("Number of predictions not equal to number of test observations!")
    
    pd.DataFrame({
        "Prediction" : predictions,
        "Id" : range(1, num_preds + 1)
    }).to_csv(model_name + ".csv", index=False)
    
def accuracy(predictions, truth):
    return np.mean(predictions == truth)

# Load the data and split into training and validation sets

In [3]:
# note: these datasets are slightly modified from the provided data
# the train set includes the "activity" column, and both datasets have column names
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# FOR THE SAKE OF THE WORKSHOP: USE A SMALLER TRAINING SET SO THINGS RUN FASTER
# FOR COMPETITION ENTRIES, REMOVE THIS SO YOU USE ALL THE TRAINING DATA
train = train.loc[1:500, :]

y = train["activity"]
X = train.iloc[:, train.columns != "activity"]

# could split the data manually if you wanted:

# msk = np.random.rand(len(train)) < 0.8

# X_train = X[msk]
# y_train = y[msk]
# X_val = X[~msk]
# y_val = y[~msk]

# or use provided code
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Train models with different hyperparameters on the training set and evaluate on the validation set

In [4]:
for k in [5, 10, 15]:
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    
    knn.fit(X_train, y_train)
    preds = knn.predict(X_val)
    
    report = "k: " + str(k) + " gives validation accuracy: " + str(accuracy(preds, y_val))
    print(report)

k: 5 gives validation accuracy: 0.83
k: 10 gives validation accuracy: 0.78
k: 15 gives validation accuracy: 0.83


We see that the best value of K is 5. In practice, we want to do this for a bunch of train/validation splits, so we use cross-validation. In practice, it's also better to search for the best k by picking k randomly. Here's some code that automatically searches for the best k using cross validation.

In [5]:
# pick k randomly in the range [1, 30]
# note that for the competition there are a number of other useful hyperparameters worth exploring
# in particular the precise metric, and whether votes are weighed by distance
param_dist = {
    'n_neighbors': sp_randint(1, 31),
}

clf = neighbors.KNeighborsClassifier()

# now try 10 random k and save the best
clf_rs = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=10, n_jobs = -1)

# now clf_rs is just a KNN model with the best value of k automatically selected
# give the randomized search the entire training set, it'll do cross validation internally
clf_rs.fit(X, y)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000F647DEA908>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

See the results of the cross-validated search for k. Also assess performance on the validation set.

In [6]:
pd.DataFrame(clf_rs.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.016001,0.081339,0.766,0.892998,19,{'n_neighbors': 19},4,0.778443,0.882883,0.784431,0.900901,0.73494,0.89521,9.798072e-07,0.013598,0.022033,0.00752
1,0.022669,0.094673,0.762,0.900997,14,{'n_neighbors': 14},7,0.802395,0.906907,0.772455,0.891892,0.710843,0.904192,0.003772031,0.001886,0.038084,0.006533
2,0.019677,0.095047,0.774,0.871992,22,{'n_neighbors': 22},2,0.796407,0.846847,0.802395,0.888889,0.722892,0.88024,0.0004690441,0.002152,0.036114,0.018127
3,0.021682,0.09807,0.764,0.894994,16,{'n_neighbors': 16},5,0.802395,0.897898,0.772455,0.885886,0.716867,0.901198,0.00449952,0.00572,0.035409,0.00658
4,0.020348,0.0934,0.758,0.918997,11,{'n_neighbors': 11},8,0.772455,0.933934,0.784431,0.900901,0.716867,0.922156,0.0009435275,0.002056,0.029408,0.013669
5,0.024017,0.098069,0.78,0.877983,25,{'n_neighbors': 25},1,0.772455,0.864865,0.808383,0.873874,0.759036,0.89521,0.005103061,0.003562,0.020833,0.012724
6,0.020681,0.09707,0.772,0.947001,5,{'n_neighbors': 5},3,0.820359,0.945946,0.778443,0.948949,0.716867,0.946108,0.0004708084,0.002948,0.042475,0.001379
7,0.020681,0.095735,0.75,0.913995,12,{'n_neighbors': 12},9,0.802395,0.921922,0.754491,0.900901,0.692771,0.919162,0.0004718778,0.002496,0.044844,0.009327
8,0.024351,0.100738,0.764,0.894994,16,{'n_neighbors': 16},5,0.802395,0.897898,0.772455,0.885886,0.716867,0.901198,0.006132872,0.004646,0.035409,0.00658
9,0.020681,0.092402,0.738,0.931006,4,{'n_neighbors': 4},10,0.808383,0.927928,0.712575,0.93994,0.692771,0.92515,0.0009429656,0.003095,0.050494,0.006418


Finally, predict on the text set and save the 

In [7]:
preds = clf_rs.predict(test)
create_submission_file(preds, "knn_tuned_predictions")

# Some sample code for grid search on a support vector machine for classification

In [8]:
# in practice you will want to try much broader ranges of hyperparameters than this
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

svm_clf = GridSearchCV(SVC(), tuned_parameters)
svm_clf.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Again we check how well we did

In [10]:
pd.DataFrame(svm_clf.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_gamma,param_kernel,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.09013,0.040859,0.658,0.776013,1,0.001,rbf,"{'kernel': 'rbf', 'C': 1, 'gamma': 0.001}",11,0.682635,0.780781,0.700599,0.783784,0.590361,0.763473,0.015493,0.010837,0.048246,0.008951
1,0.134634,0.038352,0.378,0.379997,1,0.0001,rbf,"{'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}",12,0.383234,0.378378,0.383234,0.378378,0.36747,0.383234,0.007092,0.005303,0.007424,0.002289
2,0.061332,0.037352,0.9,0.992007,10,0.001,rbf,"{'kernel': 'rbf', 'C': 10, 'gamma': 0.001}",9,0.856287,0.996997,0.964072,0.993994,0.879518,0.98503,0.003768,0.005017,0.046353,0.005084
3,0.072917,0.048084,0.688,0.813002,10,0.0001,rbf,"{'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}",10,0.688623,0.81982,0.754491,0.807808,0.620482,0.811377,0.007365,0.001724,0.054683,0.005037
4,0.063181,0.035648,0.908,1.0,100,0.001,rbf,"{'kernel': 'rbf', 'C': 100, 'gamma': 0.001}",1,0.868263,1.0,0.934132,1.0,0.921687,1.0,0.006948,0.006724,0.028595,0.0
5,0.062504,0.026046,0.902,0.991006,100,0.0001,rbf,"{'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}",8,0.856287,0.993994,0.964072,0.993994,0.885542,0.98503,2e-06,0.007372,0.045549,0.004226
6,0.062501,0.031244,0.908,1.0,1000,0.001,rbf,"{'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}",1,0.868263,1.0,0.934132,1.0,0.921687,1.0,1.5e-05,8e-06,0.028595,0.0
7,0.062716,0.024967,0.908,1.0,1000,0.0001,rbf,"{'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}",1,0.862275,1.0,0.934132,1.0,0.927711,1.0,0.014239,0.006737,0.032486,0.0
8,0.055084,0.02547,0.908,1.0,1,,linear,"{'kernel': 'linear', 'C': 1}",1,0.862275,1.0,0.934132,1.0,0.927711,1.0,0.005964,0.006996,0.032486,0.0
9,0.060589,0.037033,0.908,1.0,10,,linear,"{'kernel': 'linear', 'C': 10}",1,0.862275,1.0,0.934132,1.0,0.927711,1.0,0.008241,0.005193,0.032486,0.0


In [11]:
preds = svm_clf.predict(test)
create_submission_file(preds, "svm_tuned_predictions")