# Imports and setup

In [10]:
import pandas as pd 
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from scipy.stats import randint as sp_randint
from sklearn.svm import SVC

# Utility functions

In [2]:
def create_submission_file(predictions, model_name):
    num_preds = len(predictions)
    
    if num_preds != 3926:
        print("Number of predictions not equal to number of test observations!")
    
    pd.DataFrame({
        "Prediction" : predictions,
        "Id" : range(1, num_preds + 1)
    }).to_csv(model_name + ".csv", index=False)
    
def accuracy(predictions, truth):
    return np.mean(predictions == truth)

# Load the data and split into training and validation sets

In [4]:
# note: these datasets are slightly modified from the provided data
# the train set includes the "activity" column, and both datasets have column names
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# FOR THE SAKE OF THE WORKSHOP: USE A SMALLER TRAINING SET SO THINGS RUN FASTER
# FOR COMPETITION ENTRIES, REMOVE THIS SO YOU USE ALL THE TRAINING DATA
train = train.loc[1:500, :]

y = train["activity"]
X = train.iloc[:, train.columns != "activity"]

# could split the data manually if you wanted:

# msk = np.random.rand(len(train)) < 0.8

# X_train = X[msk]
# y_train = y[msk]
# X_val = X[~msk]
# y_val = y[~msk]

# or use provided code
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Train models with different hyperparameters on the training set and evaluate on the validation set

In [5]:
for k in [5, 10, 15]:
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    
    knn.fit(X_train, y_train)
    preds = knn.predict(X_val)
    
    report = "k: " + str(k) + " gives validation accuracy: " + str(accuracy(preds, y_val))
    print(report)

k: 5 gives validation accuracy: 0.89
k: 10 gives validation accuracy: 0.89
k: 15 gives validation accuracy: 0.89


We see that the best value of K is 5. In practice, we want to do this for a bunch of train/validation splits, so we use cross-validation. In practice, it's also better to search for the best k by picking k randomly. Here's some code that automatically searches for the best k using cross validation.

In [6]:
# pick k randomly in the range [1, 30]
param_dist = {
    'n_neighbors': sp_randint(1, 31),
}

clf = neighbors.KNeighborsClassifier()

# now try 10 random k and save the best
clf_rs = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=10, n_jobs = -1)

# now clf_rs is just a KNN model with the best value of k automatically selected
# give the randomized search the entire training set, it'll do cross validation internally
clf_rs.fit(X, y)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000001F9692ACF8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

See the results of the cross-validated search for k. Also assess performance on the validation set.

In [7]:
print(pd.DataFrame(clf_rs.cv_results_))

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0       0.016674         0.087377            0.758          0.913992   
1       0.019346         0.098404            0.700          0.931006   
2       0.020348         0.100404            0.780          0.877983   
3       0.018680         0.096736            0.774          0.871992   
4       0.021015         0.093401            0.758          0.918997   
5       0.025017         0.097403            0.768          0.878987   
6       0.021014         0.100405            0.776          0.875972   
7       0.020348         0.105075            0.772          0.869966   
8       0.020681         0.097069            0.762          0.900997   
9       0.020014         0.090398            0.764          0.908996   

  param_n_neighbors               params  rank_test_score  split0_test_score  \
0                10  {'n_neighbors': 10}                8           0.796407   
1                 2   {'n_neighbors': 2}       

Finally, predict on the text set and save the 

In [8]:
preds = clf_rs.predict(test)
create_submission_file(preds, "knn_tuned_predictions")

# Some sample code for grid search on a support vector machine for classification

In [11]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

svm_clf = GridSearchCV(SVC(), tuned_parameters)
svm_clf.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [None]:
Again we check how well we did

In [12]:
print(pd.DataFrame(svm_clf.cv_results_))

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score param_C  \
0        0.081696         0.037323            0.658          0.776013       1   
1        0.125740         0.038011            0.378          0.379997       1   
2        0.059991         0.033343            0.900          0.992007      10   
3        0.076012         0.036001            0.688          0.813002      10   
4        0.058671         0.030663            0.908          1.000000     100   
5        0.056017         0.030663            0.902          0.991006     100   
6        0.077333         0.039996            0.908          1.000000    1000   
7        0.077328         0.038676            0.908          1.000000    1000   
8        0.057343         0.026668            0.908          1.000000       1   
9        0.057348         0.032002            0.908          1.000000      10   
10       0.069339         0.036000            0.908          1.000000     100   
11       0.051996         0.

In [14]:
preds = svm_clf.predict(test)
create_submission_file(preds, "svm_tuned_predictions")