In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


labelled = pd.read_csv("monthly-data-labelled.csv")
unlabelled = pd.read_csv("monthly-data-unlabelled.csv")

In [2]:
X = labelled.drop('city',1).values
y = labelled['city'].values


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [4]:
bayes_model = make_pipeline(
        StandardScaler(),
        GaussianNB()
    )

In [5]:
bayes_model.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gaussiannb', GaussianNB(priors=None))])

In [6]:
bayes_model.score(X_test, y_test)

0.62068965517241381

In [65]:
knn_model = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=25)
    )

In [66]:
knn_model.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=25, p=2,
           weights='uniform'))])

In [67]:
knn_model.score(X_test, y_test)

0.68620689655172418

In [97]:
svc_model = make_pipeline(
        StandardScaler(),
        SVC(kernel='linear', C=2.0)
    )

In [98]:
svc_model.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=2.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [99]:
svc_model.score(X_test, y_test)

0.80000000000000004

In [108]:
inputsForPrediction = unlabelled.drop('city', 1).values

In [110]:
citiesPredicted = svc_model.predict(inputsForPrediction)

In [111]:
citiesPredicted

array(['Miami', 'Vancouver', 'Denver', 'Seattle', 'Atlantic City',
       'Raleigh Durham', 'Atlanta', 'Seattle', 'San Francisco', 'Chicago',
       'Edmonton', 'Los Angeles'], dtype=object)

In [112]:
df = pd.DataFrame({'truth': y_test, 'prediction': svc_model.predict(X_test)})
print(df[df['truth'] != df['prediction']])

         prediction           truth
1    Raleigh Durham         Atlanta
3          Edmonton         Calgary
6         Saskatoon          Regina
7            Ottawa        Montreal
17         Montreal          Ottawa
39           Québec        Montreal
45          Seattle       Vancouver
53        Vancouver        Victoria
58        Vancouver        Victoria
60          Chicago   Atlantic City
67         Winnipeg       Saskatoon
68           Regina       Saskatoon
78          Chicago         Toronto
82   Raleigh Durham         Atlanta
85         Montreal          Ottawa
90   Raleigh Durham         Atlanta
101          Ottawa        Montreal
103          London         Toronto
107     New Orleans         Atlanta
109          Regina        Edmonton
111          Regina       Saskatoon
112          Regina       Saskatoon
125          Regina       Saskatoon
126        Montreal          Ottawa
130         Seattle        Portland
134          Regina       Saskatoon
140          Ottawa        M