# Logistic Regression Bow Tie

- Descriptors asked for:
- Day of week
- day of month
- 'weather' (precipitation seems to have high correlation)
- shirt color
- phone battery

In [341]:
# read in our bow tie data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

In [342]:
# delete some data I collected which is not going to be necessary
# also fix up names of columns

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
#del df['Day_of_Month']
del df['Weather_Current']
#df['Bow_Tie'] = df['Bow_Tie'].replace({True: 'yes', False: 'no'})
df.head()

Unnamed: 0,Day_of_Month,Day_of_Week,Phone_Battery,Shirt_Color,Weather_High,Weather_Low,Wind_Speed_(12pm),Humidity_(12pm),Dew_Point,Wind_Speed_(1am),Max_Wind_Speed,Precipitation,Bow_Tie
0,17,Tuesday,65,Blue,46,32,15,79,35.21,13,20,0.17,True
1,19,Thursday,100,Other,42,29,26,82,36.87,0,30,0.57,True
2,24,Tuesday,86,Blue,40,32,6,76,30.04,9,9,0.0,True
3,26,Thursday,87,Grey,25,17,24,84,20.33,17,24,0.2,True
4,31,Tuesday,43,Grey,21,10,7,45,1.67,14,14,0.0,False


In [343]:
# find the correlation for numerical features

numerical = ['Phone_Battery','Weather_High', 'Weather_Low', 'Dew_Point',
             'Max_Wind_Speed', 'Precipitation','Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
df[numerical].corrwith(df.Bow_Tie).to_frame('correlation')

Unnamed: 0,correlation
Phone_Battery,0.211135
Weather_High,-0.296449
Weather_Low,-0.134092
Dew_Point,0.104944
Max_Wind_Speed,0.170177
Precipitation,0.521368
Humidity_(12pm),0.464759
Day_of_Month,0.254133
Wind_Speed_(1am),0.059134
Wind_Speed_(12pm),0.334067


In [191]:
# find the mutual information for the categorical features

categorical = ['Day_of_Week','Shirt_Color']

from sklearn.metrics import mutual_info_score
# it can be useful to define a function for this
def calculate_mi(series):
    return mutual_info_score(series, df.Bow_Tie)

# we can enact our function on a subset of data with '.apply(calculate_mi)'
df_mi = df[categorical].apply(calculate_mi)
#df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

df_mi

Day_of_Week    0.086305
Shirt_Color    0.001487
dtype: float64

In [192]:
# now we use the above to choose which features to keep...
# looks like wind speed at 1am has very little correlation
# dew point does not seem very strong either
# surprisingly, shirt color seems to have lower mutual information, at least compared to day of week
# also, weather low is basically the weather high data, let's get rid of that
# we can probably remove max wind speed, since wind speed at 12pm seems much better

del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

In [193]:
# Since our dataset is pretty short, let's just do a single split in the data
# split into training and test datasets
df_train, df_test = train_test_split(df, random_state=30,test_size=0.50, shuffle=True)

df_train_full = df_train.copy()
# we now need to now split df_train_full into separate train and validation
#df_train, df_val = train_test_split(df_train_full, random_state=100,test_size=0.25, shuffle=True)
#df_val.index

# move target values out
y_train = df_train.Bow_Tie.values
#y_val = df_val.Bow_Tie.values
y_test = df_test.Bow_Tie.values

y_all = df.Bow_Tie.values

del df_train['Bow_Tie']
#del df_val['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Weather_High', 
        'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Precipitation','Humidity_(12pm)','Day_of_Month','Wind_Speed_(12pm)']
categorical = ['Day_of_Week']

In [194]:
# one-hot encode

full_dict = df[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(full_dict)
X_full = dv.transform(full_dict)

feature_names = dv.get_feature_names_out()

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

feature_names

array(['Day_of_Month', 'Day_of_Week=Thursday', 'Day_of_Week=Tuesday',
       'Humidity_(12pm)', 'Phone_Battery', 'Precipitation',
       'Weather_High', 'Wind_Speed_(12pm)'], dtype=object)

In [195]:
# let's start with a logistic regression model to predict bow tie data
# train the model on our training dataset
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

# here are the weights of each feature
print(model.coef_)
print(feature_names)

[[ 0.33259916  0.77890486 -0.89856782  0.0157241   0.07066073  0.03816934
  -0.28311994  0.18713519]]
['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation' 'Weather_High'
 'Wind_Speed_(12pm)']


In [196]:
# soft predictions

y_pred = model.predict_proba(X_test)[:, 1]
model.predict_proba(X_test)

array([[6.19563380e-01, 3.80436620e-01],
       [4.13200055e-01, 5.86799945e-01],
       [9.98101150e-01, 1.89885009e-03],
       [6.02439024e-02, 9.39756098e-01],
       [9.99967272e-01, 3.27278974e-05],
       [4.35091935e-01, 5.64908065e-01],
       [6.52295912e-01, 3.47704088e-01],
       [9.61781837e-01, 3.82181631e-02],
       [9.99988293e-01, 1.17068909e-05],
       [2.24195784e-04, 9.99775804e-01],
       [3.86706786e-04, 9.99613293e-01],
       [9.99991876e-01, 8.12370385e-06],
       [5.62298178e-01, 4.37701822e-01],
       [9.34281167e-03, 9.90657188e-01],
       [2.41039425e-04, 9.99758961e-01]])

In [197]:
# Store the predictions of whether or not we expect bow tie
# meaning that the number in y_pred is > 0.5, tending towards 1 rather than 0

bow_tie = (y_pred >= 0.5)

# and let's see how our accuracy is
from sklearn.metrics import accuracy_score
# and also the auc
from sklearn.metrics import roc_auc_score

print('accuracy score: ', accuracy_score(y_test, bow_tie))
print('auc score: ', roc_auc_score(y_test, bow_tie))

accuracy score:  0.6666666666666666
auc score:  0.6666666666666667


- Not the best scores ever, the scores seem to vary a lot with the random seed that I use to shuffle
- I need to be careful about my test train split since we have so few data points. Sometimes my split (by either size or random seed) causes one of the datasets to not contain one of the categorical outcomes (like all shirt colors).

# Forest Bow Tie

In [198]:
from sklearn.ensemble import RandomForestClassifier

In [199]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
del df['Weather_Current']

# delete descriptors with low correlation

#del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

# split data using sklearn

# now assign the train and test data
df_train, df_test = train_test_split(df, random_state=107,test_size=0.30, shuffle=True)

# move target values out
y_train = df_train.Bow_Tie.values
y_test = df_test.Bow_Tie.values

del df_train['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Shirt_Color', 'Weather_High', 'Weather_Low', 
       'Dew_Point', 'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Wind_Speed_(12pm)',
            'Precipitation','Humidity_(12pm)','Day_of_Month']
categorical = ['Day_of_Week','Shirt_Color']

# one hot encode

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

In [200]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(n_estimators = 10,random_state=7)

parameters = {'max_depth':[1,2,3,4,5,6,7,8,9,10,11,15,20,25,30,40], 
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9]}

rf_classifier = GridSearchCV(rf,parameters,scoring='roc_auc',cv=6)
rf_classifier.fit(X_train,y_train)
print(rf_classifier.best_params_)
print(rf_classifier.best_score_)

{'max_depth': 1, 'min_samples_leaf': 2}
0.75


In [201]:
y_pred = rf_classifier.predict_proba(X_test)[:,1]
roc_auc_score(y_test,y_pred)

1.0

## after trying some different random states, the AUC results seem to rely a lot on how things were shuffled (one random state gave me AUC = 0.4, another gave me AUC = 0.6). The dataset may be a little small for accuracte predictions. In the most recent attempt above, I got an AUC of 1.0 but I am not sure I can trust this. I set the test dataset to be 30 percent of the original dataset, which means it contains only 10 entries. With the random state I chose, it is possible I simply just got lucky with the match.

## another issue I run into is based on the test train split. if I make the testing dataset too small, then the random shuffle can actually cause the test dataset to not have some of the descriptors in it. This happens if the testing dataset does not have every categorical outcome for the categorical descriptors, so one-hot encoding makes fewer new descriptors based on those outcomes. Another limitation of the small dataset we are working with.

# NEXT, try SVM, knn, single decision tree

# SVM Bow Tie

In [202]:
# first read in our dataset and one-hot encode it to prepare to building model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
del df['Weather_Current']

# delete descriptors with low correlation

#del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

# split data using sklearn

# now assign the train and test data
df_train, df_test = train_test_split(df, random_state=107,test_size=0.50, shuffle=True)

# move target values out
y_train = df_train.Bow_Tie.values
y_test = df_test.Bow_Tie.values

del df_train['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Shirt_Color', 'Weather_High', 'Weather_Low', 
       'Dew_Point', 'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Wind_Speed_(12pm)',
            'Precipitation','Humidity_(12pm)','Day_of_Month']
categorical = ['Day_of_Week','Shirt_Color']

# one hot encode

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

In [203]:
# and here we go with the svm model
# we start with a polynomial kernel
# let grid search do our cross validation, find the polynomial degree

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

parameters={'degree':[1,2,3,4,5,6]}
clf = SVC(kernel = 'poly', gamma='auto')
svm_clf = GridSearchCV(clf,parameters,scoring='roc_auc',cv=5)
svm_clf.fit(X_train, y_train)
print(svm_clf.best_params_)
print(svm_clf.best_score_)

{'degree': 2}
0.7


In [204]:
prediction_test = svm_clf.predict(X_test)

from sklearn.metrics import roc_auc_score

print('auc score: ', roc_auc_score(y_test,prediction_test))
print('accuracy: ', accuracy_score(y_test,prediction_test))

auc score:  0.7410714285714286
accuracy:  0.7333333333333333


- not too bad of values

# Decision Tree Bow Tie

In [205]:
# first read in our dataset and one-hot encode it to prepare to building model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
del df['Weather_Current']

# delete descriptors with low correlation

#del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

# split data using sklearn

# now assign the train and test data
df_train, df_test = train_test_split(df, random_state=60,test_size=0.3, shuffle=True)

# move target values out
y_train = df_train.Bow_Tie.values
y_test = df_test.Bow_Tie.values

del df_train['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Shirt_Color', 'Weather_High', 'Weather_Low', 
       'Dew_Point', 'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Wind_Speed_(12pm)',
             'Precipitation','Humidity_(12pm)','Day_of_Month']
categorical = ['Day_of_Week','Shirt_Color']

# one hot encode

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

In [206]:
# and now here we do our single decision tree, classifier version
# find depth and number of leaf, using grid search

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

classifier = DecisionTreeClassifier(random_state=0)
parameters = {'max_depth':[1,2,3,4,5,6,7,8,9,10,11,15,20], 
              'min_samples_leaf':[1,2,3,4,5,6,7,8,9,10,12,14,16,20,25]}

decision_classifier = GridSearchCV(classifier,parameters,scoring='roc_auc',cv=5)
decision_classifier.fit(X_train,y_train)
print(decision_classifier.best_params_)
print(decision_classifier.best_score_)

{'max_depth': 2, 'min_samples_leaf': 5}
0.5833333333333334


In [207]:
from sklearn.metrics import roc_auc_score

y_pred = decision_classifier.predict_proba(X_test)[:,1]
print('test:', roc_auc_score(y_test, y_pred))

test: 0.8


## once again I have trouble getting a consistently high AUC value. Really fluctuates based on test train split and the random state shuffling. In the above I got a relatively low AUC for the trained model, but then when used on the test dataset I get a decent AUC. At least this seems to suggest I am not overfitted, but with a small dataset where random state seems very important, I am not so sure

# K Neighbor Classifier for Bow Tie

In [323]:
# first read in our dataset and one-hot encode it to prepare to building model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
del df['Weather_Current']

# delete descriptors with low correlation

#del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

# split data using sklearn

# now assign the train and test data
df_train, df_test = train_test_split(df, random_state=5,test_size=0.3, shuffle=True)

# move target values out
y_train = df_train.Bow_Tie.values
y_test = df_test.Bow_Tie.values

del df_train['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Shirt_Color', 'Weather_High', 'Weather_Low', 
       'Dew_Point', 'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Wind_Speed_(12pm)',
             'Precipitation','Humidity_(12pm)','Day_of_Month']
categorical = ['Day_of_Week','Shirt_Color']

# one hot encode

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

print(dv.get_feature_names_out())

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

print(dv.get_feature_names_out())

['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation' 'Shirt_Color=Blue'
 'Shirt_Color=Grey' 'Shirt_Color=Other' 'Weather_High' 'Wind_Speed_(12pm)']
['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation' 'Shirt_Color=Blue'
 'Shirt_Color=Grey' 'Shirt_Color=Other' 'Weather_High' 'Wind_Speed_(12pm)']


In [324]:
# here we do our classifier
# use gridsearchcv for finding the best number of nearest neighbors, so we also don't overfit

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn=KNeighborsClassifier()
parameters={'n_neighbors':[1,2,3,4,5,6,7]}
knn_classifier=GridSearchCV(knn,parameters,scoring='roc_auc',cv=5)
knn_classifier.fit(X_train,y_train)
print(knn_classifier.best_params_)
print(knn_classifier.best_score_)

{'n_neighbors': 7}
0.9166666666666666


In [325]:
# let's get our prediction for the test dataset
from sklearn.metrics import roc_auc_score

prediction_knn = knn_classifier.predict(X_test)
print('auc score:', roc_auc_score(y_test, prediction_knn))

auc score: 0.8333333333333333


## Knn seems to have given me decent scores, but they still rely a lot on random state. But they seem maybe consistently higher than the other methods I have tried

- let's try changing precipitation to a categorical, either there was some or there is none
- This may help with the final prediction which is a percent chance for the day

In [375]:
# first read in our dataset and one-hot encode it to prepare to building model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

df=pd.read_csv('./Bow_Tie_Machine_Learning.csv')

df.columns = df.columns.str.replace(' ', '_')
del df['Date']
del df['Month']
del df['Weather_Current']

# delete descriptors with low correlation

#del df['Shirt_Color']
del df['Wind_Speed_(1am)']
del df['Max_Wind_Speed']
del df['Dew_Point']
del df['Weather_Low']

# now let's change precipitation into a categorical

df['Precipitation']=(df.Precipitation>0.0)
df['Precipitation'] = df['Precipitation'].replace({True: 'yes', False: 'no'})

# split data using sklearn

# now assign the train and test data
df_train, df_test = train_test_split(df, random_state=5,test_size=0.3, shuffle=True)

# move target values out
y_train = df_train.Bow_Tie.values
y_test = df_test.Bow_Tie.values

del df_train['Bow_Tie']
del df_test['Bow_Tie']

base = ['Day_of_Week', 'Phone_Battery', 'Shirt_Color', 'Weather_High', 'Weather_Low', 
       'Dew_Point', 'Precipitation', 'Humidity_(12pm)','Day_of_Month','Wind_Speed_(1am)','Wind_Speed_(12pm)']
numerical = ['Phone_Battery','Weather_High', 'Wind_Speed_(12pm)',
             'Humidity_(12pm)','Day_of_Month']
categorical = ['Day_of_Week','Shirt_Color','Precipitation']

# one hot encode

train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

print(dv.get_feature_names_out())

test_dict = df_test[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(test_dict)
X_test = dv.transform(test_dict)

print(dv.get_feature_names_out())

['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation=no' 'Precipitation=yes'
 'Shirt_Color=Blue' 'Shirt_Color=Grey' 'Shirt_Color=Other' 'Weather_High'
 'Wind_Speed_(12pm)']
['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation=no' 'Precipitation=yes'
 'Shirt_Color=Blue' 'Shirt_Color=Grey' 'Shirt_Color=Other' 'Weather_High'
 'Wind_Speed_(12pm)']


In [376]:
# here we do our classifier
# use gridsearchcv for 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn=KNeighborsClassifier()
parameters={'n_neighbors':[1,2,3,4,5,6,7]}
knn_classifier=GridSearchCV(knn,parameters,scoring='roc_auc',cv=5)
knn_classifier.fit(X_train,y_train)
print(knn_classifier.best_params_)
print(knn_classifier.best_score_)

{'n_neighbors': 7}
0.9166666666666666


In [377]:
# let's get our prediction for the test dataset
from sklearn.metrics import roc_auc_score

prediction_knn = knn_classifier.predict(X_test)
print('auc score:', roc_auc_score(y_test, prediction_knn))

auc score: 0.8333333333333333


# ----------------------------------------------------------------------------

## So, we now need to choose a model to predict for the final day of class

## We are given...

- Day of Week: Tuesday
- Day of Month: 2
- Phone Battery: 86 percent
- Month: May
- Temperature Max: 65
- Humidity: 48
- Precipitation: 1 percent chance
- Wind: 11
- Shirt Color: Blue

## I will try my Knn model to do the prediction

In [378]:
# here are my descriptors and their order...

print(dv.get_feature_names_out())

['Day_of_Month' 'Day_of_Week=Thursday' 'Day_of_Week=Tuesday'
 'Humidity_(12pm)' 'Phone_Battery' 'Precipitation=no' 'Precipitation=yes'
 'Shirt_Color=Blue' 'Shirt_Color=Grey' 'Shirt_Color=Other' 'Weather_High'
 'Wind_Speed_(12pm)']


In [379]:
prediction_array = (np.array([2, 0, 1, 48, 86, 1, 0, 1, 0, 0, 65, 11])).reshape(1, -1)
prediction_array

array([[ 2,  0,  1, 48, 86,  1,  0,  1,  0,  0, 65, 11]])

In [380]:
print(knn_classifier.predict(prediction_array))

[False]


# So my prediction is no bow tie on last day of class