KNN model to predict CS access in New Hampshire Public High Schools

In [1]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
import os


pd.set_option('display.max_columns', None)

In [2]:
df_cs = pd.read_csv(
    Path('Resources/NH.csv')
)
df_cs.head(10)

Unnamed: 0,yr_19,yr_20,yr_21,yr_22
0,Y,Y,Y,Y
1,Y,Y,Y,Y
2,Y,Y,Y,Y
3,Y,Y,Y,Y
4,Y,Y,Y,Y
5,Y,Y,Y,Y
6,N,N,N,N
7,Y,Y,Y,Y
8,Y,Y,Y,Y
9,Y,N,N,N


In [3]:
y = df_cs['yr_22']
X = df_cs.drop(columns='yr_22')


X[:10]


Unnamed: 0,yr_19,yr_20,yr_21
0,Y,Y,Y
1,Y,Y,Y
2,Y,Y,Y
3,Y,Y,Y
4,Y,Y,Y
5,Y,Y,Y
6,N,N,N
7,Y,Y,Y
8,Y,Y,Y
9,Y,N,N


In [4]:
y[:10]

0    Y
1    Y
2    Y
3    Y
4    Y
5    Y
6    N
7    Y
8    Y
9    N
Name: yr_22, dtype: object

In [5]:
X = pd.get_dummies(X)

In [6]:
X.head(10)

Unnamed: 0,yr_19_N,yr_19_Y,yr_20_N,yr_20_Y,yr_21_N,yr_21_Y
0,0,1,0,1,0,1
1,0,1,0,1,0,1
2,0,1,0,1,0,1
3,0,1,0,1,0,1
4,0,1,0,1,0,1
5,0,1,0,1,0,1
6,1,0,1,0,1,0
7,0,1,0,1,0,1
8,0,1,0,1,0,1
9,0,1,1,0,1,0


In [7]:
#Static model:
y_pred =  ['Y' if row.sum() > 2 else 'N' for index, row in df_cs.eq('Y').iterrows()]


In [8]:
accuracies = [1 if predicted == true else 0 for predicted, true in zip(y_pred, y)]
overall_accuracy = sum(accuracies) / len(accuracies)
print(f"The overall accuracy from using static determination: {overall_accuracy}")

The overall accuracy from using static determination: 0.9484536082474226


In [9]:
#KNN model:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
X_train.shape

(72, 6)

In [11]:
X_test.shape

(25, 6)

In [12]:
model = KNeighborsClassifier(n_neighbors=2)

In [13]:
model.fit(X_train, y_train)


In [14]:
y_pred = model.predict(X)
y_pred

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [15]:
confusion_matrix(y_pred,y)

array([[16,  0],
       [ 1, 80]], dtype=int64)

In [19]:
print(classification_report(y_pred,y))

              precision    recall  f1-score   support

           N       0.94      1.00      0.97        16
           Y       1.00      0.99      0.99        81

    accuracy                           0.99        97
   macro avg       0.97      0.99      0.98        97
weighted avg       0.99      0.99      0.99        97



Using the KNN model, the overall accuracy is 99%