In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



In [2]:
# Import our input dataset
fall_df = pd.read_csv('Data_for_ML_Model/bins_math_fall.csv')
fall_df = pd.DataFrame(fall_df)

winter_df = pd.read_csv('Data_for_ML_Model/bins_math_winter.csv')
winter_df = pd.DataFrame(winter_df)

In [3]:
#check data types
print(fall_df.dtypes)
print(winter_df.dtypes)

Student ID      int64
Gender         object
Ethnicity      object
FSA_MATH        int64
Retained       object
Attendance     object
Behavior       object
Fall_Diag       int64
Winter_Diag     int64
Bins_Fall       int64
dtype: object
Student ID      int64
Gender         object
Ethnicity      object
FSA_MATH        int64
Retained       object
Attendance     object
Behavior       object
Fall_Diag       int64
Winter_Diag     int64
Bins_Winter     int64
dtype: object


In [4]:
#drop null values
fall_df.dropna(axis=0)
winter_df.dropna(axis=0)

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag,Bins_Winter
0,3505181649,M,Caucasian,4,NO,NO,NO,456,446,4
1,3507461649,M,Caucasian,4,NO,NO,NO,457,460,4
2,3508041649,F,Caucasian,5,NO,NO,NO,465,467,4
3,3509881649,M,Caucasian,5,NO,NO,NO,476,496,5
4,3510171549,M,Caucasian,3,YES,NO,NO,443,443,3
...,...,...,...,...,...,...,...,...,...,...
208,3531461508,F,Caucasian,5,no,no,no,451,460,4
209,3571921508,F,Caucasian,3,no,no,no,422,442,3
210,3582191508,F,Caucasian,4,no,no,no,450,463,4
211,3592081508,F,Caucasian,4,no,no,no,433,442,3


In [5]:
# pull bins from winter 
winter_bins = winter_df[['Student ID','Bins_Winter']]
winter_bins

Unnamed: 0,Student ID,Bins_Winter
0,3505181649,4
1,3507461649,4
2,3508041649,4
3,3509881649,5
4,3510171549,3
...,...,...
208,3531461508,4
209,3571921508,3
210,3582191508,4
211,3592081508,3


In [6]:
# Generate our categorical features list
cat_features = fall_df.dtypes[fall_df.dtypes == 'object'].index.tolist()
print(cat_features)

['Gender', 'Ethnicity', 'Retained', 'Attendance', 'Behavior']


In [7]:
# Create a LabelEncoder instance
le = LabelEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
#encode categorical columns
fall_df['Gender'] = le.fit_transform(fall_df['Gender'])
fall_df['Ethnicity'] = le.fit_transform(fall_df['Ethnicity'])
fall_df['Retained'] = le.fit_transform(fall_df['Retained'])
fall_df['Attendance'] = le.fit_transform(fall_df['Attendance'])
fall_df['Behavior'] = le.fit_transform(fall_df['Behavior'])

fall_df.head()

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag,Bins_Fall
0,3505181649,1,3,4,0,0,0,456,446,4
1,3507461649,1,3,4,0,0,0,457,460,4
2,3508041649,0,3,5,0,0,0,465,467,4
3,3509881649,1,3,5,0,0,0,476,496,5
4,3510171549,1,3,3,1,0,0,443,443,3


In [8]:
# Merge the two DataFrames together 
fall_df = fall_df.merge(winter_bins)
fall_df.head()

Unnamed: 0,Student ID,Gender,Ethnicity,FSA_MATH,Retained,Attendance,Behavior,Fall_Diag,Winter_Diag,Bins_Fall,Bins_Winter
0,3505181649,1,3,4,0,0,0,456,446,4,4
1,3507461649,1,3,4,0,0,0,457,460,4,4
2,3508041649,0,3,5,0,0,0,465,467,4,4
3,3509881649,1,3,5,0,0,0,476,496,5,5
4,3510171549,1,3,3,1,0,0,443,443,3,3


In [9]:
#create target and features
X = fall_df.drop(columns = ['Student ID', 'FSA_MATH', 'Fall_Diag', 'Winter_Diag'])
y = fall_df['FSA_MATH']

#split training/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# create a StandardScaler instance (added in the 2nd iteration as previous year's scale score is different from other scale)
scaler = StandardScaler()

#fit the standardScaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f' Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}')

 Random forest predictive accuracy: 0.352


In [11]:
#generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 1", "Actual 2","Actual 3","Actual 4","Actual 5"],
    columns=["Predicted 1", "Predicted 2","Predicted 3","Predicted 4","Predicted 5" ]
)
display(cm_df)

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,2,1,1,0,0
Actual 2,1,2,3,1,0
Actual 3,0,2,5,5,4
Actual 4,0,2,4,4,6
Actual 5,0,0,0,5,6
