In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



In [2]:
# Import our input dataset
ELA_df = pd.read_csv('ela_ml.csv')
ELA_df = pd.DataFrame(ELA_df)

In [3]:
ELA_df.dtypes

student_id      int64
fall_diag       int64
winter_diag     int64
fsa_ela         int64
gender         object
ethnicity      object
retained       object
attendance     object
behavior       object
passed          int64
dtype: object

In [4]:
#drop null values
ELA_df.dropna(axis=0)

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_ela,gender,ethnicity,retained,attendance,behavior,passed
0,3569341204,372,450,1,F,Asian,YES,NO,NO,0
1,3504381608,472,514,1,F,Hispanic,NO,NO,YES,0
2,3507911409,430,475,1,M,Hispanic,NO,NO,NO,0
3,3519451745,489,498,1,F,Hispanic,NO,NO,NO,0
4,3551161509,471,488,1,F,African American,NO,NO,NO,0
...,...,...,...,...,...,...,...,...,...,...
208,3572921508,546,571,5,M,Caucasian,no,no,no,1
209,3581931508,568,587,5,M,Caucasian,no,no,no,1
210,3571491808,540,563,5,M,Caucasian,no,no,no,1
211,3531461508,612,593,5,F,Caucasian,no,no,no,1


In [5]:
# Generate our categorical features list
cat_features = ELA_df.dtypes[ELA_df.dtypes == 'object'].index.tolist()
print(cat_features)

['gender', 'ethnicity', 'retained', 'attendance', 'behavior']


In [6]:
#remove features
#cat_features.remove('behavior')
#cat_features.remove('attendance')
#print(cat_features)

In [7]:
#use get_dummies to encode categorical features
ELA_df = pd.get_dummies(ELA_df, columns=cat_features)
ELA_df.head()

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_ela,passed,gender_F,gender_M,ethnicity_African American,ethnicity_American Indian,ethnicity_Asian,...,retained_no,retained_yes,attendance_NO,attendance_YES,attendance_no,attendance_yes,behavior_NO,behavior_YES,behavior_no,behavior_yes
0,3569341204,372,450,1,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
1,3504381608,472,514,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,3507911409,430,475,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,3519451745,489,498,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,3551161509,471,488,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0


In [8]:
#create target and features
X = ELA_df.drop(columns = ['student_id', 'fsa_ela', 'passed'])
y = ELA_df['passed']

#split training/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)

# create a StandardScaler instance (added in the 2nd iteration as previous year's scale score is different from other scale)
scaler = StandardScaler()

#fit the standardScaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f' Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}')

 Random forest predictive accuracy: 0.797


In [10]:
#generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,9
Actual 1,4,43
