In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



In [2]:
# Import our input dataset
ELA_df = pd.read_csv('ela_ml.csv')
ELA_df = pd.DataFrame(ELA_df)

In [3]:
ELA_df.dtypes

student_id      int64
fall_diag       int64
winter_diag     int64
fsa_ela         int64
gender         object
ethnicity      object
retained       object
attendance     object
behavior       object
dtype: object

In [4]:
#drop null values
ELA_df.dropna(axis=0)

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_ela,gender,ethnicity,retained,attendance,behavior
0,3505181649,536,564,3,M,Caucasian,NO,NO,NO
1,3507461649,564,584,4,M,Caucasian,NO,NO,NO
2,3508041649,531,528,4,F,Caucasian,NO,NO,NO
3,3509881649,530,556,5,M,Caucasian,NO,NO,NO
4,3510171549,518,534,3,M,Caucasian,YES,NO,NO
...,...,...,...,...,...,...,...,...,...
208,3531461508,612,593,5,F,Caucasian,no,no,no
209,3571921508,503,522,4,F,Caucasian,no,no,no
210,3582191508,522,554,5,F,Caucasian,no,no,no
211,3592081508,529,534,3,F,Caucasian,no,no,no


In [5]:
# Generate our categorical features list
cat_features = ELA_df.dtypes[ELA_df.dtypes == 'object'].index.tolist()
print(cat_features)

['gender', 'ethnicity', 'retained', 'attendance', 'behavior']


In [6]:
#remove features
#cat_features.remove('behavior')
#cat_features.remove('attendance')
#print(cat_features)

In [7]:
#use get_dummies to encode categorical features
ELA_df = pd.get_dummies(ELA_df, columns=cat_features)
ELA_df.head()

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_ela,gender_F,gender_M,ethnicity_African American,ethnicity_American Indian,ethnicity_Asian,ethnicity_Caucasian,...,retained_no,retained_yes,attendance_NO,attendance_YES,attendance_no,attendance_yes,behavior_NO,behavior_YES,behavior_no,behavior_yes
0,3505181649,536,564,3,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
1,3507461649,564,584,4,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
2,3508041649,531,528,4,1,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
3,3509881649,530,556,5,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0
4,3510171549,518,534,3,0,1,0,0,0,1,...,0,0,1,0,0,0,1,0,0,0


In [8]:
#create target and features
X = ELA_df.drop(columns = ['student_id', 'fsa_ela'])
y = ELA_df['fsa_ela']

#split training/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)

# create a StandardScaler instance (added in the 2nd iteration as previous year's scale score is different from other scale)
scaler = StandardScaler()

#fit the standardScaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
#use a for loop to identify the learning rate that yields the best performance
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
               learning_rate=learning_rate,
               max_features=5,
               max_depth=3,
               random_state=0)
    
    classifier.fit(X_train_scaled, y_train)
    
    print("Learning rate:", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate: 0.05
Accuracy score (training): 0.745
Accuracy score (validation): 0.500

Learning rate: 0.1
Accuracy score (training): 0.785
Accuracy score (validation): 0.578

Learning rate: 0.25
Accuracy score (training): 0.926
Accuracy score (validation): 0.531

Learning rate: 0.5
Accuracy score (training): 0.987
Accuracy score (validation): 0.531

Learning rate: 0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.484

Learning rate: 1
Accuracy score (training): 1.000
Accuracy score (validation): 0.531



In [12]:
#use the learning_rate from the for loop to instantiate a model, train it, then create predictions
classifier = GradientBoostingClassifier(n_estimators=20,
                                       learning_rate=0.1,
                                       max_features=5,
                                       max_depth=3,
                                       random_state=0)

classifier.fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test_scaled)

In [13]:
#generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 1", "Actual 2","Actual 3","Actual 4","Actual 5"],
    columns=["Predicted 1", "Predicted 2","Predicted 3","Predicted 4","Predicted 5" ]
)
display(cm_df)

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,3,2,0,0,0
Actual 2,1,4,6,1,0
Actual 3,0,0,16,5,1
Actual 4,0,0,4,12,2
Actual 5,0,0,1,4,2


In [15]:
#generate a classification report to evaluate precision, recall, and F1
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           1       0.75      0.60      0.67         5
           2       0.67      0.33      0.44        12
           3       0.59      0.73      0.65        22
           4       0.55      0.67      0.60        18
           5       0.40      0.29      0.33         7

    accuracy                           0.58        64
   macro avg       0.59      0.52      0.54        64
weighted avg       0.58      0.58      0.57        64

