In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [22]:
# Load the encoded Scantron math dataset (path is relative to the notebook's directory).
scantronMath_38_encoded_df = pd.read_csv('../dataframes/scantronMath_38_encoded.csv')

# Lift pandas' display truncation so wide/long frames render in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [23]:
# Preview the first five rows to sanity-check the load.
scantronMath_38_encoded_df.head(5)

Unnamed: 0,studentId,level,dataSource,is_proficient,proficient_score,school_Elementary A,school_Elementary School B,school_Elementary School E,school_Elementary School F,school_Intermediate School E,school_Middle School A,school_Middle School C,school_Middle School D,school_Middle School F,iready_math,iready_reading,past_proficient_score,subject_math,subject_reading,subject_science,subject_social-studies,subject_music,subject_art,subject_performing-arts,subject_phys-ed,subject_career-prep
0,44777,7,value_scantronMath,False,0.935351,False,False,False,False,False,True,False,False,False,0,0,0.957757,2,2,2,2,1,0,0,0,0
1,44783,7,value_scantronMath,False,0.974724,False,False,False,False,False,True,False,False,False,0,0,0.949346,2,2,2,1,0,0,0,0,0
2,44784,6,value_scantronMath,True,1.021622,False,False,False,False,False,True,False,False,False,0,0,0.990132,2,2,2,2,1,0,0,0,0
3,44807,7,value_scantronMath,True,1.01519,False,False,False,False,False,True,False,False,False,0,0,1.035514,2,2,1,2,0,0,0,0,0
4,44809,7,value_scantronMath,False,0.92174,False,False,False,False,False,True,False,False,False,0,0,0.934393,2,2,2,2,1,0,0,0,0


In [24]:
# Inspect column dtypes; the one-hot school columns and is_proficient load as bool.
scantronMath_38_encoded_df.dtypes

studentId                         int64
level                             int64
dataSource                       object
is_proficient                      bool
proficient_score                float64
school_Elementary A                bool
school_Elementary School B         bool
school_Elementary School E         bool
school_Elementary School F         bool
school_Intermediate School E       bool
school_Middle School A             bool
school_Middle School C             bool
school_Middle School D             bool
school_Middle School F             bool
iready_math                       int64
iready_reading                    int64
past_proficient_score           float64
subject_math                      int64
subject_reading                   int64
subject_science                   int64
subject_social-studies            int64
subject_music                     int64
subject_art                       int64
subject_performing-arts           int64
subject_phys-ed                   int64


In [25]:
# Cast every boolean column (the one-hot school flags and is_proficient) to 0/1 integers.
bool_columns = scantronMath_38_encoded_df.select_dtypes(include=['bool']).columns
scantronMath_38_encoded_df[bool_columns] = scantronMath_38_encoded_df[bool_columns].astype(int)
scantronMath_38_encoded_df.dtypes

studentId                         int64
level                             int64
dataSource                       object
is_proficient                     int64
proficient_score                float64
school_Elementary A               int64
school_Elementary School B        int64
school_Elementary School E        int64
school_Elementary School F        int64
school_Intermediate School E      int64
school_Middle School A            int64
school_Middle School C            int64
school_Middle School D            int64
school_Middle School F            int64
iready_math                       int64
iready_reading                    int64
past_proficient_score           float64
subject_math                      int64
subject_reading                   int64
subject_science                   int64
subject_social-studies            int64
subject_music                     int64
subject_art                       int64
subject_performing-arts           int64
subject_phys-ed                   int64


In [26]:
# Count missing values per column.
scantronMath_38_encoded_df.isna().sum()

studentId                          0
level                              0
dataSource                         0
is_proficient                      0
proficient_score                   0
school_Elementary A                0
school_Elementary School B         0
school_Elementary School E         0
school_Elementary School F         0
school_Intermediate School E       0
school_Middle School A             0
school_Middle School C             0
school_Middle School D             0
school_Middle School F             0
iready_math                        0
iready_reading                     0
past_proficient_score           5422
subject_math                       0
subject_reading                    0
subject_science                    0
subject_social-studies             0
subject_music                      0
subject_art                        0
subject_performing-arts            0
subject_phys-ed                    0
subject_career-prep                0
dtype: int64

In [27]:
# Keep only the rows that have a past proficiency score (it is used as a model feature below).
has_past_score = scantronMath_38_encoded_df['past_proficient_score'].notna()
scantronMath_38_encoded_df = scantronMath_38_encoded_df[has_past_score]
print(scantronMath_38_encoded_df.isna().sum())

studentId                       0
level                           0
dataSource                      0
is_proficient                   0
proficient_score                0
school_Elementary A             0
school_Elementary School B      0
school_Elementary School E      0
school_Elementary School F      0
school_Intermediate School E    0
school_Middle School A          0
school_Middle School C          0
school_Middle School D          0
school_Middle School F          0
iready_math                     0
iready_reading                  0
past_proficient_score           0
subject_math                    0
subject_reading                 0
subject_science                 0
subject_social-studies          0
subject_music                   0
subject_art                     0
subject_performing-arts         0
subject_phys-ed                 0
subject_career-prep             0
dtype: int64


Training

In [28]:
# Target: the continuous proficiency score.
# Features: every column except the target, its boolean label, and the
# other columns listed in non_feature_columns.
non_feature_columns = ['proficient_score', 'is_proficient', 'studentId', 'dataSource', 'level']
y_Math = scantronMath_38_encoded_df['proficient_score']
X_Math = scantronMath_38_encoded_df.drop(columns=non_feature_columns)
print(f"Number of examples: {X_Math.shape[0]}")

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train_Math, X_test_Math, y_train_Math, y_test_Math = train_test_split(
    X_Math, y_Math, test_size=0.25, random_state=123
)
print(X_train_Math.shape)
print(X_test_Math.shape)

Number of examples: 15149
(11361, 21)
(3788, 21)


In [29]:
# Hyperparameter grid for the decision tree: candidate values for max_depth
# and min_samples_leaf. `param_grid` is also reused by the later grid search
# that runs without the school columns.
max_depth_values = [8, 12, 25, 32]
min_samples_leaf_values = [4, 5, 8, 10]
param_grid = {'max_depth': max_depth_values, 'min_samples_leaf': min_samples_leaf_values}

# Grid search to find the best hyperparameters.
print("Running Grid Search")
# Seed the estimator: scikit-learn permutes features at each split, so ties
# between equally good splits can otherwise resolve differently between runs.
dt_regressor_Math = DecisionTreeRegressor(random_state=123)

# 3-fold cross-validation, scored by negative RMSE (closer to 0 is better).
dt_grid_Math = GridSearchCV(dt_regressor_Math, param_grid, cv=3, scoring='neg_root_mean_squared_error')
dt_grid_search_Math = dt_grid_Math.fit(X_train_Math, y_train_Math)

print("Done")
dt_best_params_Math = dt_grid_search_Math.best_params_

dt_best_params_Math

Running Grid Search
Done


{'max_depth': 8, 'min_samples_leaf': 10}

In [30]:
# Fit the final tree with the hyperparameters selected by the grid search
# (dt_best_params_Math) rather than re-typing them by hand, and seed it so the
# fitted tree is repeatable.
dt_model_Math = DecisionTreeRegressor(**dt_best_params_Math, random_state=123)
dt_model_Math.fit(X_train_Math, y_train_Math)

# Evaluate on the held-out test set.
y_dt_pred_Math = dt_model_Math.predict(X_test_Math)
# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in scikit-learn 1.4 and removed in 1.6.
dt_rmse_Math = np.sqrt(mean_squared_error(y_test_Math, y_dt_pred_Math))
dt_r2_Math = r2_score(y_test_Math, y_dt_pred_Math)

# Convert the regression output into a binary decision so that classification
# metrics (precision / recall / F1 / accuracy) can be reported: a score at or
# above the threshold counts as True.
threshold = 1
y_pred_Math_bool = y_dt_pred_Math >= threshold
y_test_Math_bool = y_test_Math >= threshold

print("threshold=", threshold)
# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test_Math_bool, y_pred_Math_bool))

print('[DT] Root Mean Squared Error: {0}'.format(dt_rmse_Math))
print('[DT] R2: {0}'.format(dt_r2_Math))


threshold= 1

Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.94      0.91      2594
        True       0.84      0.72      0.78      1194

    accuracy                           0.87      3788
   macro avg       0.86      0.83      0.84      3788
weighted avg       0.87      0.87      0.87      3788

[DT] Root Mean Squared Error: 0.0500670101860033
[DT] R2: 0.5674926903970963




In [31]:
# Micro-averaged F1 over both classes of the thresholded predictions
# (for a single-label problem like this one it coincides with accuracy).
micro_f1 = f1_score(y_true=y_test_Math_bool, y_pred=y_pred_Math_bool, average='micro')

print("Micro F1 Score:", micro_f1)

Micro F1 Score: 0.8695881731784583


Feature Importance 

In [32]:
# Pair each training column with the importance the fitted tree assigned to it.
feature_names = X_train_Math.columns
importances = dt_model_Math.feature_importances_

# Build the table and rank it, most important feature first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table.
print(importance_df)

                         Feature  Importance
11         past_proficient_score    0.973357
19               subject_phys-ed    0.004578
2     school_Elementary School E    0.004250
12                  subject_math    0.003325
15        subject_social-studies    0.003270
7         school_Middle School D    0.003142
3     school_Elementary School F    0.001638
14               subject_science    0.001493
0            school_Elementary A    0.001194
6         school_Middle School C    0.001150
16                 subject_music    0.000885
9                    iready_math    0.000786
8         school_Middle School F    0.000528
13               subject_reading    0.000352
5         school_Middle School A    0.000052
4   school_Intermediate School E    0.000000
1     school_Elementary School B    0.000000
10                iready_reading    0.000000
17                   subject_art    0.000000
18       subject_performing-arts    0.000000
20           subject_career-prep    0.000000


In [37]:
# Ten most important features (importance_df is already sorted in descending order).
importance_df.iloc[:10]

Unnamed: 0,Feature,Importance
11,past_proficient_score,0.973357
19,subject_phys-ed,0.004578
2,school_Elementary School E,0.00425
12,subject_math,0.003325
15,subject_social-studies,0.00327
7,school_Middle School D,0.003142
3,school_Elementary School F,0.001638
14,subject_science,0.001493
0,school_Elementary A,0.001194
6,school_Middle School C,0.00115


Without school features (school one-hot columns dropped)

In [33]:
# Same target as the first experiment, but the one-hot school indicators are
# removed from the features. The school columns are selected by their
# 'school_' prefix instead of a hand-typed list, so a school column cannot be
# left in by accident if the encoded dataset gains or renames a school.
school_columns = [col for col in scantronMath_38_encoded_df.columns if col.startswith('school_')]
non_feature_columns = ['proficient_score', 'is_proficient', 'studentId', 'dataSource', 'level']

y = scantronMath_38_encoded_df['proficient_score']
X = scantronMath_38_encoded_df.drop(columns=non_feature_columns + school_columns)
print("Number of examples: " + str(X.shape[0]))

# Same 75/25 split and seed as the first experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
print(X_train.shape)
print(X_test.shape)

Number of examples: 15149
(11361, 12)
(3788, 12)


In [34]:
# Grid search to find the best hyperparameters for the model without school
# features. Reuses `param_grid` from the first grid-search cell.
print("Running Grid Search")
# Seed the estimator: scikit-learn permutes features at each split, so ties
# between equally good splits can otherwise resolve differently between runs.
dt_regressor = DecisionTreeRegressor(random_state=123)

# 3-fold cross-validation, scored by negative RMSE (closer to 0 is better).
dt_grid = GridSearchCV(dt_regressor, param_grid, cv=3, scoring='neg_root_mean_squared_error')
dt_grid_search = dt_grid.fit(X_train, y_train)

print("Done")
dt_best_params = dt_grid_search.best_params_

dt_best_params

Running Grid Search
Done


{'max_depth': 8, 'min_samples_leaf': 10}

In [35]:
# Fit the final tree with the hyperparameters selected by the grid search
# (dt_best_params) rather than re-typing them by hand, and seed it so the
# fitted tree is repeatable.
dt_model = DecisionTreeRegressor(**dt_best_params, random_state=123)
dt_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_dt_pred = dt_model.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated in scikit-learn 1.4 and removed in 1.6.
dt_rmse = np.sqrt(mean_squared_error(y_test, y_dt_pred))
dt_r2 = r2_score(y_test, y_dt_pred)

# Convert the regression output into a binary decision so that classification
# metrics (precision / recall / F1 / accuracy) can be reported: a score at or
# above the threshold counts as True.
threshold = 1
y_pred_bool = y_dt_pred >= threshold
y_test_bool = y_test >= threshold

print("threshold=", threshold)
# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test_bool, y_pred_bool))

print('[DT] Root Mean Squared Error: {0}'.format(dt_rmse))
print('[DT] R2: {0}'.format(dt_r2))

threshold= 1

Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.94      0.91      2594
        True       0.84      0.72      0.77      1194

    accuracy                           0.87      3788
   macro avg       0.86      0.83      0.84      3788
weighted avg       0.87      0.87      0.86      3788

[DT] Root Mean Squared Error: 0.05004259238218441
[DT] R2: 0.5679144572776696




In [36]:
# Pair each remaining training column with the importance the fitted tree assigned to it.
feature_names2 = X_train.columns
importances2 = dt_model.feature_importances_

# Build the table and rank it, most important feature first.
importance_df2 = (
    pd.DataFrame({'Feature': feature_names2, 'Importance': importances2})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table.
print(importance_df2)

                    Feature  Importance
2     past_proficient_score    0.981391
10          subject_phys-ed    0.005761
6    subject_social-studies    0.005468
3              subject_math    0.003672
4           subject_reading    0.001687
7             subject_music    0.000885
1            iready_reading    0.000676
5           subject_science    0.000353
0               iready_math    0.000107
8               subject_art    0.000000
9   subject_performing-arts    0.000000
11      subject_career-prep    0.000000
