In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Import our input dataset.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
MATH_df = pd.read_csv('Data/MATH_data_1819_clean.csv')

In [2]:
# Show each column's dtype to confirm everything loaded as numeric before modelling
MATH_df.dtypes

Student                 int64
previous scale score    int64
diagnostic 1            int64
GM 1                    int64
GM 2                    int64
diagnostic 2            int64
Achievement Level       int64
dtype: object

In [3]:
# Drop rows containing null values.
# DataFrame.dropna returns a new frame instead of modifying the original, so the
# result has to be assigned back; otherwise the cells below keep using the
# uncleaned data. Re-running this cell is safe (dropping nulls twice is a no-op).
MATH_df = MATH_df.dropna(axis=0)

# Display the cleaned frame as the cell output
MATH_df

Unnamed: 0,Student,previous scale score,diagnostic 1,GM 1,GM 2,diagnostic 2,Achievement Level
0,65,278,413,424,381,401,1
1,51,309,459,455,445,466,1
2,80,251,399,412,371,385,1
3,9,283,416,410,406,405,1
4,10,285,436,455,455,468,1
...,...,...,...,...,...,...,...
72,17,337,507,478,483,504,5
73,31,355,495,518,501,488,5
74,79,336,485,513,517,504,5
75,4,321,517,512,522,526,5


In [4]:
# Target is the achievement level; features are every remaining column
# except the student identifier.
y = MATH_df['Achievement Level']
X = MATH_df.drop(columns=['Student', 'Achievement Level'])

# Hold out a test split, stratified on the target, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Standardise the features (the previous year's scale score is on a different
# scale from the other columns). The scaler is fitted on the training split only
# and then applied to both splits, training first.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [5]:
# Build a 128-tree random forest with a fixed seed and train it on the scaled
# training split (fit returns the fitted estimator, so the calls can be chained)
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict on the held-out split and report plain accuracy
y_pred = rf_model.predict(X_test_scaled)
print(f' Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}')

 Random forest predictive accuracy: 0.550


In [6]:
# Generate a confusion matrix over the full 1-5 achievement scale.
# `labels` is passed explicitly so the matrix is always 5x5: without it,
# confusion_matrix sizes itself from the labels that actually occur in
# y_test / y_pred, and a level missing from both would make the fixed
# five-name index/columns below fail with a shape mismatch.
achievement_levels = [1, 2, 3, 4, 5]
cm = confusion_matrix(y_test, y_pred, labels=achievement_levels)

# Rows are the true levels, columns are the predicted levels
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {level}" for level in achievement_levels],
    columns=[f"Predicted {level}" for level in achievement_levels],
)
display(cm_df)

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,5,1,0,0,0
Actual 2,1,2,2,0,1
Actual 3,0,2,2,0,0
Actual 4,0,0,0,2,0
Actual 5,0,0,0,2,0
