In [1]:
# Import  dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import confusion_matrix




In [None]:
#Set up connection to database
from sqlalchemy import create_engine

# Postgres username
POSTGRES_ADDRESS = 'final-project.cv7jwtgtdlpj.us-east-2.rds.amazonaws.com'
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'
POSTGRES_PASSWORD = dbpassword
POSTGRES_DBNAME = 'FSA_Data'

#login string
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
               .format(username=POSTGRES_USERNAME,
                      password=POSTGRES_PASSWORD,
                      ipaddress=POSTGRES_ADDRESS,
                      port=POSTGRES_PORT,
                      dbname=POSTGRES_DBNAME))

#Create connection
cnx = create_engine(postgres_str)

In [3]:
# Import our input dataset
MATH_df = pd.read_sql_query('''SELECT * FROM math_ml;''', cnx)
MATH_df = pd.DataFrame(MATH_df)
MATH_df.head()

In [4]:
MATH_df.dtypes

student_id      int64
fall_diag       int64
winter_diag     int64
fsa_math        int64
gender         object
ethnicity      object
retained       object
attendance     object
behavior       object
passed          int64
dtype: object

In [5]:
#drop null values
MATH_df.dropna(axis=0)

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_math,gender,ethnicity,retained,attendance,behavior,passed
0,4804290612,386,410,1,M,Hispanic,NO,NO,NO,0
1,3560301549,405,435,1,M,Caucasian,YES,NO,NO,0
2,3527761508,386,415,1,F,African American,YES,NO,NO,0
3,3532401508,421,429,1,F,African American,YES,NO,NO,0
4,3546831509,381,389,1,M,African American,YES,YES,NO,0
...,...,...,...,...,...,...,...,...,...,...
208,3572921508,478,475,5,M,Caucasian,no,no,no,1
209,3581931508,432,457,5,M,Caucasian,no,no,no,1
210,3571491808,453,463,5,M,Caucasian,no,no,no,1
211,3508421508,456,465,5,M,Caucasian,no,no,no,1


In [6]:
# Generate our categorical features list
cat_features = MATH_df.dtypes[MATH_df.dtypes == 'object'].index.tolist()
print(cat_features)

['gender', 'ethnicity', 'retained', 'attendance', 'behavior']


In [7]:
#use get_dummies to encode categorical features
MATH_df = pd.get_dummies(MATH_df, columns=cat_features)
MATH_df.head()

Unnamed: 0,student_id,fall_diag,winter_diag,fsa_math,passed,gender_F,gender_M,ethnicity_African American,ethnicity_American Indian,ethnicity_Asian,...,retained_no,retained_yes,attendance_NO,attendance_YES,attendance_no,attendance_yes,behavior_NO,behavior_YES,behavior_no,behavior_yes
0,4804290612,386,410,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,3560301549,405,435,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,3527761508,386,415,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
3,3532401508,421,429,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
4,3546831509,381,389,1,0,0,1,1,0,0,...,0,0,0,1,0,0,1,0,0,0


In [8]:
#create target and features
X = MATH_df.drop(columns = ['student_id', 'fsa_math', 'passed'])
y = MATH_df['passed']

#split training/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)

# create a StandardScaler instance (added in the 2nd iteration as previous year's scale score is different from other scale)
scaler = StandardScaler()

#fit the standardScaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=42)

#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f' Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}')

 Random forest predictive accuracy: 0.891


In [10]:
#generate a confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10,4
Actual 1,3,47
