# 2.0 Random Forest (Split)

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Dataset used

In [2]:
### Dataset from FYP1
df1_B_imb = pd.read_csv('CSurvey1_Trans_Binary_imb.csv')
df1_5L_imb = pd.read_csv('CSurvey1_Trans_5level_imb.csv')

df1_B_ROS = pd.read_csv('CSurvey1_Trans_Binary_ROS.csv')
df1_5L_ROS = pd.read_csv('CSurvey1_Trans_5level_ROS.csv')

df1_B_SMOTE = pd.read_csv('CSurvey1_Trans_Binary_SMOTE.csv')
df1_5L_SMOTE = pd.read_csv('CSurvey1_Trans_5level_SMOTE.csv')

### Dataset from FYP2
df2_B_imb = pd.read_csv('CSurvey2_Trans_Binary_imb.csv')
df2_5L_imb = pd.read_csv('CSurvey2_Trans_5level_imb.csv')

df2_B_ROS = pd.read_csv('CSurvey2_Trans_Binary_ROS.csv')
df2_5L_ROS = pd.read_csv('CSurvey2_Trans_5level_ROS.csv')

df2_B_SMOTE = pd.read_csv('CSurvey2_Trans_Binary_SMOTE.csv')
df2_5L_SMOTE = pd.read_csv('CSurvey2_Trans_5level_SMOTE.csv')

### Dataset shape
print("df1_B_imb:", df1_B_imb.shape)
print("df1_5L_imb:", df1_5L_imb.shape)
print("df1_B_ROS:", df1_B_ROS.shape)
print("df1_5L_ROS:", df1_5L_ROS.shape)
print("df1_B_SMOTE:", df1_B_SMOTE.shape)
print("df1_5L_SMOTE:", df1_5L_SMOTE.shape)

print("df2_B_imb:", df2_B_imb.shape)
print("df2_5L_imb:", df2_5L_imb.shape)
print("df2_B_ROS:", df2_B_ROS.shape)
print("df2_5L_ROS:", df2_5L_ROS.shape)
print("df2_B_SMOTE:", df2_B_SMOTE.shape)
print("df2_5L_SMOTE:", df2_5L_SMOTE.shape)

df1_B_imb: (673, 41)
df1_5L_imb: (673, 41)
df1_B_ROS: (1338, 41)
df1_5L_ROS: (1695, 41)
df1_B_SMOTE: (1338, 41)
df1_5L_SMOTE: (1695, 41)
df2_B_imb: (614, 41)
df2_5L_imb: (614, 41)
df2_B_ROS: (1202, 41)
df2_5L_ROS: (1390, 41)
df2_B_SMOTE: (1202, 41)
df2_5L_SMOTE: (1390, 41)


# Binary Classification

## Student Background

In [3]:
rand_states = [7,69,101]

### SB before Data Balancing

In [4]:
tempB_SB_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SB_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_SB_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SB_imb)
dfB_SB_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.973941,0.958001,0.973941,0.965906,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.97557,0.958035,0.97557,0.966723,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.97557,0.958035,0.97557,0.966723,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [5]:
dfB_SB_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.975027,0.958024,0.975027,0.966451


In [6]:
print("Imbalance Binary: Student Background\n")
for x in dfB_SB_imb["Best Param"]:
    print(x)

Imbalance Binary: Student Background

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 400}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SB after RandomOverSampler

In [7]:
tempB_SB_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]


for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SB_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_SB_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SB_ROS)
dfB_SB_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.497504,0.249374,0.497504,0.332222,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,100,69,0.497504,0.249374,0.497504,0.332222,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.498336,0.249583,0.498336,0.332593,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [8]:
dfB_SB_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.497781,0.249444,0.497781,0.332346


In [9]:
print("Binary: Student Background\n")
for x in dfB_SB_ROS["Best Param"]:
    print(x)

Binary: Student Background

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}


### SB after SMOTE

In [10]:
tempB_SB_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]


for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SB_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SB_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SB_SMOTE)
dfB_SB_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.531614,0.672097,0.531614,0.41152,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.531614,0.666652,0.531614,0.412627,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.536606,0.681541,0.536606,0.421053,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [11]:
dfB_SB_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.533278,0.67343,0.533278,0.415067


In [12]:
print("SMOTE Binary: Student Background\n")
for x in dfB_SB_SMOTE["Best Param"]:
    print(x)

SMOTE Binary: Student Background

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 400}


## Student History Grades

### SHG before Data Balancing

In [13]:
tempB_SHG_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SHG_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_SHG_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SHG_imb)
dfB_SHG_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.977199,0.958069,0.977199,0.967539,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.977199,0.958069,0.977199,0.967539,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.977199,0.958069,0.977199,0.967539,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [14]:
dfB_SHG_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.977199,0.958069,0.977199,0.967539


In [15]:
print("Imbalance Binary: Student History Grades\n")
for x in dfB_SHG_imb["Best Param"]:
    print(x)

Imbalance Binary: Student History Grades

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SHG after OverSampling

In [16]:
tempB_SHG_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SHG_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_SHG_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SHG_ROS)
dfB_SHG_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.526622,0.63203,0.526622,0.408581,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.526622,0.63203,0.526622,0.408581,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.526622,0.63203,0.526622,0.408581,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [17]:
dfB_SHG_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.526622,0.63203,0.526622,0.408581


In [18]:
print("Binary: Student History Grades\n")
for x in dfB_SHG_ROS["Best Param"]:
    print(x)

Binary: Student History Grades

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}


### SHG after SMOTE

In [19]:
tempB_SHG_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SHG_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SHG_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SHG_SMOTE)
dfB_SHG_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.659734,0.749896,0.659734,0.625999,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.657238,0.748331,0.657238,0.622631,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.659734,0.749896,0.659734,0.625999,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [20]:
dfB_SHG_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.658902,0.749374,0.658902,0.624876


In [21]:
print("SMOTE Binary: Student History Grades\n")
for x in dfB_SHG_SMOTE["Best Param"]:
    print(x)

SMOTE Binary: Student History Grades

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}


## Student Opinion Towards MMU Environment 

### SOTME before Data Balancing

In [22]:
tempB_SOTME_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SOTME_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_SOTME_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SOTME_imb)
dfB_SOTME_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [23]:
dfB_SOTME_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.978827,0.958103,0.978827,0.968354


In [24]:
print("Imbalance Binary: Student Opinion Towards MMU Environment\n")
for x in dfB_SOTME_imb["Best Param"]:
    print(x)

Imbalance Binary: Student Opinion Towards MMU Environment

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SOTME after RandomOverSampler

In [25]:
tempB_SOTME_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SOTME_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_SOTME_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SOTME_ROS)
dfB_SOTME_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [26]:
dfB_SOTME_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.499168,0.249792,0.499168,0.332963


In [27]:
print("Binary: Student Opinion Towards MMU Environment\n")
for x in dfB_SOTME_ROS["Best Param"]:
    print(x)

Binary: Student Opinion Towards MMU Environment

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SOTME after SMOTE

In [28]:
tempB_SOTME_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SOTME_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SOTME_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SOTME_SMOTE)
dfB_SOTME_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.499168,0.491166,0.499168,0.352549,"{'criterion': 'entropy', 'max_features': 'sqrt..."
1,100,69,0.499168,0.491166,0.499168,0.352549,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.499168,0.491166,0.499168,0.352549,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [29]:
dfB_SOTME_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.499168,0.491166,0.499168,0.352549


In [30]:
print("SMOTE Binary: Student Opinion Towards MMU Environment\n")
for x in dfB_SOTME_SMOTE["Best Param"]:
    print(x)

SMOTE Binary: Student Opinion Towards MMU Environment

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}


## Student Family Background

### SFB before Data Balancing

In [31]:
tempB_SFB_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SFB_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_SFB_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SFB_imb)
dfB_SFB_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [32]:
dfB_SFB_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.978827,0.958103,0.978827,0.968354


In [33]:
print("Imbalance Binary: Student Family Background\n")
for x in dfB_SFB_imb["Best Param"]:
    print(x)

Imbalance Binary: Student Family Background

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SFB after RandomOverSampler

In [34]:
tempB_SFB_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SFB_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_SFB_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SFB_ROS)
dfB_SFB_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.497504,0.249374,0.497504,0.332222,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.498336,0.249583,0.498336,0.332593,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.497504,0.249374,0.497504,0.332222,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [35]:
dfB_SFB_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.497781,0.249444,0.497781,0.332346


In [36]:
print("Binary: Student Family Background\n")
for x in dfB_SFB_ROS["Best Param"]:
    print(x)

Binary: Student Family Background

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}


### SFB after SMOTE

In [37]:
tempB_SFB_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SFB_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SFB_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SFB_SMOTE)
dfB_SFB_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.493344,0.248325,0.493344,0.330362,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,100,69,0.494176,0.248536,0.494176,0.330735,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.493344,0.248325,0.493344,0.330362,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [38]:
dfB_SFB_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.493622,0.248395,0.493622,0.330486


In [39]:
print("SMOTE Binary: Student Family Background\n")
for x in dfB_SFB_SMOTE["Best Param"]:
    print(x)

SMOTE Binary: Student Family Background

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}


## Student Lifestyle

### SL before Data Balancing

In [40]:
tempB_SL_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SL_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_SL_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SL_imb)
dfB_SL_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [41]:
dfB_SL_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.978827,0.958103,0.978827,0.968354


In [42]:
print("Imbalance Binary: Student Lifestyle\n")
for x in dfB_SL_imb["Best Param"]:
    print(x)

Imbalance Binary: Student Lifestyle

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SL after RandomOverSampler

In [43]:
tempB_SL_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SL_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_SL_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SL_ROS)
dfB_SL_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [44]:
dfB_SL_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.5,0.25,0.5,0.333333


In [45]:
print("Binary: Student Lifestyle\n")
for x in dfB_SL_ROS["Best Param"]:
    print(x)

Binary: Student Lifestyle

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### SL after SMOTE

In [46]:
tempB_SL_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SL_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SL_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SL_SMOTE)
dfB_SL_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,100,69,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.499168,0.249792,0.499168,0.332963,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [47]:
dfB_SL_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.499168,0.249792,0.499168,0.332963


In [48]:
print("SMOTE Binary: Student Lifestyle\n")
for x in dfB_SL_SMOTE["Best Param"]:
    print(x)

SMOTE Binary: Student Lifestyle

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 300}


## All Attributes

### AA before Data Balancing

In [49]:
tempB_imb = []

y_train = df1_B_imb[['CGPA']]
X_train = df1_B_imb.drop(columns=['CGPA'])

y_test = df2_B_imb[['CGPA']]
X_test = df2_B_imb.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_imb.append([100,j,acc,pre,recall,f1,param])
        
dfB_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_imb)
dfB_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.978827,0.958103,0.978827,0.968354,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [50]:
dfB_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.978827,0.958103,0.978827,0.968354


In [51]:
print("Imbalance All Attributes\n")
for x in dfB_imb["Best Param"]:
    print(x)

Imbalance All Attributes

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### AA after RandomOverSampler

In [52]:
tempB_ROS = []

y_train = df1_B_ROS[['CGPA']]
X_train = df1_B_ROS.drop(columns=['CGPA'])

y_test = df2_B_ROS[['CGPA']]
X_test = df2_B_ROS.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_ROS.append([100,j,acc,pre,recall,f1,param])
        
dfB_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_ROS)
dfB_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [53]:
dfB_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.5,0.25,0.5,0.333333


In [54]:
print("All Attributes\n")
for x in dfB_ROS["Best Param"]:
    print(x)

All Attributes

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


### AA after SMOTE

In [55]:
tempB_SMOTE = []

y_train = df1_B_SMOTE[['CGPA']]
X_train = df1_B_SMOTE.drop(columns=['CGPA'])

y_test = df2_B_SMOTE[['CGPA']]
X_test = df2_B_SMOTE.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    tempB_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
dfB_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=tempB_SMOTE)
dfB_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."
2,100,101,0.5,0.25,0.5,0.333333,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [56]:
dfB_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.5,0.25,0.5,0.333333


In [57]:
print("SMOTE All Attributes\n")
for x in dfB_SMOTE["Best Param"]:
    print(x)

SMOTE All Attributes

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 100}


# 5-Level Classification

## Student Background

### SB before Data Balancing

In [58]:
temp5L_SB_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SB_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_SB_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SB_imb)
df5L_SB_imb



Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.418567,0.387183,0.418567,0.394325,"{'criterion': 'entropy', 'max_features': 'log2..."
1,100,69,0.441368,0.405807,0.441368,0.413569,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,100,101,0.418567,0.385011,0.418567,0.394134,"{'criterion': 'entropy', 'max_features': 'auto..."


In [59]:
df5L_SB_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.426167,0.392667,0.426167,0.400676


In [60]:
print("Imbalance 5-Level: Student Background\n")
for x in df5L_SB_imb["Best Param"]:
    print(x)

Imbalance 5-Level: Student Background

{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}


### SB after RandomOverSampler

In [61]:
temp5L_SB_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SB_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_SB_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SB_ROS)
df5L_SB_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.228777,0.200298,0.228777,0.187554,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
1,100,69,0.240288,0.220826,0.240288,0.200524,"{'criterion': 'gini', 'max_features': 'log2', ..."
2,100,101,0.241007,0.211127,0.241007,0.198663,"{'criterion': 'entropy', 'max_features': 'auto..."


In [62]:
df5L_SB_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.236691,0.21075,0.236691,0.19558


In [63]:
print("5-Level: Student Background\n")
for x in df5L_SB_ROS["Best Param"]:
    print(x)

5-Level: Student Background

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}


### SB after SMOTE

In [64]:
temp5L_SB_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE[['Age','Gender','Nationality', 'Living Status', 'Home Town', 'Faculty', 'Education Level',
                     'Years of Study', 'Completed Credit Hours']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SB_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SB_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SB_SMOTE)
df5L_SB_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.253957,0.279901,0.253957,0.23993,"{'criterion': 'entropy', 'max_features': 'auto..."
1,100,69,0.259712,0.296202,0.259712,0.245529,"{'criterion': 'gini', 'max_features': 'log2', ..."
2,100,101,0.255396,0.260745,0.255396,0.237614,"{'criterion': 'entropy', 'max_features': 'log2..."


In [65]:
df5L_SB_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.256355,0.27895,0.256355,0.241024


In [66]:
print("SMOTE 5-Level: Student Background\n")
for x in df5L_SB_SMOTE["Best Param"]:
    print(x)

SMOTE 5-Level: Student Background

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 200}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 300}


## Student History Grades

### SHG before Data Balancing

In [67]:
temp5L_SHG_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SHG_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_SHG_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SHG_imb)
df5L_SHG_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.508143,0.475272,0.508143,0.479525,"{'criterion': 'entropy', 'max_features': 'auto..."
1,100,69,0.508143,0.480351,0.508143,0.477303,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.519544,0.485346,0.519544,0.490567,"{'criterion': 'entropy', 'max_features': 'auto..."


In [68]:
df5L_SHG_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.511944,0.480323,0.511944,0.482465


In [69]:
print("Imbalance 5-Level: Student History Grades\n")
for x in df5L_SHG_imb["Best Param"]:
    print(x)

Imbalance 5-Level: Student History Grades

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}


### SHG after RandomOverSampler

In [70]:
temp5L_SHG_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SHG_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_SHG_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SHG_ROS)
df5L_SHG_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.325899,0.313251,0.325899,0.305578,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,100,69,0.315108,0.29914,0.315108,0.291939,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.336691,0.321789,0.336691,0.314318,"{'criterion': 'entropy', 'max_features': 'sqrt..."


In [71]:
df5L_SHG_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.325899,0.311394,0.325899,0.303945


In [72]:
print("5-Level: Student History Grades\n")
for x in df5L_SHG_ROS["Best Param"]:
    print(x)

5-Level: Student History Grades

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}


### SHG after SMOTE

In [73]:
temp5L_SHG_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE[['Number of Subjects Failed','SPM Result (A)','SPM Result (B)','SPM Result (C)']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SHG_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SHG_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SHG_SMOTE)
df5L_SHG_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.386331,0.380919,0.386331,0.373795,"{'criterion': 'entropy', 'max_features': 'sqrt..."
1,100,69,0.392086,0.384825,0.392086,0.379474,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.386331,0.380812,0.386331,0.373682,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [74]:
df5L_SHG_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.388249,0.382185,0.388249,0.37565


In [75]:
print("SMOTE 5-Level: Student History Grades\n")
for x in df5L_SHG_SMOTE["Best Param"]:
    print(x)

SMOTE 5-Level: Student History Grades

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}


## Student Opinion Towards MMU Environment

### SOTME before Data Balancing

In [76]:
temp5L_SOTME_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SOTME_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_SOTME_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SOTME_imb)
df5L_SOTME_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.395765,0.336116,0.395765,0.351065,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.385993,0.33366,0.385993,0.350264,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,100,101,0.389251,0.336072,0.389251,0.353215,"{'criterion': 'gini', 'max_features': 'log2', ..."


In [77]:
df5L_SOTME_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.390337,0.335283,0.390337,0.351515


In [78]:
print("Imbalance 5-Level: Student Opinion Towards MMU Environment\n")
for x in df5L_SOTME_imb["Best Param"]:
    print(x)

Imbalance 5-Level: Student Opinion Towards MMU Environment

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 300}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 400}
{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 400}


### SOTME after RandomOverSampler

In [79]:
temp5L_SOTME_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SOTME_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_SOTME_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SOTME_ROS)
df5L_SOTME_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.173381,0.131381,0.173381,0.142745,"{'criterion': 'entropy', 'max_features': 'auto..."
1,100,69,0.173381,0.132382,0.173381,0.143007,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.17482,0.131147,0.17482,0.14511,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [80]:
df5L_SOTME_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.173861,0.131637,0.173861,0.143621


In [81]:
print("5-Level: Student Opinion Towards MMU Environment\n")
for x in df5L_SOTME_ROS["Best Param"]:
    print(x)

5-Level: Student Opinion Towards MMU Environment

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 300}


### SOTME after SMOTE

In [82]:
temp5L_SOTME_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE[['Reason Study at Multimedia University', 'Lecture Class Capacity','Tutorial Class Capacity', 
                     'Classroom Facilities', 'Campus Environment']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SOTME_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SOTME_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SOTME_SMOTE)
df5L_SOTME_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.221583,0.265531,0.221583,0.212389,"{'criterion': 'gini', 'max_features': 'auto', ..."
1,100,69,0.223022,0.265249,0.223022,0.213603,"{'criterion': 'entropy', 'max_features': 'log2..."
2,100,101,0.220144,0.263959,0.220144,0.211074,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [83]:
df5L_SOTME_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.221583,0.264913,0.221583,0.212356


In [84]:
print("SMOTE 5-Level: Student Opinion Towards MMU Environment\n")
for x in df5L_SOTME_SMOTE["Best Param"]:
    print(x)

SMOTE 5-Level: Student Opinion Towards MMU Environment

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 300}


## Student Family Background

### SFB before Data Balancing

In [85]:
temp5L_SFB_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SFB_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_SFB_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SFB_imb)
df5L_SFB_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.399023,0.341923,0.399023,0.358937,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,100,69,0.405537,0.349718,0.405537,0.363913,"{'criterion': 'entropy', 'max_features': 'sqrt..."
2,100,101,0.410423,0.351731,0.410423,0.366716,"{'criterion': 'entropy', 'max_features': 'sqrt..."


In [86]:
df5L_SFB_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.404995,0.347791,0.404995,0.363189


In [87]:
print("Imbalance 5-Level: Student Family Background\n")
for x in df5L_SFB_imb["Best Param"]:
    print(x)

Imbalance 5-Level: Student Family Background

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 400}


### SFB after RandomOverSampler

In [88]:
temp5L_SFB_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SFB_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_SFB_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SFB_ROS)
df5L_SFB_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.190647,0.131595,0.190647,0.143446,"{'criterion': 'entropy', 'max_features': 'auto..."
1,100,69,0.18705,0.120438,0.18705,0.14048,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,100,101,0.183453,0.116128,0.183453,0.135776,"{'criterion': 'gini', 'max_features': 'log2', ..."


In [89]:
df5L_SFB_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.18705,0.12272,0.18705,0.1399


In [90]:
print("5-Level: Student Family Background\n")
for x in df5L_SFB_ROS["Best Param"]:
    print(x)

5-Level: Student Family Background

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 300}
{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 200}


### SFB after SMOTE

In [91]:
temp5L_SFB_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE[['Family Size', "Parent's Education Level.Father", "Parent's Education Level.Mother", 
                     "Father's Occupation", "Mother's Occupation", 'Financial Difficulties', 'Financial Support']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SFB_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SFB_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SFB_SMOTE)
df5L_SFB_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.214388,0.189178,0.214388,0.180948,"{'criterion': 'entropy', 'max_features': 'log2..."
1,100,69,0.213669,0.204132,0.213669,0.178296,"{'criterion': 'entropy', 'max_features': 'auto..."
2,100,101,0.203597,0.192355,0.203597,0.168528,"{'criterion': 'entropy', 'max_features': 'sqrt..."


In [92]:
df5L_SFB_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.210552,0.195222,0.210552,0.175924


In [93]:
print("SMOTE 5-Level: Student Family Background\n")
for x in df5L_SFB_SMOTE["Best Param"]:
    print(x)

SMOTE 5-Level: Student Family Background

{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 200}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}


## Student Lifestyle

### SL before Data Balancing

In [94]:
temp5L_SL_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SL_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_SL_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SL_imb)
df5L_SL_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.429967,0.346199,0.429967,0.35813,"{'criterion': 'entropy', 'max_features': 'sqrt..."
1,100,69,0.428339,0.347559,0.428339,0.36051,"{'criterion': 'entropy', 'max_features': 'sqrt..."
2,100,101,0.420195,0.332683,0.420195,0.351766,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [95]:
df5L_SL_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.426167,0.342147,0.426167,0.356802


In [96]:
print("Imbalance 5-Level: Student Lifestyle\n")
for x in df5L_SL_imb["Best Param"]:
    print(x)

Imbalance 5-Level: Student Lifestyle

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 400}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 400}


### SL after RandomOverSampler

In [97]:
temp5L_SL_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SL_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_SL_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SL_ROS)
df5L_SL_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.220144,0.16002,0.220144,0.158617,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,100,69,0.215827,0.154936,0.215827,0.153065,"{'criterion': 'entropy', 'max_features': 'log2..."
2,100,101,0.205755,0.120672,0.205755,0.136214,"{'criterion': 'entropy', 'max_features': 'log2..."


In [98]:
df5L_SL_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.213909,0.145209,0.213909,0.149299


In [99]:
print("5-Level: Student Lifestyle\n")
for x in df5L_SL_ROS["Best Param"]:
    print(x)

5-Level: Student Lifestyle

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 100}


### SL after SMOTE

In [100]:
temp5L_SL_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE[['Relationship Status', 'Scholarship', 'PTPTN', 'Skipping Class', 'Additional Course',
                    'Study Behavior', 'Study Time', 'Extra Curricular Activities', 'Part-time Job', 'Commute Time',
                    'Gaming', 'Alcohol Consumption', 'I have enough free time after class', 'Social Interaction',
                    'I hang out with friends']]

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SL_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SL_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SL_SMOTE)
df5L_SL_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.266187,0.208943,0.266187,0.232364,"{'criterion': 'entropy', 'max_features': 'log2..."
1,100,69,0.267626,0.207065,0.267626,0.231212,"{'criterion': 'entropy', 'max_features': 'sqrt..."
2,100,101,0.263309,0.201843,0.263309,0.22668,"{'criterion': 'entropy', 'max_features': 'auto..."


In [101]:
df5L_SL_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.265707,0.20595,0.265707,0.230085


In [102]:
print("SMOTE 5-Level: Student Lifestyle\n")
for x in df5L_SL_SMOTE["Best Param"]:
    print(x)

SMOTE 5-Level: Student Lifestyle

{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 400}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 400}


## All Attributes

### AA before Data Balancing

In [103]:
temp5L_imb = []

y_train = df1_5L_imb[['CGPA']]
X_train = df1_5L_imb.drop(columns=['CGPA'])

y_test = df2_5L_imb[['CGPA']]
X_test = df2_5L_imb.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_imb.append([100,j,acc,pre,recall,f1,param])
        
df5L_imb = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_imb)
df5L_imb

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.5,0.447008,0.5,0.439274,"{'criterion': 'entropy', 'max_features': 'auto..."
1,100,69,0.526059,0.504118,0.526059,0.467575,"{'criterion': 'gini', 'max_features': 'sqrt', ..."
2,100,101,0.513029,0.46222,0.513029,0.450841,"{'criterion': 'gini', 'max_features': 'auto', ..."


In [104]:
df5L_imb.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.513029,0.471115,0.513029,0.452563


In [105]:
print("Imbalance All Attributes\n")
for x in df5L_imb["Best Param"]:
    print(x)

Imbalance All Attributes

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 400}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 300}


### AA after RandomOverSampler

In [106]:
temp5L_ROS = []

y_train = df1_5L_ROS[['CGPA']]
X_train = df1_5L_ROS.drop(columns=['CGPA'])

y_test = df2_5L_ROS[['CGPA']]
X_test = df2_5L_ROS.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_ROS.append([100,j,acc,pre,recall,f1,param])
        
df5L_ROS = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_ROS)
df5L_ROS

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.274101,0.30625,0.274101,0.214967,"{'criterion': 'entropy', 'max_features': 'log2..."
1,100,69,0.291367,0.365368,0.291367,0.23756,"{'criterion': 'entropy', 'max_features': 'log2..."
2,100,101,0.291367,0.314364,0.291367,0.239758,"{'criterion': 'gini', 'max_features': 'sqrt', ..."


In [107]:
df5L_ROS.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.285612,0.328661,0.285612,0.230762


In [108]:
print("All Attributes\n")
for x in df5L_ROS["Best Param"]:
    print(x)

All Attributes

{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 200}
{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 200}


### AA after SMOTE

In [109]:
temp5L_SMOTE = []

y_train = df1_5L_SMOTE[['CGPA']]
X_train = df1_5L_SMOTE.drop(columns=['CGPA'])

y_test = df2_5L_SMOTE[['CGPA']]
X_test = df2_5L_SMOTE.drop(columns=['CGPA'])

for j in rand_states:
    param_grid = {'n_estimators': [100, 200, 300, 400],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion' :['gini', 'entropy']}

    model = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, n_jobs=-1, cv=10, verbose=1)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    acc = accuracy_score(y_test,pred)
    pre = precision_score(y_test, pred, average='weighted')
    recall = recall_score(y_test, pred, average='weighted')
    f1 = f1_score(y_test, pred, average='weighted')
    param = model.best_params_

    temp5L_SMOTE.append([100,j,acc,pre,recall,f1,param])
        
df5L_SMOTE = pd.DataFrame(columns=['Test Size','Random State','Accuracy','Precision','Recall','F1 Score','Best Param'],data=temp5L_SMOTE)
df5L_SMOTE

Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)


Fitting 10 folds for each of 24 candidates, totalling 240 fits


  self.best_estimator_.fit(X, y, **fit_params)
  warn(


Unnamed: 0,Test Size,Random State,Accuracy,Precision,Recall,F1 Score,Best Param
0,100,7,0.338129,0.26842,0.338129,0.297391,"{'criterion': 'gini', 'max_features': 'log2', ..."
1,100,69,0.353957,0.281779,0.353957,0.312257,"{'criterion': 'entropy', 'max_features': 'log2..."
2,100,101,0.34964,0.276523,0.34964,0.308257,"{'criterion': 'entropy', 'max_features': 'auto..."


In [110]:
df5L_SMOTE.groupby('Test Size')[['Accuracy','Precision','Recall','F1 Score']].mean()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Test Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.347242,0.275574,0.347242,0.305968


In [111]:
print("SMOTE All Attributes\n")
for x in df5L_SMOTE["Best Param"]:
    print(x)

SMOTE All Attributes

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 300}
{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 400}
