In [None]:
# Importing python libraries
#
import pandas as pd
import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set(style = 'darkgrid')
import requests
from io import StringIO
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz 
from IPython.display import Image
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the cleaned students dataset; the path is relative, so it resolves
# against the directory the notebook kernel is running in
student_data = pd.read_csv('../data/cleaned_students_dataset.csv')


In [None]:
student_data.head()

In [None]:
student_data.shape

In [None]:
# Encode the final_grade labels as integer class ids, overwriting the column.
# The fitted encoder stays available as `le` for mapping ids back to labels.
le = LabelEncoder()
student_data['final_grade'] = le.fit_transform(student_data['final_grade'])

In [None]:
# Predictive features: every column except the two target columns
X = student_data.drop(columns=['final_grade', 'final_score'])

In [None]:
# Target variable: the label-encoded final_grade column
y = student_data['final_grade']

In [None]:
# Get dummy variables for the predictive features, replacing X with the
# encoded frame
X = pd.get_dummies(X)


In [None]:
X.head()

In [None]:
X.shape

In [None]:
y.head()

In [None]:
y.shape

In [None]:
# Hold out 30% of the rows for testing. random_state is fixed so the split is
# reproducible; later cells in this notebook re-split with the same test_size
# and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state = 101)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

### Logistic Regression

In [None]:
# Define the model with scikit-learn's default settings.
# NOTE: no `penalty` argument is passed, so the library's default
# regularization is what runs here, not L1.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the held-out test set and report each metric, computing it once.
# f1_score / recall_score are called with their default averaging
# (presumably final_grade is a two-class target - confirm).
log_preds = model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_preds)
print('Accuracy : ', log_accuracy)

log_f1 = f1_score(y_test, log_preds)
print(f'The accuracy score is: {log_accuracy} and the f1 score is {log_f1}')
log_recall = recall_score(y_test, log_preds)
print(f'The recall score is: {log_recall}')
print('\n')
print(classification_report(y_test, log_preds))

# Stored as `matrix` because the following cell draws it as a heatmap
matrix = confusion_matrix(y_test, log_preds)
print(matrix)

In [None]:
# Draw the logistic-regression confusion matrix. scikit-learn's
# confusion_matrix puts true labels on the rows and predicted labels on the
# columns, so the axes are labelled accordingly.
ax = sns.heatmap(matrix, annot=True, fmt="d")
ax.set(title='Confusion matrix: Logistic Regression',
       xlabel='Predicted label', ylabel='True label')
plt.show()

### Decision Tree Classifier

In [None]:

# Define the model; random_state is fixed so the fitted tree, and every score
# printed below, is the same on each run
model = DecisionTreeClassifier(random_state = 101)
# fit the model on the training split
model.fit(X_train, y_train)
# evaluate the model on the held-out test split
dc_preds = model.predict(X_test)
print('Accuracy : ',accuracy_score(y_test, dc_preds))

print(f'The accuracy score is: {accuracy_score(y_test, dc_preds)} and the f1 score is {f1_score(y_test, dc_preds)}')
print(f'The recall score is: {recall_score(y_test, dc_preds)}')
print('\n')
print(f'{classification_report(y_test, dc_preds)}')
# Stored as `matrix` because the following cell draws it as a heatmap
matrix = confusion_matrix(y_test, dc_preds)
print(matrix)

In [None]:
# Draw the decision-tree confusion matrix. scikit-learn's confusion_matrix
# puts true labels on the rows and predicted labels on the columns, so the
# axes are labelled accordingly.
ax = sns.heatmap(matrix, annot=True, fmt="d")
ax.set(title='Confusion matrix: Decision Tree',
       xlabel='Predicted label', ylabel='True label')
plt.show()

### Random Forest Classifier

In [None]:
# Define the model; random_state is fixed so the forest, its scores and the
# feature importances read from it afterwards are the same on each run
model = RandomForestClassifier(random_state = 101)

# fit the model on the training split
model.fit(X_train, y_train)

# evaluate the model on the held-out test split
rf_preds = model.predict(X_test)
print('Accuracy : ',accuracy_score(y_test, rf_preds))

print(f'The accuracy score is: {accuracy_score(y_test, rf_preds)} and the f1 score is {f1_score(y_test, rf_preds)}')
print(f'The recall score is: {recall_score(y_test, rf_preds)}')
print('\n')
print(f'{classification_report(y_test, rf_preds)}')
# Stored as `matrix` because the following cell draws it as a heatmap
matrix = confusion_matrix(y_test, rf_preds)
print(matrix)

In [None]:
# Draw the random-forest confusion matrix. scikit-learn's confusion_matrix
# puts true labels on the rows and predicted labels on the columns, so the
# axes are labelled accordingly.
ax = sns.heatmap(matrix, annot=True, fmt="d")
ax.set(title='Confusion matrix: Random Forest',
       xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
# Rank the features by how much they contribute to the fitted model's
# predictions: one row per feature (as the index), importance rounded to
# 4 decimals, largest first
rf_impo_df = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_.round(4),
    })
    .set_index('feature')
    .sort_values(by='importance', ascending=False)
)
rf_impo_df

In [None]:
# Keep the 15 most important features and re-sort them ascending, so that the
# largest bar is drawn at the top of the horizontal bar chart.
# rf_impo_df is deliberately overwritten: later cells use its index as the
# list of selected features.
rf_impo_df = rf_impo_df.head(15).sort_values(by = 'importance', ascending = True)
ax = rf_impo_df.plot(kind = 'barh', figsize = (10, 10), color = 'purple')
ax.legend(loc = 'center right')
ax.set_title('Bar chart showing feature importance', color = 'indigo', fontsize = 14)
# In a horizontal bar chart the x-axis carries the importance values and the
# y-axis carries the feature names
ax.set_xlabel('Importance', fontsize = 12, color = 'indigo')
ax.set_ylabel('Features', fontsize = 12, color = 'indigo')
plt.show()

### Using the most important features to build a random forest model

In [None]:
rf_impo_df.index

# Data Modelling

## Random Forest Classifier

### Remodelling with the most important features only

In [None]:
# Narrow the feature matrix to the columns listed in rf_impo_df's index;
# the target y is left exactly as it is
X = X.loc[:, rf_impo_df.index]

In [None]:
X.shape, y.shape

In [None]:
# Splitting the data into training and testing sets (same test_size and
# random_state as the earlier split, now on the reduced feature matrix)
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

# Define the model; random_state is fixed so the scores below are the same on
# each run and can be compared with the other random-forest cells
model = RandomForestClassifier(random_state = 101)

# fit the model on the training split
model.fit(X_train, y_train)

# evaluate the model on the held-out test split
rf_preds = model.predict(X_test)
print('Accuracy : ',accuracy_score(y_test, rf_preds))

print(f'The accuracy score is: {accuracy_score(y_test, rf_preds)} and the f1 score is {f1_score(y_test, rf_preds)}')
print(f'The recall score is: {recall_score(y_test, rf_preds)}')
print('\n')
print(f'{classification_report(y_test, rf_preds)}')
# Stored as `matrix` because the following cell draws it as a heatmap
matrix = confusion_matrix(y_test, rf_preds)
print(matrix)

In [None]:
# Draw the confusion matrix of the random forest trained on the selected
# features. scikit-learn's confusion_matrix puts true labels on the rows and
# predicted labels on the columns, so the axes are labelled accordingly.
ax = sns.heatmap(matrix, annot=True, fmt="d")
ax.set(title='Confusion matrix: Random Forest (top features)',
       xlabel='Predicted label', ylabel='True label')
plt.show()

### Parameter Tuning

In [None]:
# Previewing the parameters to tune
#
RandomForestClassifier()


In [None]:
# Creating a dictionary of parameters to tune
#
params = {'n_estimators': [10, 20, 30, 50, 100],
          'max_depth': [1, 2, 3, 4, 5]}

# Setting the number of folds to 10 and instantiating the model.
# Both the shuffled fold assignment and the forest are seeded so the search
# reports the same best parameters on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=101)
model = RandomForestClassifier(random_state=101)

search = GridSearchCV(model, param_grid=params, scoring = 'f1', cv = kfold)

# Fitting the grid search on the training split only, so the held-out test
# rows never influence which parameters are chosen
#
search.fit(X_train, y_train)

# Checking for the best parameters
#
print(f'The best parameters are: {search.best_params_}')

In [None]:
X.shape, y.shape

In [None]:
# Applying the best parameters to the model


# Splitting the data into training and testing sets
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

# Define the model from whatever the grid search actually selected, instead of
# hard-coded values that can drift from the search result. random_state is
# fixed so the scores below are the same on each run.
model = RandomForestClassifier(**search.best_params_, random_state = 101)

# fit the model on the training split
model.fit(X_train, y_train)

# evaluate the model on the held-out test split
rf_preds = model.predict(X_test)
print('Accuracy : ',accuracy_score(y_test, rf_preds))

print(f'The accuracy score is: {accuracy_score(y_test, rf_preds)} and the f1 score is {f1_score(y_test, rf_preds)}')
print(f'The recall score is: {recall_score(y_test, rf_preds)}')
print('\n')
print(f'{classification_report(y_test, rf_preds)}')
# Stored as `matrix` because the following cell draws it as a heatmap
matrix = confusion_matrix(y_test, rf_preds)
print(matrix)

In [None]:
# Draw the confusion matrix of the tuned random forest. scikit-learn's
# confusion_matrix puts true labels on the rows and predicted labels on the
# columns, so the axes are labelled accordingly.
ax = sns.heatmap(matrix, annot=True, fmt="d")
ax.set(title='Confusion matrix: Random Forest (tuned)',
       xlabel='Predicted label', ylabel='True label')
plt.show()

*Parameter tuning has increased the f1 score*

### Cross Validation to check for the stability of the model

In [None]:
# Ten-fold cross validation of the current model on the full X and y,
# scored by F1
scores = cross_val_score(model, X, y, scoring = 'f1', cv = 10)

# Average of the per-fold scores
print(f'Mean of cross validation scores is {np.mean(scores)}')

# Spread of the per-fold scores around that average (standard deviation)
print(f'Standard deviation of the cross validation scores is {np.std(scores)}')

### Plotting ROC Index Curve and comparing AUC

In [None]:
# Compare the three classifiers with ROC curves and their AUC.
# y_test comes from the most recent split; log_preds and dc_preds were produced
# after an earlier split that used the same test_size and random_state.
# rf_preds is the output of the most recently fitted random forest.
# NOTE(review): log_preds / dc_preds / rf_preds are outputs of model.predict,
# i.e. hard class predictions rather than probability scores, so each curve is
# built from very few thresholds. Using predict_proba would need the logistic
# and decision-tree models kept under their own names, because `model` is
# reassigned by later cells.
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, log_preds)
auc_logistic = auc(logistic_fpr, logistic_tpr)

# `threshold` is overwritten by each roc_curve call and is not used below
decision_tree_fpr, decision_tree_tpr, threshold = roc_curve(y_test, dc_preds)
auc_decision_tree = auc(decision_tree_fpr, decision_tree_tpr)

random_forest_fpr, random_forest_tpr, threshold = roc_curve(y_test, rf_preds)
auc_random_forest = auc(random_forest_fpr, random_forest_tpr)



# One line per model, with its AUC shown in the legend entry
plt.figure(figsize=(5, 5), dpi=100)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)
plt.plot(decision_tree_fpr, decision_tree_tpr, marker='.', label='decision_tree (auc = %0.3f)' % auc_decision_tree)
plt.plot(random_forest_fpr, random_forest_tpr, marker='.', label='random_forest (auc = %0.3f)' % auc_random_forest)

plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')

plt.legend()

plt.show()

### From the above chart, Random Forest offers the best ROC and AUC performance