In [16]:
import json
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from time import time
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import math

column_names = "code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result"
dummies_col_names = "gender_1,gender_2,gender_3,gender_0,disability_1,disability_2,disability_0,age_band_1,age_band_2,age_band_3,age_band_0,highest_education_1,highest_education_2,highest_education_3,highest_education_4,highest_education_5,highest_education_0,imd_band_1,imd_band_2,imd_band_3,imd_band_4,imd_band_5,imd_band_6,imd_band_7,imd_band_8,imd_band_9,imd_band_10,imd_band_0,code_presentation_1,code_presentation_2,code_presentation_3,code_presentation_4,code_presentation_0,code_module_1,code_module_2,code_module_3,code_module_4,code_module_5,code_module_6,code_module_7,code_module_0,region_1,region_2,region_3,region_4,region_15,region_6,region_7,region_8,region_9,region_10,region_11,region_12,region_13,region_14,region_0"
categorical_col_names = "code_module,code_presentation,gender,region,highest_education,imd_band,age_band,disability"

column_names = column_names.split(',')
dummies_col_names = dummies_col_names.split(',')
categorical_col_names = categorical_col_names.split(',')


df = pd.read_csv('studentInfo.csv')
df = df.drop(['id_student'], axis=1)

class_counts = df['final_result'].value_counts()
print(class_counts)


final_result
Pass           12361
Withdrawn      10156
Fail            7052
Distinction     3024
Name: count, dtype: int64


In [17]:
# DATA.ENCODER()
# Open the JSON file
with open('encoding.json', 'r') as f:
        # Load the JSON file into a dictionary
        mappings = json.load(f)

# Replace NaN values with 'Unknown'
for column_name in column_names:
        if column_name in categorical_col_names:
            df[column_name] = df[column_name].fillna("Unknown")

# Replace values in each column based on the mappings
for column, mapping in mappings.items():
        if column in df.columns:
            df[column] = df[column].map(mapping)

In [18]:
# CREATE_RANDOMFOREST_MODEL()

#df = df.sample(frac=0.2)

grid = {
            'n_estimators': [int(x) for x in range(100, 1100, 200)],
            'max_depth': [int(x) for x in range(5, 20, 5)],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4, 0.1, 0.2, 0.3, 0.4, 0.5]
}

rf = RandomForestClassifier(n_jobs=-1)
model = GridSearchCV(estimator=rf, param_grid=grid, cv=3, n_jobs=-1, verbose=2)

In [19]:
def feature_engineering(df):
    """
    Perform feature engineering on the given DataFrame.
    Adds new features by combining existing features.

    <Feature: num_of_prev_attempts + studied_credits>, 
    <Feature: num_of_prev_attempts / studied_credits>, 
    <Feature: studied_credits / num_of_prev_attempts>, 
    <Feature: num_of_prev_attempts * studied_credits>, 
    <Feature: num_of_prev_attempts - studied_credits>,
    <Feature: num_of_prev_attempts ^ 2>,
    <Feature: studied_credits ^ 2>,
    <Feature: num_of_prev_attempts ^ 3>,
    <Feature: studied_credits ^ 3>,
    <Feature: log(num_of_prev_attempts)>,
    <Feature: log(studied_credits)>

    Args:
        df (pandas.DataFrame): The DataFrame to perform feature engineering on.

    Returns:
        pandas.DataFrame: The DataFrame with the new features added.
    """
    #Feature: num_of_prev_attempts + studied_credits
    df['num_of_prev_attempts + studied_credits'] = df['num_of_prev_attempts'] + df['studied_credits']

    #Feature: num_of_prev_attempts / studied_credits
    df['num_of_prev_attempts / studied_credits'] = np.where(df['studied_credits'] == 0, 0, df['num_of_prev_attempts'] / df['studied_credits'])

    #Feature: studied_credits / num_of_prev_attempts
    df['studied_credits / num_of_prev_attempts'] = np.where(df['num_of_prev_attempts'] == 0, 0, df['studied_credits'] / df['num_of_prev_attempts'])

    #Feature: num_of_prev_attempts * studied_credits
    df['num_of_prev_attempts * studied_credits'] = df['num_of_prev_attempts'] * df['studied_credits']

    #Feature: num_of_prev_attempts - studied_credits
    df['num_of_prev_attempts - studied_credits'] = df['num_of_prev_attempts'] - df['studied_credits']

    #Feature: num_of_prev_attempts ^ 2
    df['num_of_prev_attempts ^ 2'] = df['num_of_prev_attempts'] ** 2

    #Feature: studied_credits ^ 2
    df['studied_credits ^ 2'] = df['studied_credits'] ** 2

    #Feature: num_of_prev_attempts ^ 3
    df['num_of_prev_attempts ^ 3'] = df['num_of_prev_attempts'] ** 3

    #Feature: studied_credits ^ 3
    df['studied_credits ^ 3'] = df['studied_credits'] ** 3

    #Feature: log(num_of_prev_attempts)
    df['log(num_of_prev_attempts)'] = df['num_of_prev_attempts'].apply(lambda x: 0 if x == 0 else math.log(x))

    #Feature: log(studied_credits)
    df['log(studied_credits)'] = df['studied_credits'].apply(lambda x: 0 if x == 0 else math.log(x))

    return df

In [20]:
# TRAIN_MODEL()

time_start = time()
print("0-Dataset Head:")
print(df.head(3))
print()

# Split the dataset into features and target
# X has features minus the target column
X = df.drop(['final_result'], axis=1)
print("1/11-target droped from dataset")

if False:
    # Feature Engineering
    print("X columns: ", X.columns)
    print("X head: ", X.head(3))
    X = feature_engineering(X)
    print("1.5-Feature Engineering applied to dataset")


0-Dataset Head:
   code_module  code_presentation  gender  region  highest_education  \
0            1                  2       1       2                  4   
1            1                  2       2       1                  4   
2            1                  2       2      15                  3   

   imd_band  age_band  num_of_prev_attempts  studied_credits  disability  \
0        10         3                     0              240           1   
1         3         2                     0               60           1   
2         4         2                     0               60           2   

   final_result  
0             1  
1             1  
2             0  

1/11-target droped from dataset


In [21]:
# create_oneHotEncoder_and_encode()

def create_oneHotEncoder_and_encode(df):
    """
    Create a OneHotEncoder and fit it to the given DataFrame and encode the DataFrame.
    
    Parameters:
    df (pandas.DataFrame): The DataFrame to fit the encoder to and to encode.
    
    Returns:
    df_encoded (pandas.DataFrame): The DataFrame with the categorical columns encoded.
    """

    # Create a OneHotEncoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Fit the encoder using the larger dataset
    encoder.fit(df[categorical_col_names])

    # Save the encoder to a file
    #dump(encoder, 'OneHotEncoder.joblib')

    # Transform the DataFrame and convert the sparse matrix to a dense array
    df_encoded = pd.DataFrame(encoder.transform(df[categorical_col_names]).toarray())

    # Convert the data type of the columns to int (ou bool?)
    df_encoded = df_encoded.astype(bool)

    return df_encoded

In [22]:
# TRAIN_MODEL() CONTINUES

X = create_oneHotEncoder_and_encode(X)
print("2/11-encoder created and encode applied to dataset")

class_counts = df['final_result'].value_counts()
print()
print("Class counts: ")
print(class_counts)
print()

# y has the target column and is converted to binary
y = df['final_result']
print("3/11-target column extracted from dataset into y")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("4/11-dataset split into training and testing sets")

# Apply SMOTE to the training data
""" sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)
print("4.1-SMOTE applied to training data") """

print()
print("Head of X_train:")
print(X_train.head(2))
print()
#print("X Dtypes")
#print(X_train.dtypes)
#print()
print("Head of y_train:")
print(y_train.head(2))
print()
#print("Y Dtypes")
#print(y_train.dtypes)

# Train the model using the training sets
model.fit(X_train, y_train)
print("5/11-model trained")
time_fit = time()

# Predict the response for test dataset
y_pred = model.predict(X_test)
print("6/11-model prediction completed")

# Save the model
#database.update_model_file(database_name, model_id, model)
#print("7/11-model file updated")

# Store the evaluation in the database
matrix = confusion_matrix(y_test, y_pred)
#database.store_evaluation(database_name, model_id, matrix)
#print("8/11-evaluation stored in database")

# set model atribute is_trained to true
#database.set_model_trained(database_name, model_id)
#print("9/11-model trained set to true")

#if database.set_ds_train_id(database_name=database_name, model_id=model_id, ds_id=ds_id):
    #print("10/11-dataset train id set")
        
# Predict the probabilities for the test dataset
""" y_score = model.predict_proba(X_test)[:, 1]
vz.create_ROC(model_id, y_test, y_score)
vz.create_confusion_matrix(database_name, model_id)
vz.create_PRC(model_id, y_test, y_score)
print("11/11-visualizations created") """

# print f1 score
tn, fp, fn, tp = matrix.ravel()
    
# Convert numpy.int64 types to int
fp, fn, tp, tn = int(fp), int(fn), int(tp), int(tn)

precision = tp / float(tp + fp)
recall = tp / float(tp + fn)
f1 = 2 * (precision * recall) / float(precision + recall)
f1 = round(f1, 2)

scores = cross_val_score(model, X, y, cv=5)
print()
print("Cross-validated scores:", scores)

print()
print("Previous score: 0.53")
print("Current F1 Score: ", f1)
print()

time_end = time()
print(f"Time to fit: {(time_fit - time_start) / 60} minutes")
print(f"Time elapsed: {(time_end - time_start) / 60} minutes")
print("-train_model finished")

# save the parameters in a file
f1 = f1*100
with open(f'best_parameters_rf_score{f1}.json', 'w') as f:
    json.dump(model.best_params_, f)

2/11-encoder created and encode applied to dataset

Class counts: 
final_result
0    17208
1    15385
Name: count, dtype: int64

3/11-target column extracted from dataset into y
4/11-dataset split into training and testing sets

Head of X_train:
          0      1      2      3      4      5      6      7     8      9   \
24791  False  False  False  False  False   True  False  False  True  False   
3490   False   True  False  False  False  False  False  False  True  False   

       ...     37     38     39     40     41     42     43     44    45  \
24791  ...  False  False  False  False  False   True  False  False  True   
3490   ...  False  False  False  False  False  False   True  False  True   

          46  
24791  False  
3490   False  

[2 rows x 47 columns]

Head of y_train:
24791    1
3490     1
Name: final_result, dtype: int64

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
5/11-model trained
6/11-model prediction completed
Fitting 3 folds for each of 360 c