In [1]:
# Importing required libraries
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import time

In [2]:
# Import the student drop out dataset from UCI repo assign the feature matrix to 'X' and target labels to 'y'
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 
print(predict_students_dropout_and_academic_success.metadata) 
print(predict_students_dropout_and_academic_success.variables) 

{'uci_id': 697, 'name': "Predict Students' Dropout and Academic Success", 'repository_url': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success', 'data_url': 'https://archive.ics.uci.edu/static/public/697/data.csv', 'abstract': "A dataset created from a higher education institution (acquired from several disjoint databases) related to students enrolled in different undergraduate degrees, such as agronomy, design, education, nursing, journalism, management, social service, and technologies.\nThe dataset includes information known at the time of student enrollment (academic path, demographics, and social-economic factors) and the students' academic performance at the end of the first and second semesters. \nThe data is used to build classification models to predict students' dropout and academic sucess. The problem is formulated as a three category classification task, in which there is a strong imbalance towards one of the classes.", 'area': 'Social Sc

In [3]:
# We analyze the number of datapoints present for each class label 
label_counts = y.value_counts()
label_counts_df = pd.DataFrame(label_counts).reset_index()
label_counts_df.columns = ['target', 'count']

# Display the table
print(label_counts_df)

     target  count
0  Graduate   2209
1   Dropout   1421
2  Enrolled    794


In [4]:
# Split the data into training and test dataset
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  y = column_or_1d(y, warn=True)


In [5]:
# Initialize a randon forest classifier with 25 trees, min_sample_leaf =2, max_depth = 10, min_sample_split=2
random_forest_classifier = RandomForestClassifier(min_samples_leaf=2, max_depth=10, min_samples_split=2, n_estimators=25, random_state=142)

start_time = time.time()

random_forest_classifier.fit(X_train, y_train) 

dt_training_time = time.time() - start_time
   
print(f'Training time : {dt_training_time}')

Training time : 0.05907297134399414


In [6]:
# Using the trained model to predict the test data, measure accuracy and output the classification report. 
start_time = time.time()

y_pred = random_forest_classifier.predict(X_test)

prediction_time = time.time() - start_time

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred))

print(f'Prediction time : {prediction_time / y_pred.size }')

Accuracy: 75.82%
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       316
           1       0.53      0.27      0.36       151
           2       0.76      0.94      0.84       418

    accuracy                           0.76       885
   macro avg       0.70      0.65      0.66       885
weighted avg       0.74      0.76      0.74       885

Prediction time : 3.3683022536800404e-06


In [7]:
# We take the estimators object from the random forest classifier observe the accuracies of the top 10 and bottom 10 trees

top_10_trees = random_forest_classifier.estimators_[:10]
top_10_accuracies = []
for tree in top_10_trees:
    y_pred_top = tree.predict(X_test.values)
    accuracy_top = accuracy_score(y_test, y_pred_top)
    top_10_accuracies.append(accuracy_top)

print("Top 10 Trees Accuracies:", top_10_accuracies)
print("Average Accuracy of Top 10 Trees:", sum(top_10_accuracies) / len(top_10_accuracies))

bottom_10_trees = random_forest_classifier.estimators_[-10:]
bottom_10_accuracies = []
for tree in bottom_10_trees:
    y_pred_bottom = tree.predict(X_test.values)
    accuracy_bottom = accuracy_score(y_test, y_pred_bottom)
    bottom_10_accuracies.append(accuracy_bottom)

print("Bottom 10 Trees Accuracies:", bottom_10_accuracies)
print("Average Accuracy of Bottom 10 Trees:", sum(bottom_10_accuracies) / len(bottom_10_accuracies))


Top 10 Trees Accuracies: [0.688135593220339, 0.6824858757062147, 0.6768361581920904, 0.6677966101694915, 0.6903954802259887, 0.7016949152542373, 0.6802259887005649, 0.7028248587570621, 0.7073446327683616, 0.6734463276836158]
Average Accuracy of Top 10 Trees: 0.6871186440677965
Bottom 10 Trees Accuracies: [0.7107344632768362, 0.6858757062146893, 0.6757062146892655, 0.696045197740113, 0.6779661016949152, 0.6937853107344633, 0.6824858757062147, 0.6700564971751413, 0.6632768361581921, 0.6836158192090396]
Average Accuracy of Bottom 10 Trees: 0.683954802259887


In [8]:
# Initialize a randon forest classifier with 50 trees, min_sample_leaf =2, max_depth = 10, min_sample_split=2
random_forest_classifier = RandomForestClassifier(min_samples_leaf=2, max_depth=10, min_samples_split=2, n_estimators=50, random_state=142)

start_time = time.time()

random_forest_classifier.fit(X_train, y_train) 

dt_training_time = time.time() - start_time
   
print(f'Training time : {dt_training_time}')

Training time : 0.10665392875671387


In [9]:
# Using the trained model to predict the test data, measure accuracy and output the classification report. 
start_time = time.time()

y_pred = random_forest_classifier.predict(X_test)

prediction_time = time.time() - start_time
   
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred))
print(f'Prediction time : {prediction_time / y_pred.size }')

Accuracy: 76.38%
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       316
           1       0.56      0.28      0.37       151
           2       0.75      0.94      0.84       418

    accuracy                           0.76       885
   macro avg       0.72      0.66      0.67       885
weighted avg       0.75      0.76      0.74       885

Prediction time : 5.39230088056144e-06


In [10]:
# We take the estimators object from the random forest classifier observe the accuracies of the top 10 and bottom 10 trees

top_10_trees = random_forest_classifier.estimators_[:10]
top_10_accuracies = []
for tree in top_10_trees:
    y_pred_top = tree.predict(X_test.values)
    accuracy_top = accuracy_score(y_test, y_pred_top)
    top_10_accuracies.append(accuracy_top)

print("Top 10 Trees Accuracies:", top_10_accuracies)
print("Average Accuracy of Top 10 Trees:", sum(top_10_accuracies) / len(top_10_accuracies))

bottom_10_trees = random_forest_classifier.estimators_[-10:]
bottom_10_accuracies = []
for tree in bottom_10_trees:
    y_pred_bottom = tree.predict(X_test.values)
    accuracy_bottom = accuracy_score(y_test, y_pred_bottom)
    bottom_10_accuracies.append(accuracy_bottom)

print("Bottom 10 Trees Accuracies:", bottom_10_accuracies)
print("Average Accuracy of Bottom 10 Trees:", sum(bottom_10_accuracies) / len(bottom_10_accuracies))

Top 10 Trees Accuracies: [0.688135593220339, 0.6824858757062147, 0.6768361581920904, 0.6677966101694915, 0.6903954802259887, 0.7016949152542373, 0.6802259887005649, 0.7028248587570621, 0.7073446327683616, 0.6734463276836158]
Average Accuracy of Top 10 Trees: 0.6871186440677965
Bottom 10 Trees Accuracies: [0.6757062146892655, 0.6847457627118644, 0.6655367231638418, 0.6892655367231638, 0.6485875706214689, 0.6847457627118644, 0.6587570621468927, 0.6655367231638418, 0.6745762711864407, 0.6598870056497175]
Average Accuracy of Bottom 10 Trees: 0.6707344632768362


In [11]:
# We are using GridSearchCV to get the better performing hyper parameters values for max_depth, min_samples_split, min_samples_leaf
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [25, 50],            
    'max_depth': [4, 10, 15, 20, None],           
    'min_samples_split': [2, 5, 10],            
    'min_samples_leaf': [1, 2, 4],              
    'max_features': ['sqrt', 'log2', None]      
}


random_forest_classifier = RandomForestClassifier(random_state=142)
grid_search = GridSearchCV(estimator=random_forest_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


print("Best Parameters:", grid_search.best_params_)

best_random_forest_classifier = grid_search.best_estimator_
y_pred = best_random_forest_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Tuned Parameters: {accuracy * 100:.2f}%")

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=25; total time=   0.1s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=

In [12]:
# Training and testing a model with 100 tree to observe the effect of adding more tree
random_forest_classifier = RandomForestClassifier(min_samples_leaf=2, max_depth=10, min_samples_split=2, n_estimators=50, random_state=142)

start_time = time.time()
random_forest_classifier.fit(X_train, y_train) 
dt_training_time = time.time() - start_time
   
print(f'Training time : {dt_training_time}')

start_time = time.time()
y_pred = random_forest_classifier.predict(X_test)
prediction_time = time.time() - start_time
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(f'Prediction time : {prediction_time / y_pred.size }')

Training time : 0.11209416389465332
Accuracy: 76.38%
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       316
           1       0.56      0.28      0.37       151
           2       0.75      0.94      0.84       418

    accuracy                           0.76       885
   macro avg       0.72      0.66      0.67       885
weighted avg       0.75      0.76      0.74       885

Prediction time : 3.6611395367121292e-06


In [13]:
# Training and testing a model with 100 tree to observe the effect of adding more tree
random_forest_classifier = RandomForestClassifier(min_samples_leaf=2, max_depth=20, min_samples_split=2, n_estimators=50, max_features='log2', random_state=142)

start_time = time.time()
random_forest_classifier.fit(X_train, y_train) 
dt_training_time = time.time() - start_time
   
print(f'Training time : {dt_training_time}')

start_time = time.time()
y_pred = random_forest_classifier.predict(X_test)
prediction_time = time.time() - start_time
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))
print(f'Prediction time : {prediction_time / y_pred.size }')

Training time : 0.1221170425415039
Accuracy: 76.50%
              precision    recall  f1-score   support

           0       0.85      0.75      0.80       316
           1       0.49      0.30      0.37       151
           2       0.77      0.94      0.85       418

    accuracy                           0.76       885
   macro avg       0.70      0.66      0.67       885
weighted avg       0.75      0.76      0.75       885

Prediction time : 4.786690749691031e-06
