In [1]:
# Importing required libraries
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import time


In [2]:
# Import the student drop out dataset from UCI repo assign the feature matrix to 'X' and target labels to 'y'
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 
print(predict_students_dropout_and_academic_success.metadata) 
print(predict_students_dropout_and_academic_success.variables) 

{'uci_id': 697, 'name': "Predict Students' Dropout and Academic Success", 'repository_url': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success', 'data_url': 'https://archive.ics.uci.edu/static/public/697/data.csv', 'abstract': "A dataset created from a higher education institution (acquired from several disjoint databases) related to students enrolled in different undergraduate degrees, such as agronomy, design, education, nursing, journalism, management, social service, and technologies.\nThe dataset includes information known at the time of student enrollment (academic path, demographics, and social-economic factors) and the students' academic performance at the end of the first and second semesters. \nThe data is used to build classification models to predict students' dropout and academic sucess. The problem is formulated as a three category classification task, in which there is a strong imbalance towards one of the classes.", 'area': 'Social Sc

In [3]:
# Split the data into training and test dataset
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Training dataset sample count: {len(X_train)}')
print(f'Test dataset sample count: {len(X_test)}')

Training dataset sample count: 3539
Test dataset sample count: 885


  y = column_or_1d(y, warn=True)


In [4]:
# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=2, min_samples_split=10, random_state=42)  

start_time = time.time()

clf.fit(X_train, y_train)

dt_training_time = time.time() - start_time
   
print(f'Training time : {dt_training_time}')


Training time : 0.014155864715576172


In [5]:
# Using the trained model to predict the test data, measure accuracy and output the classification report. 
start_time = time.time()

y_pred = clf.predict(X_test)

prediction_time = time.time() - start_time

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred))

print(f'Prediction time : {prediction_time / y_pred.size }')


Accuracy: 73.56%
              precision    recall  f1-score   support

           0       0.88      0.65      0.75       316
           1       0.46      0.31      0.37       151
           2       0.72      0.95      0.82       418

    accuracy                           0.74       885
   macro avg       0.69      0.64      0.65       885
weighted avg       0.74      0.74      0.72       885

Prediction time : 1.8305697683560646e-06


In [6]:
# We are using GridSearchCV to get the better performing hyper parameters values for max_depth, min_samples_split, min_samples_leaf

from sklearn.model_selection import GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],        
    'max_depth': [None, 5, 10, 15, 20],     
    'min_samples_split': [2, 5, 10, 15],     
    'min_samples_leaf': [1, 2, 4, 6],       
    'max_features': [None, 'sqrt', 'log2'] 
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred.flatten())
print(f"Test Accuracy with Tuned Parameters: {accuracy * 100:.2f}%")

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_sp



In [7]:
min_values = X.min()  
max_values = X.max() 

min_max_df = pd.DataFrame({'min': min_values, 'max': max_values})

print(min_max_df)

                                                  min          max
Marital Status                                   1.00     6.000000
Application mode                                 1.00    57.000000
Application order                                0.00     9.000000
Course                                          33.00  9991.000000
Daytime/evening attendance                       0.00     1.000000
Previous qualification                           1.00    43.000000
Previous qualification (grade)                  95.00   190.000000
Nacionality                                      1.00   109.000000
Mother's qualification                           1.00    44.000000
Father's qualification                           1.00    44.000000
Mother's occupation                              0.00   194.000000
Father's occupation                              0.00   195.000000
Admission grade                                 95.00   190.000000
Displaced                                        0.00     1.00