<a href="https://colab.research.google.com/github/kossi-26/Refractory_Machine-Learning/blob/Refactory-final-project/Group%201_data_science_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
import joblib
import warnings
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE

# --- Global notebook configuration ---------------------------------------
# One seed shared by every random number generator seeded in this cell.
SEED = 42

# NOTE: this silences *all* warnings (deprecations, convergence warnings, ...)
# to keep cell output readable; comment it out when debugging.
warnings.filterwarnings('ignore')

# Seed every library that draws random numbers. Seeding NumPy alone leaves
# TensorFlow/Keras operations (weight initialisation, dropout) unseeded, so
# neural-network results would differ between Restart & Run All executions.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Apply seaborn's default theme to all matplotlib figures.
sns.set_theme()

In [None]:
# Read the primary UCI dataset. The file uses ';' as the field separator,
# so it must be passed explicitly (the pandas default is ',').
df = pd.read_csv('data.csv', sep=';')

# Quick sanity check on the load: (rows, columns).
print(f'Original UCI data shape: {df.shape}')

Original UCI data shape: (4424, 37)


In [None]:
# Simulate four Tanzania-specific features on top of the UCI columns.
#
# Every urban/rural pair below is drawn for ALL rows and then selected per row
# with np.where, so the order of the np.random calls fixes the generated
# values. Keep the call order unchanged to reproduce the same columns.
np.random.seed(42)  # re-seed here so re-running this cell regenerates identical columns
n_students = len(df)

# Location: each student is Urban with probability 0.7, Rural with probability 0.3.
df['Location_Type'] = np.random.choice(['Urban', 'Rural'], size=n_students, p=[0.7, 0.3])
is_urban = df['Location_Type'] == 'Urban'

# Internet access score (lower in rural areas):
#   urban ~ Normal(0.6, 0.2) clipped to [0.1, 1.0]
#   rural ~ Normal(0.3, 0.15) clipped to [0.05, 0.8]
internet_urban = np.clip(np.random.normal(0.6, 0.2, n_students), 0.1, 1.0)
internet_rural = np.clip(np.random.normal(0.3, 0.15, n_students), 0.05, 0.8)
df['Internet_Access'] = np.where(is_urban, internet_urban, internet_rural).round(3)

# Teacher qualification score (lower in rural areas):
#   urban ~ Normal(0.75, 0.15) clipped to [0.4, 1.0]
#   rural ~ Normal(0.5, 0.2) clipped to [0.2, 0.9]
teacher_urban = np.clip(np.random.normal(0.75, 0.15, n_students), 0.4, 1.0)
teacher_rural = np.clip(np.random.normal(0.5, 0.2, n_students), 0.2, 0.9)
df['Teacher_Quality'] = np.where(is_urban, teacher_urban, teacher_rural).round(3)

# Distance to school in km (greater in rural areas):
#   urban ~ Gamma(shape=2, scale=0.5) clipped to [0.5, 5]
#   rural ~ Gamma(shape=3, scale=2) clipped to [1, 15]
distance_urban = np.clip(np.random.gamma(2, 0.5, n_students), 0.5, 5)
distance_rural = np.clip(np.random.gamma(3, 2, n_students), 1, 15)
df['School_Distance_km'] = np.where(is_urban, distance_urban, distance_rural).round(3)

# Four columns were added: Location_Type, Internet_Access, Teacher_Quality, School_Distance_km.
print(f'Enhanced dataset shape: {df.shape}')
df.head()

Enhanced dataset shape: (4424, 41)


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Location_Type,Internet_Access,Teacher_Quality,School_Distance_km
0,1,17,5,171,1,1,122.0,1,19,12,...,0.0,0,10.8,1.4,1.74,Dropout,Urban,0.415,0.704,1.86
1,1,15,1,9254,1,1,160.0,1,1,3,...,13.666667,0,13.9,-0.3,0.79,Graduate,Rural,0.381,0.792,7.205
2,1,1,5,9070,1,1,122.0,1,37,37,...,0.0,0,10.8,1.4,1.74,Dropout,Rural,0.249,0.451,9.939
3,1,17,2,9773,1,1,122.0,1,38,37,...,12.4,0,9.4,-0.8,-3.12,Graduate,Urban,0.557,0.821,0.55
4,2,39,1,8014,0,1,100.0,1,37,38,...,13.0,0,13.9,-0.3,0.79,Graduate,Urban,0.699,0.692,2.14


In [None]:
# Show basic stats of the enhanced dataset
df.describe().round(3)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Internet_Access,Teacher_Quality,School_Distance_km
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,1.179,18.669,1.728,8856.643,0.891,4.578,132.613,1.873,19.562,22.275,...,8.063,4.436,10.23,0.15,11.566,1.228,0.002,0.511,0.68,2.512
std,0.606,17.485,1.314,2063.566,0.312,10.217,13.188,6.915,15.603,15.343,...,3.948,3.015,5.211,0.754,2.664,1.383,2.27,0.225,0.193,2.956
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06,0.05,0.2,0.5
25%,1.0,1.0,1.0,9085.0,1.0,1.0,125.0,1.0,2.0,3.0,...,6.0,2.0,10.75,0.0,9.4,0.3,-1.7,0.344,0.562,0.618
50%,1.0,17.0,1.0,9238.0,1.0,1.0,133.1,1.0,19.0,19.0,...,8.0,5.0,12.2,0.0,11.1,1.4,0.32,0.507,0.7,1.214
75%,1.0,39.0,2.0,9556.0,1.0,1.0,140.0,1.0,37.0,37.0,...,10.0,6.0,13.333,0.0,13.9,2.6,1.79,0.673,0.825,2.941
max,6.0,57.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,33.0,20.0,18.571,12.0,16.2,3.7,3.51,1.0,1.0,15.0


## Data Dictionary: Enhanced Student Dropout Prediction Dataset
**Source:** UCI ML Repository + Simulated Tanzania-Specific Features

**Citation:** Realinho, V., Vieira Martins, M., Machado, J., & Baptista, L. (2021). *Predict Students' Dropout and Academic Success* [Dataset]. UCI Machine Learning Repository.  

**DOI:** https://doi.org/10.24432/C5MC89  

**Students:** 4,424 (before filtering to the binary target) | **Features:** 36 original + 4 simulated + 1 Target

## Target Variable
| Variable | Values | Description |
|----------|---------|-------------|
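| **Target** | Dropout, Graduate (encoded in column `y` as 0, 1) | `y`=0 for Dropout, `y`=1 for Graduate; rows with any other class of the original 3-class target are dropped before modeling |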

## New Simulated Features (Tanzania Context)
| Variable | Type | Values/Range | Description |
|----------|------|--------------|-------------|
| **Location_Type** | Categorical | Urban, Rural | School location type (70% Urban, 30% Rural) |
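| **Internet_Access** | Numerical | 0.05-1.0 | Internet access quality score on a 0-1 scale (0.0=None, 1.0=Perfect); simulated values are clipped to 0.1-1.0 for Urban and 0.05-0.8 for Rural |
| **Teacher_Quality** | Numerical | 0.2-1.0 | Teacher qualification score on a 0-1 scale (0.0=Poor, 1.0=Excellent); simulated values are clipped to 0.4-1.0 for Urban and 0.2-0.9 for Rural |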
| **School_Distance_km** | Numerical | 0.5-15 km | Distance from home to school in kilometers |

## Original Key Variables
- **Demographics:** Age at enrollment, Gender, Marital status, Nationality (spelled `Nacionality` in the dataset's column name)

- **Academic Background:** Previous qualification, Admission grade, Course

- **Academic Performance:** Curricular units approved/not approved, Grades

- **Financial:** Tuition fees up to date, Scholarship holder

- **Economic Context:** GDP, Unemployment rate

- **Family Background:** Mother's/Father's qualification and occupation

In [None]:
# Restrict the data to the two classes used for modeling.
is_binary_class = df['Target'].isin(['Graduate', 'Dropout'])
df = df.loc[is_binary_class].copy()  # .copy() so adding 'y' below writes to an independent frame

# Numeric label: Dropout -> 0, Graduate -> 1.
label_codes = {'Dropout': 0, 'Graduate': 1}
df['y'] = df['Target'].map(label_codes)

# Label vector, and feature matrix with both label columns removed.
y = df['y']
X = df.drop(columns=['Target', 'y'])