# Setup

## Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import scipy as sp

In [2]:
print("NumPy: %s | Pandas: %s | Scikit-learn: %s | SciPy: %s" % (np.__version__, pd.__version__, sk.__version__, sp.__version__))

NumPy: 1.24.3 | Pandas: 2.0.2 | Scikit-learn: 1.2.2 | SciPy: 1.10.1


## Data loading

In [3]:
data = "./data/train.csv"
data_cleaned = "./data/train_cleaned.csv"
df = pd.read_csv(data)
df_cleaned = pd.read_csv(data_cleaned)

In [4]:
df.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [5]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,id,Application mode,Course,Previous qualification (grade),Admission grade,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Target
0,1,0,1,9238,126.0,122.6,1,0,1,18,6,6,6,14.5,6,7,6,12.428571,Graduate
1,2,1,17,9238,125.0,119.8,1,0,0,18,6,8,4,11.6,6,9,0,0.0,Dropout
2,3,2,17,9254,137.0,144.7,1,1,0,18,6,0,0,0.0,6,0,0,0.0,Dropout
3,4,3,1,9500,131.0,126.1,1,0,1,18,7,9,7,12.59125,8,11,7,12.82,Enrolled
4,5,4,1,9500,132.0,120.1,1,0,0,18,7,12,6,12.933333,7,12,6,12.933333,Graduate


# Machine Learning Approach

## Importing Scikit-learn base

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix

## Pipeline Declaration

### Data splitting

In [8]:
x = df.drop(columns=["id", "Target"])
y = df["Target"]

### Model

In [9]:
model = RandomForestClassifier()

### Cross-Validation algorithms

In [10]:
cv_outer = RepeatedStratifiedKFold(n_splits = 8, n_repeats = 2, random_state = 42)
cv_inner = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

### Recursive Features Elimination

In [11]:
rfe = RFECV(DecisionTreeClassifier(), cv = cv_inner, min_features_to_select = 10, scoring = "accuracy")

### Pipeline

In [12]:
pipeline = Pipeline(steps = [
    ("rfe", rfe), ("classifier", model)
])

### Nested (Double) Cross-Validation

In [13]:
params = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [10, 20],
}

search = GridSearchCV(pipeline, params, scoring = "accuracy", cv = cv_inner, n_jobs = 3, refit = True)

In [14]:
nested_scores = cross_val_score(search, x, y, scoring = "accuracy", cv = 10, n_jobs = 3)

In [16]:
print("Accuracy: %.3f (%.3f)" % (np.mean(nested_scores), np.std(nested_scores)))

Accuracy: 0.826 (0.004)


In [17]:
search.fit(x, y)

## Predict

### Get Data

In [35]:
test = pd.read_csv("data/test.csv")

In [43]:
X = test.drop(columns=["id"])
ans = test[["id"]]

### Get Estimator

In [40]:
estimator = search.best_estimator_

### Predicting

In [41]:
y_pred = estimator.predict(X)

In [42]:
y_pred

array(['Dropout', 'Graduate', 'Graduate', ..., 'Dropout', 'Dropout',
       'Dropout'], dtype=object)

### Saving results

In [51]:
ans.loc[:, "Target"] = y_pred

In [53]:
filepath = "./data/answer.csv"

In [54]:
ans.to_csv(filepath, index = False)