In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm import SVC


I. Data preparation

In [19]:
# Loading data
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nationality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [21]:
df.isnull().sum()
# We don't have any null values in dataset, but to be safe still we will be using imputer in pipelines

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Nationality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrollment                                 0
International                                     0
Curricular u

In [22]:
# Divide dataset on features set and target
X = df.drop(["Target"],axis=1)
df["Target"] = LabelEncoder().fit_transform(df["Target"])
y = df["Target"]



In [23]:
numerical_features = [
    "Application order","Age at enrollment", "Curricular units 1st sem (credited)", "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)","Curricular units 1st sem (approved)","Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)","Curricular units 2nd sem (credited)","Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)","Curricular units 2nd sem (approved)", "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)","Unemployment rate","Inflation rate","GDP"
]
categorical_features = [
    "Marital status","Application mode","Course","Daytime/evening attendance","Previous qualification","Nationality",
    "Mother's qualification","Father's qualification","Mother's occupation","Father's occupation","Displaced",
    "Educational special needs","Debtor","Tuition fees up to date","Gender","Scholarship holder","International",
]


II. Pipelines

In [24]:
numerical_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

In [25]:
preprocessor = ColumnTransformer([
    ('num',numerical_pipeline,numerical_features),
    ('cat',categorical_pipeline,categorical_features)
])


In [26]:
# Regression pipeline
regression_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',LogisticRegression(max_iter=1000))
])

In [27]:
# Random forest pipeline
random_forest_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',RandomForestClassifier(n_estimators=300))
])

In [28]:
# SVR pipeline
svr_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',SVC(decision_function_shape='ovo'))
])

In [29]:
# Decision tree pipeline
decision_tree_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model',DecisionTreeClassifier(max_depth=17))
])

III. Training model


In [30]:
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state=42,test_size=0.3)

In [31]:
regression_pipeline.fit(X_train,y_train)
y_pred = regression_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7658132530120482
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.80      0.79       441
           1       0.57      0.32      0.41       245
           2       0.79      0.92      0.85       642

    accuracy                           0.77      1328
   macro avg       0.72      0.68      0.68      1328
weighted avg       0.75      0.77      0.75      1328



In [32]:
random_forest_pipeline.fit(X_train,y_train)
y_pred = random_forest_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7673192771084337
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80       441
           1       0.63      0.28      0.39       245
           2       0.76      0.94      0.84       642

    accuracy                           0.77      1328
   macro avg       0.74      0.67      0.68      1328
weighted avg       0.75      0.77      0.74      1328



In [33]:
svr_pipeline.fit(X_train,y_train)
y_pred = svr_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7733433734939759
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.77      0.80       441
           1       0.57      0.35      0.43       245
           2       0.78      0.94      0.85       642

    accuracy                           0.77      1328
   macro avg       0.73      0.69      0.70      1328
weighted avg       0.76      0.77      0.76      1328



In [34]:
decision_tree_pipeline.fit(X_train,y_train)
y_pred = decision_tree_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6927710843373494
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.73      0.72       441
           1       0.38      0.36      0.37       245
           2       0.79      0.79      0.79       642

    accuracy                           0.69      1328
   macro avg       0.63      0.63      0.63      1328
weighted avg       0.69      0.69      0.69      1328

