In [None]:
# %pip install -r requirements.txt

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
# Load the two CSV files the notebook works on; both paths are relative to the
# notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [6]:
# Single seed reused by the SMOTE resampler and the random forest defined later.
random_state = 42

# Handling Missing Values

In [None]:
# Print the frame summary; the per-column non-null counts are what the
# missing-value check below is based on.
train_df.info()

No missing values were found in `train_df`, so no imputation is needed.

# Handling Duplicate Values

In [None]:
# Count repeated records while ignoring the `id` column. If `id` is unique per
# row (presumably a row identifier - TODO confirm), leaving it in makes every
# row distinct and the count is 0 regardless of the actual data.
# errors="ignore" keeps the cell runnable after `id` has been dropped.
train_df.drop(columns=["id"], errors="ignore").duplicated().sum()


No duplicate rows were found in `train_df`.

# Looking at column distributions

In [None]:
# Columns plotted as bar charts. Besides the coded/binary attributes this also
# includes the per-semester curricular-unit counts, which are treated as
# discrete categories for plotting purposes.
categorical_columns = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",  # spelling matches the column name in the dataset
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]
# The ten curricular-unit count columns share one naming pattern:
# all 1st-semester measures first, then the same measures for the 2nd semester.
categorical_columns += [
    f"Curricular units {semester} sem ({measure})"
    for semester in ("1st", "2nd")
    for measure in (
        "credited",
        "enrolled",
        "evaluations",
        "approved",
        "without evaluations",
    )
]

# Continuous columns plotted as density curves.
numerical_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Age at enrollment",
    *(f"Curricular units {semester} sem (grade)" for semester in ("1st", "2nd")),
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

## Looking at distributions for numerical columns

In [None]:
# One figure per continuous feature: the overall density on the left and the
# density split by Target class on the right.
# A histogram with Freedman-Diaconis bin widths (2 * IQR * n ** (-1/3)) was the
# alternative tried earlier; the KDE version is the one kept.
for column in numerical_columns:
    print(column)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    overall_ax, per_target_ax = axes

    sns.kdeplot(data=train_df, x=column, fill=True, ax=overall_ax)
    overall_ax.set_title(column)

    sns.kdeplot(data=train_df, x=column, hue="Target", fill=True, ax=per_target_ax)
    per_target_ax.set_title(f"{column} per Target")

    fig.tight_layout()
    plt.show()

## Looking at distribution of categorical columns

In [None]:
# One figure per categorical feature: the overall share of each category on the
# left and the share of each category within every Target class on the right.
for column in categorical_columns:
    print(column)

    # Two panels side by side, both on the same 0-1 proportion scale.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    ratios = train_df[column].value_counts(normalize=True)
    sns.barplot(ax=axes[0], x=ratios.index, y=ratios.to_list())
    axes[0].set_title(column)
    axes[0].tick_params(axis="x", labelrotation=90)
    axes[0].set_ylim(0, 1)

    # Proportions are normalised within each Target group. Naming the values
    # before reset_index() avoids a name clash with the `column` index level on
    # pandas versions where value_counts() keeps the original column name.
    grouped_counts = (
        train_df.groupby("Target")[column]
        .value_counts(normalize=True)
        .rename("Proportion")
    )
    flattened_df = grouped_counts.reset_index()
    flattened_df.columns = ["Target", column, "Proportion"]
    # Draw on the right-hand panel explicitly instead of relying on pyplot's
    # implicit "current axes".
    sns.barplot(
        ax=axes[1],
        x=flattened_df[column],
        y=flattened_df["Proportion"],
        hue=flattened_df["Target"],
    )
    axes[1].set_title(f"{column} per Target")
    axes[1].tick_params(axis="x", labelrotation=90)
    axes[1].set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

# Drop unneeded columns, like id

In [None]:
# Remove columns that are not used as model features.
# drop() returns a new frame and errors="ignore" skips columns that are already
# gone, so this cell can be re-run without raising KeyError.
columns_to_drop = ["id"]
train_df = train_df.drop(columns=columns_to_drop, errors="ignore")
test_df = test_df.drop(columns=columns_to_drop, errors="ignore")

# Handling class imbalance

In [None]:
# Share of training rows in each Target class, used to judge class balance.
normalized_proportions = train_df["Target"].value_counts(normalize=True)
normalized_proportions

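The `Target` classes are imbalanced, so the minority classes are oversampled with SMOTE inside the modelling pipeline below.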

In [7]:
# Define Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the training frame into the label and the feature matrix.
y = train_df["Target"]
X = train_df.drop(columns=["Target"])

# Oversampling is a pipeline step rather than a one-off transform of the data,
# so it is fitted together with the model on whatever data the pipeline is
# fitted on. Both steps share the notebook-wide seed.
# NOTE(review): per the imbalanced-learn docs, samplers are applied only during
# fit, not during predict/score - confirm for the installed version.
over = SMOTE(random_state=random_state, sampling_strategy="auto")
model = RandomForestClassifier(random_state=random_state)
pipeline = ImbPipeline([("over", over), ("model", model)])

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Hyper-parameter grid for the "model" step of the pipeline (the "model__"
# prefix routes each entry to that step).
param_grid = {
    "model__n_estimators": [5],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": [3],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

X = train_df.drop(columns="Target")
y = train_df["Target"]

# Hold out 20% of the labelled data for validation; the split uses the
# notebook-wide seed instead of a second hard-coded 42.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

grid_search.fit(X_train, y_train)

# Print the best parameters and best (cross-validated) score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)

# NOTE: despite the printed label, this score is measured on the hold-out
# validation split (X_val / y_val); test_df is only used for prediction.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_val, y_val)
print("Test accuracy: ", test_accuracy)

# The `id` column is dropped from test_df earlier in the notebook, so it is
# re-read from the CSV when it is no longer present.
id_column = "id"
if id_column in test_df.columns:
    test_ids = test_df[id_column]
else:
    test_ids = pd.read_csv("test.csv", usecols=[id_column])[id_column]

# Predict on exactly the feature columns the model was fitted on, in that order.
test_predictions = best_model.predict(test_df[X.columns])

prediction_df = pd.DataFrame({"id": test_ids, "Target": test_predictions})

# Save the DataFrame to a CSV file
prediction_df.to_csv("test_predictions.csv", index=False)

Best parameters found:  {'model__criterion': 'entropy', 'model__max_depth': 3, 'model__n_estimators': 5}
Best accuracy:  0.7822231356713637
Test accuracy:  0.78659174072138
