In [49]:
# import library and load dataset

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw, semicolon-separated CSV into the working DataFrame used by every later cell.
# NOTE(review): the info() listing printed for this frame appears to show trailing whitespace in
# the 'Daytime/evening attendance' header - confirm before selecting that column by name.
data_path = '../data/raw/data.csv'
data = pd.read_csv(data_path, sep=';')



In [50]:
# Initial Data Inspection
# DataFrame.info() writes its report straight to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line after the report.
print("Initial Data Overview:")
data.info()
print("Sample Data:\n", data.head())
print("Missing values per column:\n", data.isnull().sum())

Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualifica

In [51]:
# Encode Target Variable
# Encoding the target variable where 'Dropout' -> 0, 'Enrolled' and 'Graduate' -> 1
TARGET_ENCODING = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 1}

# Series.map() turns every value that is absent from the dict into NaN. Re-running this cell on
# an already-encoded column (or meeting an unexpected label) would therefore silently wipe the
# target, so only encode while raw labels remain and fail loudly on labels we do not know.
if not data['Target'].isin([0, 1]).all():
    encoded_target = data['Target'].map(TARGET_ENCODING)
    unmapped_mask = encoded_target.isna() & data['Target'].notna()
    if unmapped_mask.any():
        unknown_labels = sorted(data.loc[unmapped_mask, 'Target'].astype(str).unique())
        raise ValueError(f"Unexpected Target labels: {unknown_labels}")
    data['Target'] = encoded_target

# Rename 'Nacionality' to 'Nationality' (rename skips labels that are not present,
# so this stays safe when the cell is executed again)
data = data.rename(columns={'Nacionality': 'Nationality'})

# Drop Gender column to avoid potential discrimination.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(columns=['Gender'], errors='ignore')


In [52]:
# Tally students per nationality code, ordered by code rather than by frequency
nationality_counts = data['Nationality'].value_counts().sort_index()

# Show the heading followed by the tally, one per line
print("Number of students per nationality:", nationality_counts, sep="\n")

Number of students per nationality:
Nationality
1      4314
2         2
6        13
11        3
13        1
14        1
17        1
21        2
22       13
24        5
25        2
26       14
32        1
41       38
62        2
100       3
101       2
103       3
105       2
108       1
109       1
Name: count, dtype: int64


In [53]:
# Remove the Nationality column: the author treats it as redundant with the
# international-status feature (4314 of the 4424 rows share nationality code 1).
data = data.drop(columns='Nationality')

In [54]:
# Save Cleaned Data for EDA
# Persist the frame before any scaling is applied, so EDA reads the unscaled values.
eda_output_path = '../data/processed/data_for_eda.csv'
data.to_csv(path_or_buf=eda_output_path, index=False)
print(f"Unscaled cleaned dataset saved for EDA to {eda_output_path}")

Unscaled cleaned dataset saved for EDA to ../data/processed/data_for_eda.csv


In [55]:
# Step 5: Normalize Numerical Features

# Columns to standardize. Everything else in the frame (including 'Target') is collected in
# categorical_cols and is written out unchanged.
numerical_cols = ['Admission grade', 'Unemployment rate', 'Inflation rate', 'GDP', 'Previous qualification (grade)',
                  'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)',
                  'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)',
                  'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
                  'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)']
categorical_cols = [col for col in data.columns if col not in numerical_cols]

# The list above is hard-coded, so report exactly which names are absent if the raw schema
# drifts instead of surfacing a less specific error from the column selection below.
missing_cols = [col for col in numerical_cols if col not in data.columns]
if missing_cols:
    raise KeyError(f"Numerical columns missing from data: {missing_cols}")

# Normalize numerical columns only
# NOTE(review): the scaler is fit on the full dataset here, before any train/test split that
# happens downstream - confirm this is acceptable for the modeling step, or fit on the
# training portion only.
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Save the fully processed, normalized dataset for modeling
model_output_path = '../data/processed/data_cleaned_scaled.csv'
data.to_csv(model_output_path, index=False)
print(f"Fully processed and scaled dataset saved for modeling to {model_output_path}")

Fully processed and scaled dataset saved for modeling to ../data/processed/data_cleaned_scaled.csv


# Sweep the number of PCA components from 2 up to the number of columns in X and record the
# mean 10-fold cross-validated accuracy of a random forest trained on each projection.

# Fixed seed for the forest: without it every run (and every n) draws different trees, so
# differences between component counts are mixed with run-to-run noise and cannot be reproduced.
RANDOM_STATE = 42

n_components = np.arange(2, len(X.columns)+1)
scores = []
for n in n_components:
    # NOTE(review): PCA is fit on all of X_train before cross-validation, so every validation
    # fold has already influenced the projection. Wrapping PCA and the classifier in a single
    # Pipeline passed to cross_val_score would avoid that - confirm whether this matters here.
    pca = PCA(n_components=n)
    x_train_pca = pca.fit_transform(X_train)
    RandomForest = RandomForestClassifier(random_state=RANDOM_STATE)
    score = cross_val_score(RandomForest, x_train_pca, y_train, cv=10).mean()
    print("n_components: {}, score: {}".format(n, score))
    scores.append(score)

print(scores)
plt.plot(n_components, scores, marker='o')
plt.xlabel('PCA Components')
plt.ylabel('Accuracy')
plt.title('PCA Components vs Accuracy')
plt.show()