# Exercise 1: Prediction Models

You will practice the basic steps to fit and to use a machine learning model.

In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

In [2]:
# Load data
X_train = pd.read_csv("ex1_train.csv", header=None)
X_test = pd.read_csv("ex1_test.csv", header=None)
y_train = pd.read_csv("ex1_class_train.csv", header=None)
y_test = pd.read_csv("ex1_class_test.csv", header=None)

# Part 1: Default XGBoost Classifier

**TODO: Fit the model and predict for test data in the following cell**

In [3]:
# 1) Create an XGBoost classifier instance
xgb_classifier = xgb.XGBClassifier()

# 2) Fit the classifier using X_train and y_train
xgb_classifier.fit(X_train, y_train)

# 3) Make prediction over X_test
y_pred_default = xgb_classifier.predict(X_test)

In [4]:
# Evaluate the default model
accuracy_default = accuracy_score(y_test, y_pred_default)
precision_default = precision_score(y_test, y_pred_default)
recall_default = recall_score(y_test, y_pred_default)
f1_default = f1_score(y_test, y_pred_default)

print("Default Model Performance:")
print(f"Accuracy: {accuracy_default:.4f}")
print(f"Precision: {precision_default:.4f}")
print(f"Recall: {recall_default:.4f}")
print(f"F1 Score: {f1_default:.4f}")

Default Model Performance:
Accuracy: 0.7032
Precision: 0.7216
Recall: 0.7186
F1 Score: 0.7201


You should achieve F1 score>0.65 to pass Part 1.

# Part 2: Hyperparameter Tuning with Cross-Validation

In [5]:
# Define candidate hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}

**TODO: Find the best hyperparameters and use them to fit an improved classifier in the following cell**

In [6]:
# 1) use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid)
grid_search.fit(X_train, y_train)

# 2) fit an XGBoost classifier using the best hyperparameters
best_xgb_classifier = xgb.XGBClassifier(**grid_search.best_params_)
best_xgb_classifier.fit(X_train, y_train)

# 3) make prediction over X_test. The prediction output should be named y_pred_tuned
y_pred_tuned = best_xgb_classifier.predict(X_test)

In [7]:
# Evaluate the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)

print("Tuned Model Performance:")
print(f"Accuracy: {accuracy_tuned:.4f}")
print(f"Precision: {precision_tuned:.4f}")
print(f"Recall: {recall_tuned:.4f}")
print(f"F1 Score: {f1_tuned:.4f}")

# Analysis
print(f"Improvement in F1 Score: {f1_tuned - f1_default:.4f}")

Tuned Model Performance:
Accuracy: 0.7186
Precision: 0.7276
Recall: 0.7519
F1 Score: 0.7396
Improvement in F1 Score: 0.0195


To pass Part 2, your new F1 score should be higher 0.65 and the one in Part 1.