



# Most models covered in our classes and their optimizations

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [8]:
train_data = pd.read_csv("../data/preprocessed_data.csv")
test_data = pd.read_csv("../data/preprocessed_test_data.csv")

X_train = train_data.drop(["Depression"], axis=1)
y_train = train_data.Depression

### Majority Class Classifier

In [21]:
start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

y_pred = np.zeros(len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/majority_submission.csv", index=False)

### Default params K-Nearest Neighbour Classifier

In [9]:
# Training simple KNN Classifier

from sklearn.impute import SimpleImputer

# Impute with the mean value
imputer = SimpleImputer(strategy='mean')  
X_imputed = imputer.fit_transform(X_train)


knn = KNeighborsClassifier()
knn.fit(X_imputed, y_train)

In [10]:
test_imputed = imputer.fit_transform(test_data)
y_pred = knn.predict(test_imputed)

start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/knn_submission.csv", index=False)

### Default params DecisionTree

In [22]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(test_data)

start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/default_dt_submission.csv", index=False)

### Default params RandomForest

In [16]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(test_data)

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/default_rf_submission.csv", index=False)

### Default params XGBoost

In [17]:
xgbc = xgb.XGBClassifier()
xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(test_data)

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/default_xgb_submission.csv", index=False)

### Default params SVM

In [20]:
svc = SVC()
svc.fit(X_imputed, y_train)
y_pred = svc.predict(test_imputed)

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/default_svm_submission.csv", index=False)

# ------------------------------------------------

At this point, it's clear that ensemble classifiers do the best in this task, so the models worth optimizing are XGBoost and RandomForest. Another classifier that did well was SVM, but even the model with default parameters took over three and a half minutes to train.

# Hyperparameter tuning for best models

### XGBoost Classifier with tuned hyperparameters

In [23]:
# Training XGBoost Classifier

# Define the parameter grid
xgbc_params = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.3],  # 'eta' in native XGBoost
    "n_estimators": [100],  # Number of boosting rounds
    "objective": ["multi:softmax"],  # Multiclass classification
    "num_class": [3],               # Number of classes
    "subsample": [0.8, 1.0],             # Row sampling
    "colsample_bytree": [0.8, 1.0]       # Feature sampling
}

# Initialize the classifier
xgbc = xgb.XGBClassifier(eval_metric="mlogloss", num_class=2)

# Perform grid search
xgbc_cross = GridSearchCV(estimator=xgbc, param_grid=xgbc_params, cv=3, scoring="accuracy", verbose=1)
xgbc_cross.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [24]:
#y_predictions = xgbc_cross.predict(test_data)
#y_pred = y_predictions[:, 1]
y_pred = xgbc_cross.predict(test_data)

In [27]:
xgbc_cross.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.3,
 'max_depth': 3,
 'n_estimators': 100,
 'num_class': 3,
 'objective': 'multi:softmax',
 'subsample': 0.8}

In [12]:

start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("../submissions/xgbost_submission.csv", index=False)

### Random Forest Classifier with tuned hyperparameters

In [17]:
# Random Forest Classifier

rf = RandomForestClassifier(random_state=0)
rf_params = {
    "n_estimators": [100, 500],
    "max_depth": ["sqrt", "log2", None],
}

rf_cross = GridSearchCV(rf, rf_params)
rf_cross.fit(X_train, y_train)

20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/V

In [18]:
rf_cross.best_params_

{'max_depth': None, 'n_estimators': 500}

In [19]:
y_pred = rf_cross.predict(test_data)

In [20]:
start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": y_pred})
submission_df.to_csv("rf_submission.csv", index=False)

### Random Forest Classifier tuned by ChatGPT

In [22]:
# Random Forest ChatGPT

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Step 1: Define the data
X, y = X_train, y_train  # Replace with your training features and labels

# Step 2: Define a baseline model
rf = RandomForestClassifier(random_state=42)
baseline_score = np.mean(cross_val_score(rf, X, y, cv=3, scoring='accuracy'))
print(f"Baseline Accuracy: {baseline_score:.4f}")


# Step 5: Tune `min_samples_split`
min_samples_split_range = [2, 5, 10]
best_min_samples_split = 2

for split in min_samples_split_range:
    rf = RandomForestClassifier(
        n_estimators=500, max_depth=None, min_samples_split=split, random_state=42
    )
    score = np.mean(cross_val_score(rf, X, y, cv=3, scoring='accuracy'))
    print(f"min_samples_split={split}, Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_min_samples_split = split

print(f"Best min_samples_split: {best_min_samples_split}, Accuracy: {best_score:.4f}")

# Step 6: Continue for other hyperparameters (min_samples_leaf, max_features, etc.)
min_samples_leaf_range = [1, 2, 5, 10]
best_min_samples_leaf = 1

for leaf in min_samples_leaf_range:
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=best_min_samples_split,
        min_samples_leaf=leaf,
        random_state=42
    )
    score = np.mean(cross_val_score(rf, X, y, cv=3, scoring='accuracy'))
    print(f"min_samples_leaf={leaf}, Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_min_samples_leaf = leaf

print(f"Best min_samples_leaf: {best_min_samples_leaf}, Accuracy: {best_score:.4f}")

# Step 7: Tune max_features
max_features_range = ['sqrt', 'log2', None]
best_max_features = None

for feature in max_features_range:
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=best_min_samples_split,
        min_samples_leaf=best_min_samples_leaf,
        max_features=feature,
        random_state=42
    )
    score = np.mean(cross_val_score(rf, X, y, cv=3, scoring='accuracy'))
    print(f"max_features={feature}, Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_max_features = feature

print(f"Best max_features: {best_max_features}, Accuracy: {best_score:.4f}")


Baseline Accuracy: 0.9367
min_samples_split=2, Accuracy: 0.9372
min_samples_split=5, Accuracy: 0.9373
min_samples_split=10, Accuracy: 0.9375
Best min_samples_split: 10, Accuracy: 0.9375
min_samples_leaf=1, Accuracy: 0.9375
min_samples_leaf=2, Accuracy: 0.9376
min_samples_leaf=5, Accuracy: 0.9372
min_samples_leaf=10, Accuracy: 0.9370
Best min_samples_leaf: 2, Accuracy: 0.9376
max_features=sqrt, Accuracy: 0.9376
max_features=log2, Accuracy: 0.9376
max_features=None, Accuracy: 0.9360
Best max_features: None, Accuracy: 0.9376


In [23]:
best_rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=10, min_samples_leaf=2, max_features=None)
best_rf.fit(X_train, y_train)
best_rf_pred = best_rf.predict(test_data)


In [24]:
start_id = 140700
ids = np.arange(start_id, start_id + len(test_data))

submission_df = pd.DataFrame({"id": ids, "label": best_rf_pred})
submission_df.to_csv("rf_submission2.csv", index=False)