In [None]:
!pip install xgboost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import ensemble
from sklearn.metrics import (accuracy_score,
                             mean_absolute_error,
                             mean_squared_error,
                             precision_score,
                             recall_score,
                             f1_score,
                             roc_auc_score,
                             roc_curve,
                             auc)

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.utils.fixes import parse_version

# Model Prediction

In [None]:
# Feature tables for the train / validation / test splits.
# The first CSV column is used as the row index of each frame.
input_train = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_train.csv",
    index_col=0,
)
input_validate = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_validate.csv",
    index_col=0,
)
input_test = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_test.csv",
    index_col=0,
)

# Regression targets: only the "phq_sum" column of each output file is kept.
# NOTE(review): unlike the inputs, these files are read without index_col, so
# features and targets are presumably matched by row order — confirm that the
# input and output files list their rows in the same order.
output_train = (
    pd.read_csv("Training_Validation_Test_Datasets/task2_output_train.csv")
    .loc[:, "phq_sum"]
)
output_validate = (
    pd.read_csv("Training_Validation_Test_Datasets/task2_output_validate.csv")
    .loc[:, "phq_sum"]
)
output_test = (
    pd.read_csv("Training_Validation_Test_Datasets/task2_output_test.csv")
    .loc[:, "phq_sum"]
)

## 1. Gradient Boosting

In [None]:
# Hyperparameters for the scikit-learn gradient-boosting regressor.
lr = 0.1
n = 50
depth = 4
min_sample = 500

params = {"n_estimators": n,
          "max_depth": depth,
          "min_samples_split": min_sample,
          "learning_rate": lr,
          "loss": "squared_error",
          "random_state": 42}

# Fit on the training split; the target is passed as a plain NumPy array.
reg = GradientBoostingRegressor(**params)
reg.fit(input_train, output_train.values)

# Score on the validation split and report the errors, so the numbers are
# visible in the notebook instead of being computed and silently discarded.
# (mean_absolute_error comes from the sklearn.metrics import in the top cell.)
predictions = reg.predict(input_validate)
mse = mean_squared_error(output_validate, predictions)
mae = mean_absolute_error(output_validate, predictions)
print(f"Gradient Boosting validation MSE: {mse:.4f}")
print(f"Gradient Boosting validation MAE: {mae:.4f}")

# Test-split predictions kept for later comparison with the other models.
y_pred_gb = reg.predict(input_test)

## 2. XGBoost

In [None]:
# Wrap each split in XGBoost's native DMatrix container.
dtrain = xgb.DMatrix(input_train, label=output_train)
dvalidate = xgb.DMatrix(input_validate, label=output_validate)
dtest = xgb.DMatrix(input_test, label=output_test)

params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "eta": 0.06,
        "max_depth": 3,
        "subsample": 0.6,
        "colsample_bytree": 0.2,
    }

# Early stopping is driven by the LAST entry of `evals`, so the held-out
# validation split goes last. Monitoring only the training set (as before)
# makes early stopping ineffective, because training RMSE keeps improving.
evals = [(dtrain, "train"), (dvalidate, "validate")]
model = xgb.train(params, dtrain, num_boost_round=500,
                  evals=evals, early_stopping_rounds=50, verbose_eval=100)

# xgb.train returns the booster from the last round, not the best one, so
# prediction is restricted to the rounds up to the best validation score.
y_pred_xgb = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

## 3. Random Forest

In [None]:
# Fixed hyperparameters for the random forest (single values, not a search grid).
param_grid = {"n_estimators": 50, "max_depth": 10}

# Build the forest from those settings with a fixed seed, fit it on the
# training split, then predict the test split.
rf_best_model = RandomForestRegressor(random_state=42, **param_grid)
rf_best_model.fit(input_train, output_train)

y_pred_rf = rf_best_model.predict(input_test)

## 4. Neural Networks

# Binary Task

In [None]:
# Binary task for random forest
#
# Thresholds the true PHQ sums and the random-forest regression output at
# PHQ_CUTOFF to get 0/1 labels, then reports AUROC and accuracy on the test split.

PHQ_CUTOFF = 10  # values >= this threshold are mapped to class 1

# Work on flat NumPy arrays: test targets and the forest's test predictions.
y_test = np.array(output_test).ravel()
y_pred_rf = np.asarray(y_pred_rf)

# Convert predictions and true labels to binary
y_test_binary = np.where(y_test >= PHQ_CUTOFF, 1, 0)
y_pred_binary = np.where(y_pred_rf >= PHQ_CUTOFF, 1, 0)

# AUROC ranks samples by the continuous prediction; accuracy compares the
# thresholded labels.
fpr, tpr, _ = roc_curve(y_test_binary, y_pred_rf)
roc_auc = auc(fpr, tpr)
accuracy = accuracy_score(y_test_binary, y_pred_binary)

print(f"AUROC: {roc_auc:.6f}")
print(f"Accuracy: {accuracy:.6f}")

# Plot the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, label=f"Random Forest (AUROC = {roc_auc:.6f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUROC Curve for Random Forest")
ax.legend(loc="lower right")
ax.grid(alpha=0.4)
plt.show()

In [None]:
# Binary task for xgboost
#
# Thresholds the true PHQ sums and the XGBoost regression output at PHQ_CUTOFF
# to get 0/1 labels, then reports AUROC and accuracy on the test split.

PHQ_CUTOFF = 10  # values >= this threshold are mapped to class 1

# Build the flat label array here instead of relying on `y_test` left over from
# the random-forest cell, and leave the loaded `output_test` untouched so that
# earlier cells still see the object they originally loaded.
y_test = np.array(output_test).ravel()
y_pred_xgb = np.asarray(y_pred_xgb)

# Convert predictions and true labels to binary
y_test_binary = np.where(y_test >= PHQ_CUTOFF, 1, 0)
y_pred_binary = np.where(y_pred_xgb >= PHQ_CUTOFF, 1, 0)

# AUROC ranks samples by the continuous prediction; accuracy compares the
# thresholded labels.
fpr, tpr, _ = roc_curve(y_test_binary, y_pred_xgb)
roc_auc = auc(fpr, tpr)
accuracy = accuracy_score(y_test_binary, y_pred_binary)

print(f"AUROC: {roc_auc:.6f}")
print(f"Accuracy: {accuracy:.6f}")

# Plot the ROC curve against the chance diagonal
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, label=f"XGBoost (AUROC = {roc_auc:.6f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guess")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUROC Curve for XGBoost")
ax.legend(loc="lower right")
ax.grid(alpha=0.4)
plt.show()

In [None]:
# Plot the ROC curve and compute ROC AUC with roc_auc_score.
#
# The notebook fits no logistic regression and never defines probability
# scores, so the XGBoost test predictions are used as the ranking score.
# They are continuous regression outputs, not calibrated probabilities.
# NOTE(review): swap in another model's scores here if that was the intent.
y_pred_probs = y_pred_xgb

# roc_curve / roc_auc_score need binary ground truth, so the thresholded labels
# are used; `y_test` holds the raw PHQ sums.
fpr, tpr, thresholds = roc_curve(y_test_binary, y_pred_probs)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('XGBoost ROC Curve')
plt.show()

# ROC AUC in scikit-learn, computed directly from labels and scores
print(roc_auc_score(y_test_binary, y_pred_probs))

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    auc
)

# Inputs expected from the cells above:
#   y_test_binary - true test labels (0/1)
#   y_pred_binary - labels predicted by the model (0/1)
#   y_pred_xgb    - continuous model output, used as the score for the ROC curve

# ROC curve and the area under it
fpr, tpr, _ = roc_curve(y_test_binary, y_pred_xgb)
roc_auc = auc(fpr, tpr)

# Accuracy, precision, recall and F1 all compare the same pair of label arrays
accuracy, precision, recall, f1 = (
    metric(y_test_binary, y_pred_binary)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results, one metric per line
print(
    f"Accuracy : {accuracy:.6f}",
    f"Precision: {precision:.6f}",
    f"Recall   : {recall:.6f}",
    f"F1 Score : {f1:.6f}",
    f"AUROC    : {roc_auc:.6f}",
    sep="\n",
)
