conda 22.9.0


In [52]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [53]:
# Name of the label column that is separated from the features below.
TARGET_COLUMN = 'is_corona_positive'

# Read the three pre-split partitions from the current working directory.
train_df = pd.read_csv("train_df.csv")
valid_df = pd.read_csv("valid_df.csv")
test_df = pd.read_csv("test_df.csv")  # loaded here; not split into X/y in this cell

# Features are every column except the label; the label is kept as its own Series.
X_train = train_df.drop(columns=[TARGET_COLUMN])
y_train = train_df[TARGET_COLUMN]

X_valid = valid_df.drop(columns=[TARGET_COLUMN])
y_valid = valid_df[TARGET_COLUMN]

In [55]:
%%time

from sklearn.metrics import f1_score

# Define the LightGBM model
model = lgb.LGBMClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'num_leaves': [20, 30, 40],
    'max_depth': [5, 10, 15],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=3,
    n_jobs=-1
)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train a new LightGBM model using the best hyperparameters
best_model = lgb.LGBMClassifier(random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Evaluate the best model on the validation set
y_pred_valid = best_model.predict(X_valid)
f1_score = f1_score(y_valid, y_pred_valid)

print("Best Hyperparameters:", best_params)
print("Best Weighted F1 score:", best_score)
print("Validation F1 score:", f1_score)



Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 15, 'num_leaves': 40, 'subsample': 0.8}
Best Weighted F1 score: 0.999711312603155
Validation F1 score: 0.7090374724467302
CPU times: user 14.4 s, sys: 3.56 s, total: 17.9 s
Wall time: 3min 42s


In [56]:
# Keyword arguments that both hand-picked configurations have in common.
shared_params = dict(learning_rate=0.1, num_leaves=40, subsample=0.8, random_state=42)

# Configuration named after ROC AUC -- presumably the winner of a search scored
# with ROC AUC that is not shown in this part of the notebook (TODO confirm).
best_roc_auc_model = lgb.LGBMClassifier(colsample_bytree=0.9, max_depth=10, **shared_params)

# Configuration named after weighted F1; only colsample_bytree and max_depth
# differ from the model above.
best_weighted_f1 = lgb.LGBMClassifier(colsample_bytree=1.0, max_depth=15, **shared_params)

In [60]:
from sklearn.metrics import classification_report

# Fit each candidate on the training data and print its per-class
# precision/recall/F1 report on the validation set, in the listed order.
labelled_models = (
    ("Best ROC AUC score", best_roc_auc_model),
    ("Best Weighted F1 score", best_weighted_f1),
)

for heading, classifier in labelled_models:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    print(heading)
    print(classification_report(y_valid, y_pred))

Best ROC AUC score
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     26009
           1       0.76      0.66      0.71      1461

    accuracy                           0.97     27470
   macro avg       0.87      0.82      0.85     27470
weighted avg       0.97      0.97      0.97     27470

Best Weighted F1 score
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     26009
           1       0.77      0.66      0.71      1461

    accuracy                           0.97     27470
   macro avg       0.87      0.82      0.85     27470
weighted avg       0.97      0.97      0.97     27470

