<a href="https://colab.research.google.com/github/jatinbansal21/datahacks/blob/main/DataHacks(gfg).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score



# Read the four competition CSV files (looked up relative to the current
# working directory) in one pass; the tuple order below fixes which file
# lands in which variable.
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

# # Display the first few rows of each dataset to ensure they loaded correctly
# print(train_features.head())
# print(train_labels.head())
# print(test_features.head())
# print(submission_format.head())

# Replace every missing cell in both feature tables with the sentinel -1.
# The frames are rebound to the filled copies rather than modified in place.
train_features = train_features.fillna(-1)
test_features = test_features.fillna(-1)

# Columns that hold category labels rather than numbers; each one is
# integer-encoded below before the frames are handed to LightGBM.
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
                        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
                        'employment_industry', 'employment_occupation']

# Integer-encode every categorical column with its own LabelEncoder.
#
# Each encoder is fitted on the labels of BOTH the training and the test
# frame: fitting on the training frame alone makes `transform` raise
# ValueError as soon as the test frame contains a category the training
# frame never showed. When the test frame has no such extra category the
# resulting codes are exactly the ones a train-only fit would produce.
#
# The fitted encoders are kept in `label_encoders` (column name -> encoder)
# so the integer codes can be mapped back to the original labels later.
label_encoders = {}
for col in categorical_features:
    # Cast to str first so that real labels and the numeric fill value used
    # for missing cells share one dtype and can be sorted by the encoder.
    train_features[col] = train_features[col].astype(str)
    test_features[col] = test_features[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_features[col], test_features[col]], ignore_index=True))

    train_features[col] = le.transform(train_features[col])
    test_features[col] = le.transform(test_features[col])
    label_encoders[col] = le

# Build the feature matrix (identifier column removed) and the two binary
# targets.
# NOTE(review): features and labels are paired purely by row position here;
# confirm that both CSV files list respondents in the same order.
X = train_features.drop(columns=['respondent_id'])
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']

# Split the features and BOTH targets in a single call. train_test_split
# applies one shared train/validation partition to every array it is given,
# so the two targets are guaranteed to line up with X_train / X_val without
# relying on a second call that repeats the same random_state.
# 80% of the rows go to training, 20% to validation.
X_train, X_val, y_train_xyz, y_val_xyz, y_train_seasonal, y_val_seasonal = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=42
)

# Booster parameters shared by both models: gradient-boosted trees on a
# binary objective, evaluated with AUC.
# Early stopping and periodic metric logging are NOT configured here:
# 'verbose_eval' is not a LightGBM parameter, so putting it in this dict
# never produced any logging. Both behaviours are requested through
# callbacks in the lgb.train() calls below instead.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

# Train model for xyz_vaccine.
# Training runs for at most 100 rounds and stops early once the validation
# AUC has not improved for 10 consecutive rounds; the metric is printed
# every 10 rounds. The callback objects are created fresh for each call
# because an early-stopping callback carries per-run state.
d_train_xyz = lgb.Dataset(X_train, label=y_train_xyz)
d_val_xyz = lgb.Dataset(X_val, label=y_val_xyz)
clf_xyz = lgb.train(
    params,
    d_train_xyz,
    num_boost_round=100,
    valid_sets=[d_train_xyz, d_val_xyz],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)

# Train model for seasonal_vaccine with the same settings and the same
# train/validation rows, only the target column differs.
d_train_seasonal = lgb.Dataset(X_train, label=y_train_seasonal)
d_val_seasonal = lgb.Dataset(X_val, label=y_val_seasonal)
clf_seasonal = lgb.train(
    params,
    d_train_seasonal,
    num_boost_round=100,
    valid_sets=[d_train_seasonal, d_val_seasonal],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)

# Score both models on the test rows (for the submission) and on the
# validation rows (for the AUC report). The identifier column is dropped so
# the test matrix has the same columns the models were trained on.
X_test = test_features.drop(columns=['respondent_id'])

# Each model predicts on the test matrix first, then on the validation matrix.
y_pred_xyz, y_pred_val_xyz = (
    clf_xyz.predict(frame) for frame in (X_test, X_val)
)
y_pred_seasonal, y_pred_val_seasonal = (
    clf_seasonal.predict(frame) for frame in (X_test, X_val)
)

# Validation ROC AUC for each target, computed from the held-out labels and
# the corresponding model's predictions on the validation rows.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_true, y_score)
    for y_true, y_score in (
        (y_val_xyz, y_pred_val_xyz),
        (y_val_seasonal, y_pred_val_seasonal),
    )
)

# Unweighted average of the two per-target scores.
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

# Report all three figures, one per line.
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

# Assemble the submission table: start from the test respondent ids, then
# attach one prediction column per target in the order
# respondent_id, xyz_vaccine, seasonal_vaccine.
submission = test_features[['respondent_id']].copy()
submission['xyz_vaccine'] = y_pred_xyz
submission['seasonal_vaccine'] = y_pred_seasonal

# Write the table without the DataFrame index so the file holds only the
# three columns above.
submission.to_csv('submission.csv', index=False)

[LightGBM] [Info] Number of positive: 4544, number of negative: 16821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 21365, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.212684 -> initscore=-1.308820
[LightGBM] [Info] Start training from score -1.308820
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[75]	training's auc: 0.889885	valid_1's auc: 0.871705
[LightGBM] [Info] Number of positive: 9984, number of negative: 11381
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`