In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier

In [5]:
# Step 2: Load Data
# All four CSV files are read from a single directory. '/content' is kept as
# the default so existing runs behave the same; change DATA_DIR in this one
# place to run the notebook with the files stored somewhere else.
DATA_DIR = '/content'

train_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')
submission_format = pd.read_csv(f'{DATA_DIR}/submission_format.csv')

In [6]:
# Step 3: Preprocess Data
# Join each feature row to its labels; respondent_id is the shared key.
train_data = pd.merge(train_features, train_labels, on='respondent_id')

# Targets are the two vaccine columns; features are everything except the
# targets and the respondent identifier.
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])

# Split the feature columns by dtype: object columns are treated as
# categorical, all remaining columns as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [7]:
# Step 4: Train-Test Split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Step 5: Build the Model
# The random forest is wrapped in MultiOutputClassifier so that both label
# columns can be predicted by a single estimator; preprocessing runs first
# inside the same pipeline, so it is fitted only on the data passed to fit().
model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        MultiOutputClassifier(
            RandomForestClassifier(random_state=42)
        ),
    ),
])


In [9]:
# Step 6: Train the Model
model.fit(X_train, y_train)

# Step 7: Evaluate the Model
# predict_proba is indexed below as one probability array per target, in the
# order of the label columns. Column 1 of each array is used as the
# positive-class probability (assumes binary 0/1 labels - confirm).
y_pred = model.predict_proba(X_val)
y_pred_df = pd.DataFrame({
    target: y_pred[position][:, 1]
    for position, target in enumerate(['xyz_vaccine', 'seasonal_vaccine'])
})

# Score each target separately, then average the two AUCs.
roc_auc_xyz = roc_auc_score(
    y_val['xyz_vaccine'], y_pred_df['xyz_vaccine']
)
roc_auc_seasonal = roc_auc_score(
    y_val['seasonal_vaccine'], y_pred_df['seasonal_vaccine']
)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Mean ROC AUC: 0.8606129501926918


In [11]:
# Step 8: Predict on the Test Set and Write the Submission File
# The identifier column is not a feature, so it is dropped before predicting.
final_predictions = model.predict_proba(test_features.drop(columns=['respondent_id']))

# final_predictions holds one probability array per target; column 1 of each
# is used as the positive-class probability (assumes binary 0/1 labels).
# NOTE(review): respondent_id is taken from submission_format, which assumes
# its rows are in the same order as test_features - confirm.
submission_df = pd.DataFrame({
    'respondent_id': submission_format['respondent_id'],
    'xyz_vaccine': final_predictions[0][:, 1],
    'seasonal_vaccine': final_predictions[1][:, 1]
})

submission_df.to_csv('submission.csv', index=False)

# `files` was previously used without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel. Import it here, and only trigger
# the browser download when the Colab helper is actually available.
try:
    from google.colab import files
except ImportError:
    print("Submission file saved as submission.csv")
else:
    # Directly download the submission file
    files.download('submission.csv')
    print("Submission file saved and downloaded as submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Submission file saved and downloaded as submission.csv
