In [41]:

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# IterativeImputer is experimental: the enabling import below must run BEFORE
# `from sklearn.impute import IterativeImputer`, otherwise that import raises
# ImportError on a fresh kernel.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Read the four competition CSV files from the working directory
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

# Attach the label columns to their feature rows, matching on respondent_id
train_data = train_features.merge(train_labels, on='respondent_id')

# Columns that are handled as categorical features
categorical_columns = [
    'age_group', 'education', 'race', 'sex',
    'income_poverty', 'marital_status', 'rent_or_own', 'employment_status',
    'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation',
]

# Every other column, once the identifier and both targets are excluded,
# is handled as a numerical feature (original column order is preserved)
excluded_columns = [*categorical_columns, 'respondent_id', 'xyz_vaccine', 'seasonal_vaccine']
numerical_columns = train_data.drop(columns=excluded_columns).columns

# Preprocessing: impute the numerical columns, one-hot encode the categorical ones
numeric_step = ('num', IterativeImputer(), numerical_columns)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Random forest classifier with a fixed seed
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Chain preprocessing and the classifier into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# Separate the model inputs from the two vaccine targets
target_names = ['xyz_vaccine', 'seasonal_vaccine']
targets = train_data[target_names]
features = train_data.drop(columns=['respondent_id', *target_names])

# Hold out 20% of the labelled rows for evaluation, using a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    random_state=42,
    test_size=0.2,
)

# Train on the training split, then score the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict_proba(X_test)

# y_pred is indexed per target; column 1 of each entry is taken as the
# probability used for scoring
xyz_pred, seasonal_pred = y_pred[0][:, 1], y_pred[1][:, 1]

# ROC AUC for each target, plus their unweighted mean
xyz_auc = roc_auc_score(y_test['xyz_vaccine'], xyz_pred)
seasonal_auc = roc_auc_score(y_test['seasonal_vaccine'], seasonal_pred)
mean_auc = (xyz_auc + seasonal_auc) / 2

# Report the three scores, one per line
print(
    f"XYZ Vaccine ROC AUC: {xyz_auc:.4f}",
    f"Seasonal Vaccine ROC AUC: {seasonal_auc:.4f}",
    f"Mean ROC AUC: {mean_auc:.4f}",
    sep="\n",
)

# Refit on every labelled row before scoring the competition test set.
# `pipeline` was fitted on the 80% training split only (which is what makes the
# held-out ROC AUC meaningful); predicting with it here would leave 20% of the
# labelled data unused by the model that produces the submission.
# clone() returns an unfitted copy with the same parameters, so the evaluated
# `pipeline` object is left untouched.
final_pipeline = clone(pipeline)
final_pipeline.fit(features, targets)

# Make predictions on the test set features (respondent_id is an identifier,
# not a model input, so it is dropped just as it was for training)
test_features_processed = test_features.drop(columns='respondent_id')
test_pred = final_pipeline.predict_proba(test_features_processed)

# Extract the predicted probabilities for each target variable.
# test_pred holds one array per target, in the column order of `targets`
# (xyz_vaccine first, seasonal_vaccine second); column 1 of each array is used.
test_xyz_pred = test_pred[0][:, 1]
test_seasonal_pred = test_pred[1][:, 1]

# Create submission file: one row per test respondent, in test_features order
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_xyz_pred,
    'seasonal_vaccine': test_seasonal_pred
})
submission.to_csv('submission.csv', index=False)



XYZ Vaccine ROC AUC: 0.8539
Seasonal Vaccine ROC AUC: 0.8557
Mean ROC AUC: 0.8548
