In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression


In [5]:
# Load the four CSV files this notebook works with. The paths are relative,
# so they are resolved against the current working directory.
training_set_features, training_set_labels, test_set_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

In [6]:
# Display the column index of every loaded frame as a single tuple, in the
# order: training features, training labels, test features, submission format.
tuple(
    frame.columns
    for frame in (
        training_set_features,
        training_set_labels,
        test_set_features,
        submission_format,
    )
)

(Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
        'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands',
        'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
        'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
        'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
        'education', 'race', 'sex', 'income_poverty', 'marital_status',
        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
        'household_adults', 'household_children', 'employment_industry',
        'employment_occupation'],
       dtype='object'),
 Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object'),
 Index(['respondent_id', 'xyz_co

In [7]:
# Attach the labels to the features via respondent_id.
# validate='one_to_one' makes pandas raise MergeError when respondent_id is
# duplicated in either frame; without it such duplicates would silently
# multiply rows in the training set.
df_train = pd.merge(
    training_set_features,
    training_set_labels,
    on='respondent_id',
    validate='one_to_one',
)

# pd.merge defaults to an inner join, which drops respondents that are missing
# from either frame. Fail loudly instead of training on a truncated set.
assert len(df_train) == len(training_set_features), (
    f"merge kept {len(df_train)} of {len(training_set_features)} feature rows; "
    "some respondent_id values have no matching label row"
)

# Features: everything except the identifier and the two target columns.
X = df_train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
# Targets: both vaccine columns, kept together as a two-column frame.
y = df_train[['xyz_vaccine', 'seasonal_vaccine']]

# Split the data into training and validation sets
# (20% held out; random_state fixed so the split is reproducible).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical. A column with any other
# dtype lands in neither list and is therefore not passed to a transformer.
categorical_cols = [cname for cname, dtype in X.dtypes.items() if dtype == "object"]
numerical_cols = [cname for cname, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

# Numerical branch: fill missing values with the column median, then
# standardise the column.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
# NOTE(review): no imputer is configured here, so missing categorical values
# reach OneHotEncoder as-is — confirm the installed scikit-learn version
# handles NaN in the way you intend.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [10]:
import warnings

# Candidate estimators, keyed by the name used when reporting results.
models = {'Logistic Regression': LogisticRegression()}

# Train and evaluate each model
for name, model in models.items():
    # The estimator is wrapped in MultiOutputClassifier so both target columns
    # of y_train are handled by a single pipeline.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', MultiOutputClassifier(model))])

    # Silence warnings only while fitting. A bare warnings.filterwarnings('ignore')
    # would mute every warning for the rest of the kernel session, hiding
    # problems in unrelated cells.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        clf.fit(X_train, y_train)

    # Predict on validation set: one probability array per target, indexed
    # below as [0] for xyz_vaccine and [1] for seasonal_vaccine.
    y_valid_pred = clf.predict_proba(X_valid)

    # Calculate ROC AUC per target from column 1 of each probability array
    # (assumes both targets are 0/1 encoded so column 1 is the positive
    # class — TODO confirm). roc_auc_score's multi_class option is not passed
    # because it only applies to multiclass targets.
    roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred[0][:, 1])
    roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred[1][:, 1])
    mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

    print(f'\n{name} Results:')
    print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
    print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
    print(f'Mean ROC AUC: {mean_roc_auc}')


Logistic Regression Results:
ROC AUC for xyz_vaccine: 0.8317947457327989
ROC AUC for seasonal_vaccine: 0.8560944283113324
Mean ROC AUC: 0.8439445870220657


In [11]:
import warnings

from sklearn.base import clone

best_model = 'Logistic Regression'

# Build the final pipeline around a fresh, unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object would overwrite the state of any other pipeline that already holds
# it (for example one fitted on the training split only).
best_clf = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                            ('classifier', MultiOutputClassifier(models[best_model]))])

# Refit on all labelled data. Warnings are silenced only for the duration of
# the fit rather than globally for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    best_clf.fit(X, y)

# The identifier is not a feature; drop it before predicting.
X_test = test_set_features.drop(columns=['respondent_id'])
# One probability array per target, in the column order of y.
y_test_pred = best_clf.predict_proba(X_test)

# Column 1 of each array is used as the predicted probability of the positive
# class (assumes 0/1 encoded targets — TODO confirm).
df_submission = pd.DataFrame({
    'respondent_id': test_set_features['respondent_id'],
    'xyz_vaccine': y_test_pred[0][:, 1],
    'seasonal_vaccine': y_test_pred[1][:, 1]
})
# NOTE(review): submission_format is loaded earlier but never compared with
# this frame — consider checking column names and respondent_id order against it.
submission_file_path = "submission.csv"
df_submission.to_csv(submission_file_path, index=False)