In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the three input tables (training features, training labels, test
# features) from the current working directory, in that order.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

In [3]:
# Attach the labels to the features by respondent_id (inner join, the
# pd.merge default). validate='one_to_one' makes pandas raise a MergeError if
# either table contains duplicate ids, instead of silently multiplying rows
# in the training set.
train_data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
    validate='one_to_one',
)

In [4]:
# Object-dtype columns are handled as categorical features.
categorical_cols = list(train_data.select_dtypes(include=['object']).columns)

# Numeric columns are the numerical features, minus the row identifier and
# the two label columns, which must not be fed to the model as inputs.
numerical_cols = (
    train_data
    .select_dtypes(include=['number'])
    .columns
    .drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
    .tolist()
)

In [5]:
# Numerical branch: fill missing values with the column median, then
# standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. Columns named in neither list are
# dropped, because ColumnTransformer's `remainder` is left at its default.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [6]:
# Wrap logistic regression so that one classifier is fitted per label column.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = MultiOutputClassifier(
    estimator=LogisticRegression(max_iter=1000),
)

In [7]:
# End-to-end estimator: preprocessing followed by the multi-output
# classifier, so both are fitted and applied as a single unit.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

In [8]:
# Labels: the two vaccine columns. Features: everything else except the
# respondent identifier.
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X = train_data.drop(columns=['respondent_id', *y.columns])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Fit the preprocessing steps and both classifiers on the training split only.
clf.fit(X=X_train, y=y_train)

In [10]:
# Class probabilities on the validation split. The result is indexed per
# label below: y_pred[0] for xyz_vaccine, y_pred[1] for seasonal_vaccine.
y_pred = clf.predict_proba(X=X_valid)

In [11]:
# Score each label separately, using column 1 of its probability array as
# the ranking score.
roc_auc_xyz = roc_auc_score(
    y_true=y_valid['xyz_vaccine'],
    y_score=y_pred[0][:, 1],
)
roc_auc_seasonal = roc_auc_score(
    y_true=y_valid['seasonal_vaccine'],
    y_score=y_pred[1][:, 1],
)

# Report the per-label scores and their unweighted mean, one per line.
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {(roc_auc_xyz + roc_auc_seasonal) / 2}',
    sep='\n',
)

ROC AUC for xyz_vaccine: 0.8313554194085168
ROC AUC for seasonal_vaccine: 0.8560669086421782
Mean ROC AUC: 0.8437111640253474


In [12]:
# Predict class probabilities for the test set. test_features is passed
# whole (respondent_id included); the pipeline's ColumnTransformer picks out
# only the feature columns it was configured with.
test_pred = clf.predict_proba(X=test_features)
test_pred

[array([[0.94996602, 0.05003398],
        [0.95363338, 0.04636662],
        [0.63295279, 0.36704721],
        ...,
        [0.8640632 , 0.1359368 ],
        [0.94009743, 0.05990257],
        [0.41867727, 0.58132273]]),
 array([[0.70297395, 0.29702605],
        [0.95355935, 0.04644065],
        [0.48363975, 0.51636025],
        ...,
        [0.80379437, 0.19620563],
        [0.63895958, 0.36104042],
        [0.46376245, 0.53623755]])]

In [13]:
# Build the submission table: the respondent id plus, for each label,
# column 1 of that label's predicted-probability array.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_pred[0][:, 1],
    seasonal_vaccine=test_pred[1][:, 1],
)

In [14]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    'submission_final.csv',
    index=False,
)