In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Load datasets
train_features = pd.read_csv('/kaggle/input/mydataset/training_set_features.csv')
train_labels = pd.read_csv('/kaggle/input/mydataset/training_set_labels.csv')
test_features = pd.read_csv('/kaggle/input/mydataset/test_set_features.csv')

In [3]:
# Drop respondent_id columns
train_features = train_features.drop(columns=['respondent_id'])
test_features_ids = test_features['respondent_id']
test_features = test_features.drop(columns=['respondent_id'])

In [4]:
# Identify categorical and numerical columns
categorical_cols = train_features.select_dtypes(include=['object']).columns
numerical_cols = train_features.select_dtypes(include=['number']).columns
print(categorical_cols)
print(numerical_cols)

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')
Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'],
      dtype='object')


In [5]:
# Basic Preprocessing

# Handle missing values
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

train_features[numerical_cols] = imputer_num.fit_transform(train_features[numerical_cols])
test_features[numerical_cols] = imputer_num.transform(test_features[numerical_cols])

train_features[categorical_cols] = imputer_cat.fit_transform(train_features[categorical_cols])
test_features[categorical_cols] = imputer_cat.transform(test_features[categorical_cols])

In [6]:
# Encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One-hot encoding
train_features_encoded = pd.DataFrame(
    encoder.fit_transform(train_features[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols)
)

test_features_encoded = pd.DataFrame(
    encoder.transform(test_features[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols)
)

# Drop original categorical columns and concatenate the encoded columns
train_features = train_features.drop(columns=categorical_cols)
test_features = test_features.drop(columns=categorical_cols)

train_features = pd.concat([train_features.reset_index(drop=True), train_features_encoded], axis=1)
test_features = pd.concat([test_features.reset_index(drop=True), test_features_encoded], axis=1)



In [7]:
# Scale numerical features
scaler = StandardScaler()
train_features[numerical_cols] = scaler.fit_transform(train_features[numerical_cols])
test_features[numerical_cols] = scaler.transform(test_features[numerical_cols])

In [8]:
# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)

In [9]:
# Define the model
gnb = GaussianNB()
multilabel_model = MultiOutputClassifier(gnb)

In [10]:
# Train the model
multilabel_model.fit(X_train, y_train)

In [11]:
# Predict probabilities on the validation set
y_valid_pred_proba = multilabel_model.predict_proba(X_valid)

In [12]:
print(y_valid_pred_proba)

[array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[2.01244945e-02, 9.79875505e-01],
       [7.35984970e-02, 9.26401503e-01],
       [9.99915142e-01, 8.48580274e-05],
       ...,
       [1.00000000e+00, 4.89835604e-10],
       [1.00000000e+00, 1.01459098e-31],
       [1.00000000e+00, 8.16030964e-15]]), array([[1.54912003e-04, 9.99845088e-01],
       [7.90038428e-04, 9.99209962e-01],
       [1.21048217e-03, 9.98789518e-01],
       ...,
       [9.99997559e-01, 2.44124817e-06],
       [1.00000000e+00, 8.59105237e-20],
       [1.00000000e+00, 7.25457925e-13]])]


In [13]:
# Extract probabilities for each label
y_valid_pred_proba_xyz = y_valid_pred_proba[0][:, 1]
y_valid_pred_proba_seasonal = y_valid_pred_proba[1][:, 1]
print(y_valid_pred_proba_xyz)
print(y_valid_pred_proba_seasonal)

[0. 0. 0. ... 0. 0. 0.]
[9.79875505e-01 9.26401503e-01 8.48580274e-05 ... 4.89835604e-10
 1.01459098e-31 8.16030964e-15]


In [14]:
# Compute ROC AUC scores for each label
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred_proba_seasonal)

print(f'ROC AUC score for XYZ vaccine: {roc_auc_xyz}')
print(f'ROC AUC score for Seasonal vaccine: {roc_auc_seasonal}')


ROC AUC score for XYZ vaccine: 0.5
ROC AUC score for Seasonal vaccine: 0.7058448672500555


In [15]:
# Average ROC AUC score
average_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
print(f'Average ROC AUC score: {average_roc_auc}')

Average ROC AUC score: 0.6029224336250277


In [16]:
# Predict on test set
test_pred_proba = multilabel_model.predict_proba(test_features)

# Extract probabilities
test_pred_proba_xyz = test_pred_proba[0][:, 1]
test_pred_proba_seasonal = test_pred_proba[1][:, 1]


In [17]:
# Create submission dataframe
submission = pd.DataFrame({
    'respondent_id': test_features_ids,
    'xyz_vaccine': test_pred_proba_xyz,
    'seasonal_vaccine': test_pred_proba_seasonal
})

# Save to CSV
submission.to_csv('sub1.csv', index=False)