In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [2]:
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

In [3]:
train_labels = train_labels.dropna(subset=['xyz_vaccine', 'seasonal_vaccine'])
train_features = train_features.loc[train_labels.index]

In [4]:
X_train = train_features.drop(columns=['respondent_id'])
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']
X_test = test_features.drop(columns=['respondent_id'])

In [5]:
imputer = SimpleImputer(strategy='most_frequent')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(include=[np.number]).columns

In [6]:
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(pd.DataFrame(X_train_imputed, columns=X_train.columns)[categorical_features])
X_test_encoded = encoder.transform(pd.DataFrame(X_test_imputed, columns=X_test.columns)[categorical_features])

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(pd.DataFrame(X_train_imputed, columns=X_train.columns)[numerical_features])
X_test_scaled = scaler.transform(pd.DataFrame(X_test_imputed, columns=X_test.columns)[numerical_features])

In [8]:
X_train_preprocessed = np.hstack((X_train_scaled, X_train_encoded))
X_test_preprocessed = np.hstack((X_test_scaled, X_test_encoded))

In [9]:
logreg_xyz = LogisticRegression(random_state=42, max_iter=1000)
logreg_seasonal = LogisticRegression(random_state=42, max_iter=1000)
logreg_xyz.fit(X_train_preprocessed, y_xyz)
logreg_seasonal.fit(X_train_preprocessed, y_seasonal)

In [10]:
y_xyz_train_pred_prob= logreg_xyz.predict_proba(X_train_preprocessed)[:, 1]
y_seasonal_train_pred_prob = logreg_seasonal.predict_proba(X_train_preprocessed)[:, 1]

In [11]:
xyz_auc_roc = roc_auc_score(y_xyz, y_xyz_train_pred_prob)
seasonal_auc_roc = roc_auc_score(y_seasonal, y_seasonal_train_pred_prob)

In [12]:
print(f'xyz Vaccine AUC-ROC: {xyz_auc_roc:.4f}')
print(f'Seasonal Vaccine AUC-ROC: {seasonal_auc_roc:.4f}')

xyz Vaccine AUC-ROC: 0.8380
Seasonal Vaccine AUC-ROC: 0.8559


In [13]:
y_xyz_test_pred_prob = logreg_xyz.predict_proba(X_test_preprocessed)[:, 1]
y_seasonal_test_pred_prob = logreg_seasonal.predict_proba(X_test_preprocessed)[:, 1]

In [18]:
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': y_xyz_test_pred_prob,
    'seasonal_vaccine': y_seasonal_test_pred_prob
})

In [19]:
submission.head()


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.055682,0.293623
1,26708,0.047063,0.045313
2,26709,0.409674,0.591009
3,26710,0.498439,0.881007
4,26711,0.162582,0.46359


In [20]:
submission.to_csv('submission.csv', index=False)

In [21]:
final_submission = pd.read_csv('submission.csv')


In [22]:
final_submission.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.055682,0.293623
1,26708,0.047063,0.045313
2,26709,0.409674,0.591009
3,26710,0.498439,0.881007
4,26711,0.162582,0.46359
