<a href="https://colab.research.google.com/github/harshi-xyz/Datahack/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load and Prepare the Data

In [2]:
import pandas as pd

# Read the four competition CSVs from the current working directory.
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

# Attach the labels to their feature rows via the shared key, then discard the
# key so it does not end up among the model inputs.
train_data = (
    train_features
    .merge(train_labels, on='respondent_id')
    .drop(columns=['respondent_id'])
)

# Keep the test-set ids aside before removing them from the feature frame.
test_ids = test_features['respondent_id']
test_features = test_features.drop(columns=['respondent_id'])

# Separate the two target columns (y) from everything else (X).
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
y = train_data[target_columns]
X = train_data.drop(columns=target_columns)


Preprocessing

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Split the columns by dtype *family* rather than by exact dtype names.
# ColumnTransformer drops every column that is not assigned to a transformer
# (remainder='drop' is its default), so matching only 'int64'/'float64'/'object'
# would silently discard columns stored as e.g. int32 or a non-object string
# dtype. Numeric vs. non-numeric guarantees each column is routed exactly once.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Non-numeric columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Hold out 20% of the labelled rows for model comparison; the fixed seed keeps
# the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training split only, then apply the fitted
# transformer unchanged to the hold-out split and to the unlabelled test set.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_test_features = preprocessor.transform(test_features)


Model Building and Training

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

def _positive_class_frame(per_target_probas):
    """Collect the positive-class probability of each target into one DataFrame.

    Parameters
    ----------
    per_target_probas : sequence of two 2-D arrays
        Output of ``MultiOutputClassifier.predict_proba``: one array per target,
        in the order the targets were fitted (xyz_vaccine, seasonal_vaccine).
        Column 1 of each array is read as the positive-class probability.

    Returns
    -------
    pandas.DataFrame
        Columns ``xyz_vaccine`` and ``seasonal_vaccine``.
    """
    return pd.DataFrame({
        'xyz_vaccine': per_target_probas[0][:, 1],
        'seasonal_vaccine': per_target_probas[1][:, 1]
    })


# max_iter is raised above the library default of 100 to give the solver
# headroom to converge on the one-hot-expanded feature matrix.
logreg = LogisticRegression(max_iter=1000)
# Seed the forest so its score, and therefore the model chosen from it, is
# reproducible on a fresh kernel. 42 matches the seed used for the split.
rf = RandomForestClassifier(random_state=42)

# One independent classifier per target column; n_jobs=-1 uses all cores.
multi_target_logreg = MultiOutputClassifier(logreg, n_jobs=-1)
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)

multi_target_logreg.fit(X_train, y_train)
multi_target_rf.fit(X_train, y_train)

# Hold-out probabilities, reshaped to match the column layout of y_test.
y_pred_logreg = _positive_class_frame(multi_target_logreg.predict_proba(X_test))
y_pred_rf = _positive_class_frame(multi_target_rf.predict_proba(X_test))

# Macro average: unweighted mean of the per-target ROC AUC values.
roc_auc_logreg = roc_auc_score(y_test, y_pred_logreg, average='macro')
roc_auc_rf = roc_auc_score(y_test, y_pred_rf, average='macro')

print(f'ROC AUC Score for Logistic Regression: {roc_auc_logreg}')
print(f'ROC AUC Score for Random Forest: {roc_auc_rf}')


ROC AUC Score for Logistic Regression: 0.843578822347677
ROC AUC Score for Random Forest: 0.8370674860400737


Make Predictions on the Test Set

In [5]:

# Use whichever fitted model scored the higher hold-out ROC AUC; on a tie the
# logistic regression is kept.
if roc_auc_rf > roc_auc_logreg:
    best_model = multi_target_rf
else:
    best_model = multi_target_logreg

# predict_proba returns one array per target; column 1 of each is written out.
y_pred_final = best_model.predict_proba(X_test_features)

# Start from the respondent ids and append one probability column per target.
submission = pd.DataFrame({'respondent_id': test_ids}).assign(
    xyz_vaccine=y_pred_final[0][:, 1],
    seasonal_vaccine=y_pred_final[1][:, 1],
)

submission.to_csv('submission.csv', index=False)
