In [16]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the three competition CSV files.
# The default is the location this notebook was originally run from; set the
# VACCINE_DATA_DIR environment variable to point somewhere else so the
# notebook can be re-run on another machine without editing the code.
DATA_DIR = Path(os.environ.get("VACCINE_DATA_DIR", r"C:\Users\somes\OneDrive\ドキュメント"))

# Training features, training labels and the unlabeled test features.
train_features = pd.read_csv(DATA_DIR / "training_set_features.csv")
train_labels = pd.read_csv(DATA_DIR / "training_set_labels.csv")
test_features = pd.read_csv(DATA_DIR / "test_set_features.csv")


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# --- Assemble the training table --------------------------------------------
# Join features to labels on the respondent key. validate='one_to_one' makes
# pandas raise a MergeError if an id is duplicated on either side instead of
# silently multiplying rows.
train_data = pd.merge(train_features, train_labels, on='respondent_id',
                      validate='one_to_one')

# Features are everything except the id and the two target columns.
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]

# --- Route every feature to exactly one transformer -------------------------
# Partition on "numeric vs. everything else" rather than on exact dtype names:
# a column that appears in neither list is not passed to any transformer, and
# ColumnTransformer's default remainder='drop' would discard it without warning.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric features: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Non-numeric features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# One random forest per target column, wrapped so both are fitted together.
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))

# Preprocessing lives inside the pipeline, so imputation/scaling/encoding
# statistics are learned from the rows passed to fit() only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# --- Hold out 20% of the rows for validation --------------------------------
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# --- Validation predictions --------------------------------------------------
# predict_proba returns one probability array per target, in the order of y's
# columns. Column index 1 is taken as the positive-class probability; this
# assumes each target is 0/1 and both classes occur in the training split.
val_proba = pipeline.predict_proba(X_val)
y_pred = pd.DataFrame(
    {target: proba[:, 1] for target, proba in zip(y.columns, val_proba)},
    index=y_val.index,  # keep predictions row-aligned with y_val
)

# --- Score: per-target ROC AUC and their mean -------------------------------
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred['seasonal_vaccine'])
roc_auc_avg = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Average ROC AUC: {roc_auc_avg}')


ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Average ROC AUC: 0.8606129501926918


In [18]:
# Score the unlabeled test set with the fitted pipeline and write the
# submission file: one row per respondent, one probability column per target.
X_test = test_features.drop(columns=['respondent_id'])
test_ids = test_features['respondent_id']

# One probability array per target; column index 1 is read from each
# (presumably the positive class -- depends on the classifier's class order).
test_proba = pipeline.predict_proba(X_test)

# Start from the id column so the frame inherits test_features' row index,
# then attach the two probability columns in submission order.
test_pred = pd.DataFrame({'respondent_id': test_ids}).assign(
    xyz_vaccine=test_proba[0][:, 1],
    seasonal_vaccine=test_proba[1][:, 1],
)

test_pred.to_csv('submission.csv', index=False)
