In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
import pandas as pd 
data =pd.read_csv("/kaggle/input/gfg-dataset/submission_format.csv")

data

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7
...,...,...,...
26703,53410,0.5,0.7
26704,53411,0.5,0.7
26705,53412,0.5,0.7
26706,53413,0.5,0.7


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Load the datasets
train_features = pd.read_csv('/kaggle/input/gfg-dataset/training_set_features.csv')
train_labels = pd.read_csv('/kaggle/input/gfg-dataset/training_set_labels.csv')
test_features = pd.read_csv('/kaggle/input/gfg-dataset/test_set_features.csv')
submission_format = pd.read_csv('/kaggle/input/gfg-dataset/submission_format.csv')

# Define feature columns
categorical_features = [
    'age_group', 'education', 'race', 'sex', 'income_poverty', 
    'marital_status', 'rent_or_own', 'employment_status', 
    'hhs_geo_region', 'census_msa', 'employment_industry', 
    'employment_occupation'
]
numeric_features = [
    'xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 
    'behavioral_avoidance', 'behavioral_face_mask', 
    'behavioral_wash_hands', 'behavioral_large_gatherings', 
    'behavioral_outside_home', 'behavioral_touch_face', 
    'doctor_recc_xyz', 'doctor_recc_seasonal', 'chronic_med_condition', 
    'child_under_6_months', 'health_worker', 'health_insurance', 
    'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 
    'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 
    'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 
    'household_adults', 'household_children'
]

# Preprocessing pipelines for both numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))])

# Split data into train and validation sets
X = train_features.drop(columns=['respondent_id'])
y = train_labels[['xyz_vaccine', 'seasonal_vaccine']]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict probabilities for validation set
y_pred = model.predict_proba(X_valid)
y_pred_prob = np.column_stack([y_pred[0][:, 1], y_pred[1][:, 1]])

# Evaluate the model using ROC AUC
roc_auc = roc_auc_score(y_valid, y_pred_prob, average='macro')
print(f'ROC AUC Score: {roc_auc:.4f}')

# Binarize the predictions for other metrics
y_pred_bin = model.predict(X_valid)

# Accuracy
accuracy_xyz = accuracy_score(y_valid['xyz_vaccine'], y_pred_bin[:, 0])
accuracy_seasonal = accuracy_score(y_valid['seasonal_vaccine'], y_pred_bin[:, 1])
print(f'Accuracy - XYZ Vaccine: {accuracy_xyz:.4f}')
print(f'Accuracy - Seasonal Vaccine: {accuracy_seasonal:.4f}')

# Precision
precision_xyz = precision_score(y_valid['xyz_vaccine'], y_pred_bin[:, 0])
precision_seasonal = precision_score(y_valid['seasonal_vaccine'], y_pred_bin[:, 1])
print(f'Precision - XYZ Vaccine: {precision_xyz:.4f}')
print(f'Precision - Seasonal Vaccine: {precision_seasonal:.4f}')

# Recall
recall_xyz = recall_score(y_valid['xyz_vaccine'], y_pred_bin[:, 0])
recall_seasonal = recall_score(y_valid['seasonal_vaccine'], y_pred_bin[:, 1])
print(f'Recall - XYZ Vaccine: {recall_xyz:.4f}')
print(f'Recall - Seasonal Vaccine: {recall_seasonal:.4f}')

# F1 Score
f1_xyz = f1_score(y_valid['xyz_vaccine'], y_pred_bin[:, 0])
f1_seasonal = f1_score(y_valid['seasonal_vaccine'], y_pred_bin[:, 1])
print(f'F1 Score - XYZ Vaccine: {f1_xyz:.4f}')
print(f'F1 Score - Seasonal Vaccine: {f1_seasonal:.4f}')

# Predict probabilities for test set
test_features_processed = test_features.drop(columns=['respondent_id'])
test_pred = model.predict_proba(test_features_processed)

# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred[0][:, 1],
    'seasonal_vaccine': test_pred[1][:, 1]
})

# Save the submission file
submission.to_csv('submission.csv', index=False)


ROC AUC Score: 0.8410
Accuracy - XYZ Vaccine: 0.8326
Accuracy - Seasonal Vaccine: 0.7829
Precision - XYZ Vaccine: 0.7000
Precision - Seasonal Vaccine: 0.7725
Recall - XYZ Vaccine: 0.3655
Recall - Seasonal Vaccine: 0.7466
F1 Score - XYZ Vaccine: 0.4802
F1 Score - Seasonal Vaccine: 0.7593


In [10]:
import pandas as pd 
data=pd.read_csv("/kaggle/working/submission.csv")
data

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.25,0.30
1,26708,0.06,0.03
2,26709,0.49,0.81
3,26710,0.44,0.89
4,26711,0.23,0.49
...,...,...,...
26703,53410,0.31,0.53
26704,53411,0.23,0.29
26705,53412,0.15,0.37
26706,53413,0.09,0.33
