# In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the survey feature table into a DataFrame.
data = pd.read_csv('Downloads/training_set_features.csv')

# NOTE(review): the two target columns below are PLACEHOLDERS. The real
# vaccine-uptake labels are not part of the features file, so each target is
# filled with seeded random 0/1 values purely so the rest of the script can
# run end to end. Any model fitted on them learns noise; replace these
# columns with the actual labels before trusting the output.
# 'respondent_id' is presumably the ID column of the CSV (it is read later
# when the result table is built) -- confirm against the file.
n_respondents = len(data)
np.random.seed(42)  # fixed seed keeps the placeholder labels reproducible
for target_name in ('xyz_vaccine', 'seasonal_vaccine'):
    # One independent draw per target, in this order, from the global RNG.
    data[target_name] = np.random.randint(low=0, high=2, size=n_respondents)

# The model inputs are declared once as four non-overlapping column groups and
# then combined, so the three public lists below cannot drift out of sync.

# Columns routed through the numeric preprocessing branch.
_survey_numeric = [
    'xyz_concern',
    'xyz_knowledge',
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_xyz',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]
_household_numeric = ['household_adults', 'household_children']

# Columns routed through the categorical preprocessing branch.
_profile_categorical = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
]
_employment_detail_categorical = ['employment_industry', 'employment_occupation']

# Numeric and categorical feature names, as consumed by the preprocessing.
numeric_features = _survey_numeric + _household_numeric
categorical_features = _profile_categorical + _employment_detail_categorical

# Every feature column handed to the models, in the established column order.
feature_cols = (
    _survey_numeric
    + _profile_categorical
    + _household_numeric
    + _employment_detail_categorical
)

# Numeric branch: replace missing values with the column mean, then
# standardise each column with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: treat a missing value as its own 'missing' category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Unfitted preprocessing template: routes each column list to its branch.
# Columns not named in either list are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# A Pipeline stores the estimator objects it is given and fits them in place;
# it does not copy them. Passing the same `preprocessor` object to both
# pipelines would therefore make fitting the second model silently refit the
# preprocessing inside the first, already-fitted model. Each pipeline gets its
# own clone (a fresh, unfitted copy with identical parameters) so the two
# models are fully independent.

# Model pipeline for the XYZ vaccine target.
xyz_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Model pipeline for the seasonal vaccine target.
seasonal_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Feature matrix shared by both targets, and the common split settings
# (80/20 split, fixed seed so both targets are split on the same rows).
feature_frame = data[feature_cols]
split_options = {'test_size': 0.2, 'random_state': 42}

# NOTE(review): the held-out X_test_* / y_test_* splits are created but not
# used anywhere in the visible script, and the probabilities below are
# predicted for EVERY row of `data`, including the rows each model was
# trained on. No out-of-sample evaluation happens here.

# --- XYZ vaccine: split, fit, then score the whole dataset ---
X_train_xyz, X_test_xyz, y_train_xyz, y_test_xyz = train_test_split(
    feature_frame, data['xyz_vaccine'], **split_options
)
xyz_model.fit(X_train_xyz, y_train_xyz)
# Column 1 of predict_proba is the probability of the model's second class.
xyz_class_proba = xyz_model.predict_proba(feature_frame)
xyz_vaccination_probabilities = xyz_class_proba[:, 1]

# --- Seasonal vaccine: same procedure with the seasonal target ---
X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(
    feature_frame, data['seasonal_vaccine'], **split_options
)
seasonal_model.fit(X_train_seasonal, y_train_seasonal)
seasonal_class_proba = seasonal_model.predict_proba(feature_frame)
seasonal_vaccination_probabilities = seasonal_class_proba[:, 1]

# Assemble the output table: the respondent IDs plus one predicted-probability
# column per vaccine, in that column order and on the same index as `data`.
result = data[['respondent_id']].assign(
    xyz_vaccination_probability=xyz_vaccination_probabilities,
    seasonal_vaccination_probability=seasonal_vaccination_probabilities,
)

# Write the table to disk without the index column.
# NOTE(review): an existing file at this path is overwritten -- confirm this
# is not a provided template that should be kept.
result.to_csv('Downloads/submission_format.csv', index=False)

# Show the first rows as a quick sanity check.
print(result.head())


# Output:
#    respondent_id  xyz_vaccination_probability  \
# 0              0                         0.23
# 1              1                         0.79
# 2              2                         0.19
# 3              3                         0.18
# 4              4                         0.19
#
#    seasonal_vaccination_probability
# 0                              0.15
# 1                              0.19
# 2                              0.22
# 3                              0.20
# 4                              0.84
