In [1]:
import pandas as pd

# Read the four competition CSV files, in order, into their DataFrames
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)

# Preview the top rows of every frame, one after another
print(
    train_features.head(),
    train_labels.head(),
    test_features.head(),
    submission_format.head(),
    sep='\n',
)


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [16]:
# Merge features and labels on respondent_id.
# validate='one_to_one' makes pandas raise MergeError if respondent_id is
# duplicated in either frame, instead of silently fanning out duplicated
# training rows into the merged dataset.
train_data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
    validate='one_to_one',
)

# Display the first few rows of the merged dataset
print(train_data.head())


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [17]:
# Pull out the two target columns, then strip the id and target columns
# from the training frame to leave only the model inputs
y_train_xyz, y_train_seasonal = train_data['xyz_vaccine'], train_data['seasonal_vaccine']
X_train = train_data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)

# The test frame carries no targets, so only the id column is removed
X_test = test_features.drop('respondent_id', axis=1)

# Preview the top rows of the inputs and targets
print(
    X_train.head(),
    y_train_xyz.head(),
    y_train_seasonal.head(),
    X_test.head(),
    sep='\n',
)

   xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0          1.0            0.0                        0.0   
1          3.0            2.0                        0.0   
2          1.0            1.0                        0.0   
3          1.0            1.0                        0.0   
4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0   
1                          0.0                      1.0   
2                          0.

In [18]:
# Group the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical
categorical_cols = X_train.select_dtypes(include='object').columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

Categorical columns: Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')
Numerical columns: Index(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'],
      dtype='object')


In [19]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numerical columns: a single-step pipeline that standardises the features
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: a single-step pipeline that one-hot encodes them
# (handle_unknown='ignore' so categories not seen during fit do not raise)
categorical_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [20]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Define the models (one random forest per target, same hyper-parameters and seed)
model_xyz = RandomForestClassifier(n_estimators=100, random_state=0)
model_seasonal = RandomForestClassifier(n_estimators=100, random_state=0)

# Create the pipelines.
# Pipeline does not copy its steps and fit() fits them in place, so passing the
# same `preprocessor` instance to both pipelines would make them share a single
# fitted transformer: fitting the second pipeline would re-fit the object the
# first one predicts with. clone() gives each pipeline its own unfitted copy
# with identical parameters, so the two models stay independent.
clf_xyz = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', model_xyz)
])

clf_seasonal = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', model_seasonal)
])

# Train the models
clf_xyz.fit(X_train, y_train_xyz)
clf_seasonal.fit(X_train, y_train_seasonal)

In [21]:
# For each target, take the second column of predict_proba on the test
# features and clip the values to the [0, 1] interval
xyz_pred_proba = clf_xyz.predict_proba(X_test)[:, 1].clip(0, 1)  # xyz_vaccine
seasonal_pred_proba = clf_seasonal.predict_proba(X_test)[:, 1].clip(0, 1)  # seasonal_vaccine

# Show the first five predicted probabilities for each target
print(xyz_pred_proba[:5], seasonal_pred_proba[:5], sep='\n')

[0.18 0.05 0.34 0.5  0.28]
[0.36 0.09 0.73 0.83 0.52]


In [22]:
# Prepare the submission file: one row per test respondent with the two
# predicted probabilities
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': xyz_pred_proba,
    'seasonal_vaccine': seasonal_pred_proba
})

# Align the rows with submission_format.csv.
# A left join keeps every respondent_id listed in the format file (the default
# inner join would silently drop ids that have no prediction), and
# validate='one_to_one' raises MergeError if an id is duplicated on either side.
submission = submission_format[['respondent_id']].merge(
    submission,
    on='respondent_id',
    how='left',
    validate='one_to_one',
)

# Fail loudly instead of writing an incomplete submission: after the left join,
# a respondent without a prediction shows up as NaN in the probability columns.
missing_predictions = submission[['xyz_vaccine', 'seasonal_vaccine']].isna().any(axis=1)
if missing_predictions.any():
    raise ValueError(
        f"{int(missing_predictions.sum())} respondent_id value(s) listed in "
        "submission_format.csv have no prediction"
    )

# Save the submission file
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
