<a href="https://colab.research.google.com/github/harshsrivastava321/Summer-Analytics2024/blob/main/summer_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Read the three competition CSVs from the working directory.
# Note: `training_data` holds the contents of the *labels* file, while
# `train_data` holds the training features.
train_data, test_data, training_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'test_set_features.csv',
        'training_set_labels.csv',
    )
)


In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Define the columns to be encoded and scaled
categorical_cols = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

# Columns with exactly two distinct values are treated as binary flags and
# are left unscaled.
binary_cols = [col for col in train_data.columns if train_data[col].nunique() == 2]

# Numeric, non-binary columns get standardised. `respondent_id` is explicitly
# excluded: it is a row identifier, not a feature, and scaling it in place
# would corrupt the ids later written to the submission file.
numerical_cols = [
    col for col in train_data.columns
    if train_data[col].dtype in ['int64', 'float64']
    and col not in binary_cols
    and col != 'respondent_id'
]

# Impute missing values in the categorical columns with the most frequent
# training value. Numeric and binary columns are not imputed here.
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_cols] = imputer.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = imputer.transform(test_data[categorical_cols])

# Encode categorical features as integer codes. The encoder is re-fitted on
# each training column and then applied to the matching test column;
# `transform` raises ValueError if the test column contains a category that
# was not present in training.
encoder = LabelEncoder()
for col in categorical_cols:
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

# Scale numerical features using statistics learned from the training set only
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

In [3]:
# The two label columns the model is trained to predict
target_cols = ['xyz_vaccine', 'seasonal_vaccine']

# Targets are taken from the labels frame. Rows are matched to the feature
# frame purely by position (no join on respondent_id is performed).
y_train = training_data[target_cols]

# Features are every column except the row identifier, for both splits
X_train, X_test = (
    frame.drop(columns=['respondent_id'])
    for frame in (train_data, test_data)
)


In [4]:
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base learner settings for the XGBoost classifier.
# NOTE(review): `use_label_encoder` may be deprecated or ignored depending on
# the installed XGBoost version - confirm against the environment.
xgb_params = {'use_label_encoder': False, 'eval_metric': 'logloss'}
xgb_model = XGBClassifier(**xgb_params)

# MultiOutputClassifier fits one independent copy of the base learner per
# target column; n_jobs=-1 requests all available cores for those fits.
model = MultiOutputClassifier(estimator=xgb_model, n_jobs=-1)

# Fit on the full training set (last expression, so the fitted estimator is
# displayed as the cell output)
model.fit(X_train, y_train)


In [5]:
# Class probabilities for the test set: one array per target, in the same
# order as the columns of y_train
predictions = model.predict_proba(X_test)

# Column 1 of each array is the probability of the positive class
xyz_proba, seasonal_proba = predictions[0], predictions[1]
xyz_vaccine_pred = xyz_proba[:, 1]
seasonal_vaccine_pred = seasonal_proba[:, 1]

# Assemble the submission: respondent id followed by one probability per target
submission = test_data[['respondent_id']].assign(
    xyz_vaccine=xyz_vaccine_pred,
    seasonal_vaccine=seasonal_vaccine_pred,
)


In [6]:
# Write the submission to the working directory, omitting the DataFrame index
submission.to_csv(path_or_buf='submission.csv', index=False)


In [7]:
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Estimate ROC AUC on held-out *training* rows.
#
# The test set has no labels, so test-set predictions cannot be scored.
# The earlier version of this cell truncated the test predictions and compared
# them with the training labels (different respondents), which produced a
# chance-level value of about 0.499 that says nothing about the model.
#
# Instead: hold out 20% of the training data, fit a fresh, unfitted copy of
# the model on the remaining 80%, and score the held-out rows. `model` itself
# (fitted on all training data and used for the submission) is left untouched.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,   # fixed seed so the split is reproducible
    stratify=y_train,  # keep the joint label distribution similar in both parts
)

eval_model = clone(model).fit(X_fit, y_fit)

# predict_proba yields one array per target (same order as y_train's columns);
# column 1 is the probability of the positive class.
val_proba = eval_model.predict_proba(X_val)

roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], val_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], val_proba[1][:, 1])

# Mean ROC AUC across the two targets
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

mean_roc_auc