## GeekForGeek Hackathon

In [14]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score


In [15]:
# Unzip the dataset
with zipfile.ZipFile('dataset and all.zip', 'r') as zip_ref:
    zip_ref.extractall('extracted_data')


In [16]:
# Load the CSV files
train_features = pd.read_csv('extracted_data/training_set_features.csv')
train_labels = pd.read_csv('extracted_data/training_set_labels.csv')
test_features = pd.read_csv('extracted_data/test_set_features.csv')
submission_format = pd.read_csv('extracted_data/submission_format.csv')


In [17]:
# Define the preprocessing steps for numerical and categorical features
numerical_features = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_features.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [18]:
# Fit the preprocessor on the entire training set
preprocessor.fit(train_features)

# Transform the entire training data
X_train = preprocessor.transform(train_features)

# Transform the test data
X_test = preprocessor.transform(test_features)


In [19]:
# Use a smaller subset of the data for training to fit into RAM
X_train_small, _, y_train_small, _ = train_test_split(X_train, train_labels, train_size=0.1, random_state=42)

In [20]:
# Create the model pipeline
model = Pipeline(steps=[
    ('classifier', MultiOutputClassifier(RandomForestClassifier(n_estimators=25, random_state=42)))
])


In [22]:
# Train the model
model.fit(X_train_small, y_train_small)


In [24]:
 #Split the remaining data for validation
_, X_valid, _, y_valid = train_test_split(X_train, train_labels, test_size=0.2, random_state=42)

In [25]:
# Predict probabilities on the validation set
y_valid_pred = model.predict_proba(X_valid)

In [26]:
# Extract the probabilities for each label
y_valid_pred_xyz = y_valid_pred[0][:, 1]
y_valid_pred_seasonal = y_valid_pred[1][:, 1]

In [27]:
# Calculate the ROC AUC score for each label
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred_seasonal)
average_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

In [29]:
# Predict probabilities on the test set
test_pred = model.predict_proba(X_test)

# Create the submission dataframe
submission = submission_format.copy()
submission['xyz_vaccine'] = test_pred[0][:, 1]
submission['seasonal_vaccine'] = test_pred[1][:, 1]

# Save the submission file
submission.to_csv('submission.csv', index=False)

In [28]:
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Average ROC AUC: {average_roc_auc}')

ROC AUC for xyz_vaccine: 0.4997734034238459
ROC AUC for seasonal_vaccine: 0.7057915919931028
Average ROC AUC: 0.6027824977084744
