Common work

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training features and their labels; both frames are indexed by respondent_id.
features_df, labels_df = (
    pd.read_csv(csv_name, index_col='respondent_id')
    for csv_name in ("training_set_features.csv", "training_set_labels.csv")
)
features_df
# labels_df

In [None]:
# Summary statistics of the feature frame (pandas' default describe() output).
features_df.describe()

In [None]:
# One boolean per column: True when that column contains at least one missing value.
features_df.isnull().any()

In [None]:
# Rows whose h1n1_concern is missing.
# NaN never compares equal to anything (including itself), so `== np.nan` always
# yields an empty selection; .isna() is the correct missing-value test.
features_df.loc[features_df['h1n1_concern'].isna()]

In [None]:
# %matplotlib qt
# %matplotlib inline

In [None]:
# Two side-by-side count plots: self-reported H1N1 knowledge and H1N1 concern.
fig, axs = plt.subplots(1, 2)
knowledge_ax, concern_ax = axs[0], axs[1]

sns.countplot(ax=knowledge_ax, data=features_df, x='h1n1_knowledge')
knowledge_ax.set(title='Knowledge of H1N1')
# Replace the numeric category codes on the x axis with readable labels.
knowledge_ax.set_xticklabels(['No knowledge', 'Little knowledge', 'A lot of knowledge'])

sns.countplot(ax=concern_ax, data=features_df, x='h1n1_concern')
concern_ax.set(title='Concern about H1N1')
concern_ax.set_xticklabels(['Not at all concerned', 'Not very concerned', 'Somewhat concerned', 'Very Concerned'])

plt.tight_layout()
plt.show()

In [None]:
# Every row that has a missing value in at least one column.
features_df[features_df.isna().any(axis=1)]

In [None]:
# Two side-by-side count plots: opinions on H1N1 vaccine effectiveness and on
# getting sick from the vaccine.
fig, axs = plt.subplots(1, 2)
effective_ax, sick_ax = axs[0], axs[1]

sns.countplot(ax=effective_ax, data=features_df, x='opinion_h1n1_vacc_effective')
effective_ax.set(title='Opinion on H1N1 Vaccine Effectiveness')
# Replace the numeric category codes on the x axis with readable labels.
effective_ax.set_xticklabels(['Not at all effective', 'Not very effective', 'Don\'t Know', 'Somewhat effective', 'Very effective'])

sns.countplot(ax=sick_ax, data=features_df, x='opinion_h1n1_sick_from_vacc')
sick_ax.set(title='Opinion on H1N1 Vaccine Symptoms')
sick_ax.set_xticklabels(['Not at all worried', 'Not very worried', 'Don\'t Know', 'Somewhat worried', 'Very worried'])

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
def my_custom_roc_auc_func(estimator, X, y=None):
    """Scorer callable: macro-averaged ROC AUC computed from predicted probabilities.

    Has the ``(estimator, X, y)`` signature, so it can be passed as ``scoring=``
    to ``cross_val_score`` / ``GridSearchCV``.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : feature matrix to score on.
    y : true labels (one column per target for multi-output problems).

    Returns
    -------
    float
        ``roc_auc_score(y, y_score, average='macro')``.

    Notes
    -----
    ``predict_proba`` output is handled in two layouts:

    * a list with one ``(n_samples, n_classes)`` array per target — column 1
      (the positive class) of each array is used;
    * a single 2-D array — used directly as one score column per target
      (this is how the MLP's output is indexed elsewhere in this notebook).
      For a 1-D ``y`` and a two-column array, column 1 is used as the
      positive-class score.
    """
    proba = estimator.predict_proba(X)
    if isinstance(proba, (list, tuple)):
        # One array per target: keep only the positive-class probability.
        y_score = np.column_stack([target_proba[:, 1] for target_proba in proba])
    else:
        y_score = np.asarray(proba)
        # Plain binary classifier: (n_samples, 2) -> positive-class column.
        if np.ndim(y) == 1 and y_score.ndim == 2 and y_score.shape[1] == 2:
            y_score = y_score[:, 1]
    return roc_auc_score(y, y_score, average='macro')

In [None]:
from sklearn.model_selection import cross_val_score
def cross_validate_model(model, X, y):
    """Cross-validate `model` with cv=5 and print the mean macro ROC AUC.

    Scoring is delegated to `my_custom_roc_auc_func`; nothing is returned.
    """
    fold_aucs = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring=my_custom_roc_auc_func,
        cv=5,
    )
    mean_auc = fold_aucs.mean()
    print("Mean AUC Score:", mean_auc)

## Kyros Karayiannis' Approach

### Data Analysis

In [None]:
# Attach the label columns to the features; join() aligns the two frames on their index.
df_combined_kyros = features_df.join(labels_df)

In [None]:
# Display the combined features + labels frame.
df_combined_kyros

In [None]:
# Spearman rank correlation between H1N1 knowledge and H1N1 vaccine uptake.
df_combined_kyros[['h1n1_knowledge', 'h1n1_vaccine']].corr(method='spearman')

In [None]:
# Spearman rank correlation between the two H1N1 vaccine opinion columns.
df_combined_kyros[['opinion_h1n1_sick_from_vacc', 'opinion_h1n1_vacc_effective']].corr(method='spearman')

In [None]:
# Spearman rank correlation between the two seasonal vaccine opinion columns.
df_combined_kyros[['opinion_seas_sick_from_vacc', 'opinion_seas_vacc_effective']].corr(method='spearman')

In [None]:
# Spearman rank correlation between chronic medical condition and H1N1 vaccine uptake.
df_combined_kyros[['chronic_med_condition', 'h1n1_vaccine']].corr(method='spearman')

In [None]:
# Spearman rank correlation between chronic medical condition and seasonal vaccine uptake.
df_combined_kyros[['chronic_med_condition', 'seasonal_vaccine']].corr(method='spearman')

In [None]:
# Columns of interest, correlated (Spearman) against each other and both vaccine labels.
cols = [
    'h1n1_knowledge',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_sick_from_vacc',
    'opinion_h1n1_vacc_effective',
    'opinion_seas_vacc_effective',
    'chronic_med_condition',
]
corr_columns_kyros = cols + ['h1n1_vaccine', 'seasonal_vaccine']
df_combined_kyros[corr_columns_kyros].corr(method='spearman')

In [None]:
# Annotated heatmap of the same Spearman correlation matrix.
spearman_corr_kyros = df_combined_kyros[cols + ['h1n1_vaccine','seasonal_vaccine']].corr(method='spearman')
sns.heatmap(spearman_corr_kyros, annot=True)

Above, we computed Spearman correlations between selected columns and both vaccine targets to support our observations.



In [None]:
# Within each face-mask group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_facemask = (
    df_combined_kyros.groupby('behavioral_face_mask')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_facemask = (
    df_combined_kyros.groupby('behavioral_face_mask')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='behavioral_face_mask', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_facemask, ax=axs[0])
sns.barplot(x='behavioral_face_mask', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_facemask, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each hand-washing group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_washhands = (
    df_combined_kyros.groupby('behavioral_wash_hands')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_washhands = (
    df_combined_kyros.groupby('behavioral_wash_hands')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='behavioral_wash_hands', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_washhands, ax=axs[0])
sns.barplot(x='behavioral_wash_hands', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_washhands, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each age group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_agegroup = (
    df_combined_kyros.groupby('age_group')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_agegroup = (
    df_combined_kyros.groupby('age_group')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='age_group', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_agegroup, ax=axs[0])
sns.barplot(x='age_group', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_agegroup, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each race group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_race = (
    df_combined_kyros.groupby('race')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_race = (
    df_combined_kyros.groupby('race')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='race', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_race, ax=axs[0])
sns.barplot(x='race', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_race, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each education level, the fraction of respondents with each vaccine outcome.
proportions_h1n1_education = (
    df_combined_kyros.groupby('education')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_education = (
    df_combined_kyros.groupby('education')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='education', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_education, ax=axs[0])
sns.barplot(x='education', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_education, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each sex group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_sex = (
    df_combined_kyros.groupby('sex')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_sex = (
    df_combined_kyros.groupby('sex')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='sex', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_sex, ax=axs[0])
sns.barplot(x='sex', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_sex, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

We plot bar charts showing how the uptake of each vaccine varies across the values of selected columns.

### Pre-Processing - Kyros

In [None]:
# Every row of the combined frame that has a missing value in at least one column.
df_combined_kyros[df_combined_kyros.isna().any(axis=1)]

In [None]:
# For each column: its name, its distinct values, then a blank separator line.
for column in df_combined_kyros.columns:
    distinct_values = df_combined_kyros[column].unique()
    print(column, distinct_values, "", sep="\n")

In [None]:
# Remove three columns from the analysis, then drop every row that still has a NaN.
# errors='ignore' keeps this cell re-runnable: df_combined_kyros is rebound here, so on a
# second execution the columns are already gone and a plain drop() would raise KeyError.
df_combined_kyros = (
    df_combined_kyros
    .drop(columns=['employment_occupation', 'employment_industry', 'health_insurance'], errors='ignore')
    .dropna()
)
df_combined_kyros

In [None]:
# Remove fully duplicated rows, then report the remaining (rows, columns) shape.
df_combined_kyros = df_combined_kyros.drop_duplicates()
print(df_combined_kyros.shape)
df_combined_kyros

In [None]:
# Sanity check after cleaning: per column, count how many cells are non-null (True)
# versus null (False).
missingdata = df_combined_kyros.notnull()
for column in missingdata.columns:
    non_null_counts = missingdata[column].value_counts()
    print(column, non_null_counts, "", sep="\n")

We drop rows containing NaN values, drop any duplicates, and check that our dataset is clean.


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each string-valued categorical column with integer codes.
# A single encoder object is re-fitted per column, so only the last column's
# fitted classes remain on `le` afterwards.
le = LabelEncoder()
categorical_columns_kyros = (
    'age_group',
    'race',
    'sex',
    'education',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
)
for categorical_column in categorical_columns_kyros:
    df_combined_kyros[categorical_column] = le.fit_transform(df_combined_kyros[categorical_column])

In [None]:
# Re-inspect the distinct values of every column after label encoding.
for column in df_combined_kyros.columns:
    distinct_values = df_combined_kyros[column].unique()
    print(column, distinct_values, "", sep="\n")

Encoding the categorical (non-numeric) columns as integer labels

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

# Split the cleaned frame into the feature matrix and the two-column label frame.
label_columns_kyros = ['h1n1_vaccine', 'seasonal_vaccine']
y_kyros = df_combined_kyros[label_columns_kyros]
X_kyros = df_combined_kyros.drop(columns=label_columns_kyros)

# Keep the k features with the highest chi-squared score.
# NOTE(review): the selector is fitted on all rows, before the train/test split in the
# next cell, so the held-out rows influence which features are chosen — consider
# fitting on the training split only.
k = 15
skb = SelectKBest(score_func=chi2, k=k)
X_k_best = skb.fit_transform(X_kyros, y_kyros)

# Translate the selected column positions back into column names.
k_best_indices = skb.get_support(indices=True)
k_best_features = list(X_kyros.columns[k_best_indices])
k_best_features

In [None]:
# Hold out 30% of the rows for testing (fixed seed); stratify on the label frame so the
# split is stratified with respect to both vaccine columns.
X_train, X_test, y_train, y_test = train_test_split(X_k_best, y_kyros, test_size=0.3, random_state=42, stratify=y_kyros)
# Shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

Performing feature selection and choosing our test and train dataset

### Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score

#### Decision Tree

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid explored exhaustively by the search below.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=range(1, 10),
    min_samples_split=[3, 8, 10],
    min_samples_leaf=[1, 3, 5],
)

dt_model = DecisionTreeClassifier()

# 10-split cross-validated grid search, scored with the macro ROC AUC scorer.
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    scoring=my_custom_roc_auc_func,
    cv=10,
)
grid_search.fit(X_train, y_train)

best_estimator, best_parameters = grid_search.best_estimator_, grid_search.best_params_

y_predicted = best_estimator.predict_proba(X_test)

# predict_proba yields one array per label; column 1 of each is the positive-class probability.
y_score = np.column_stack(
    [label_proba[:, 1] for label_proba in best_estimator.predict_proba(X_test)]
)
y_auc = roc_auc_score(y_test, y_score, average='macro')

print("Best Parameters:", best_parameters)
print("AUC Scores:", y_auc)

In [None]:
# Refit a fresh decision tree with the best parameters found by the grid search
# and report its macro ROC AUC on the held-out test split.
dt_model = tree.DecisionTreeClassifier(**best_parameters).fit(X_train, y_train)
dt_test_auc = my_custom_roc_auc_func(dt_model, X_test, y_test)
print("Decision tree Model AUC on our test set", dt_test_auc)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score

# Hard predictions on the test split, then: per-label confusion matrices,
# accuracy, and micro-averaged F1.
predicted_dt = dt_model.predict(X_test)

for metric_value in (
    multilabel_confusion_matrix(y_test, predicted_dt),
    accuracy_score(y_test, predicted_dt),
    f1_score(y_test, predicted_dt, average='micro'),
):
    print(metric_value)

In [None]:
from sklearn.metrics import roc_curve, auc

predict_prob = dt_model.predict_proba(X_test)

# One positive-class probability column per label, then a separate AUC for each label
# (average=None returns the per-label scores instead of their mean).
y_score = np.column_stack(
    [label_proba[:, 1] for label_proba in dt_model.predict_proba(X_test)]
)
pred = roc_auc_score(y_test, y_score, average=None)
pred

In [None]:
# Cross-validated mean AUC of the tuned decision tree on the training split.
cross_validate_model(dt_model, X_train, y_train)

In [None]:
# Decision-tree ROC curves, one panel per vaccine; the dashed diagonal is the
# chance-level reference line.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Decision Tree Receiver Operating Characteristic Curves')

roc_panels = (('h1n1_vaccine', 'H1N1'), ('seasonal_vaccine', 'Seasonal'))
for panel_idx, (label_column, panel_title) in enumerate(roc_panels):
    panel = ax[panel_idx]
    fpr, tpr, thresholds = roc_curve(y_test[label_column], y_score[:, panel_idx])
    panel.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(panel_title, pred[panel_idx]))
    panel.plot([0, 1], [0, 1], linestyle='--')
    panel.set_ylabel('True Positive Rate')
    panel.set_xlabel('False Positive Rate')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')

In the code above, we trained a decision tree model, performed hyperparameter tuning, and plotted the ROC curves (with their AUC scores) for both vaccines.

#### Artificial Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline network: library defaults except for a logistic hidden-layer activation.
mlp_model = MLPClassifier(activation='logistic').fit(X_train, y_train)

predicted_mlp = mlp_model.predict(X_test)

# Per-label confusion matrices, accuracy, and micro-averaged F1 on the test split.
for metric_value in (
    multilabel_confusion_matrix(y_test, predicted_mlp),
    accuracy_score(y_test, predicted_mlp),
    f1_score(y_test, predicted_mlp, average='micro'),
):
    print(metric_value)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the network; activation, solver and seed are held fixed.
param_grid = {
    'hidden_layer_sizes': [(10,), (20,), (30,)],
    'activation': ['relu'],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam'],
    'max_iter': [1000, 1500, 2000, 2500],
    'random_state': [42]
}

mlp = MLPClassifier()
grid_search_mlp = GridSearchCV(mlp, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search_mlp.fit(X_train, y_train)

best_estimator_mlp = grid_search_mlp.best_estimator_
best_parameters_mlp = grid_search_mlp.best_params_

# Indexed below as one probability column per label.
y_pred_proba_mlp = best_estimator_mlp.predict_proba(X_test)

# Per-label AUC. Built with a comprehension so no module-level variable named `auc`
# is created — that name is also imported from sklearn.metrics in other cells and
# would otherwise be shadowed by a float.
auc_scores_mlp = [
    roc_auc_score(y_test.iloc[:, label_idx], y_pred_proba_mlp[:, label_idx])
    for label_idx in range(y_test.shape[1])
]

average_auc = sum(auc_scores_mlp) / len(auc_scores_mlp)

print("Best Parameters:", best_parameters_mlp)
print("AUC Score:", average_auc)

In [None]:
# Final network, built with explicitly chosen hyper-parameters and a fixed seed.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(30,),
    activation='relu',
    solver='adam',
    alpha=0.01,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

predicted_mlp = mlp_model.predict(X_test)

# Per-label confusion matrices, accuracy, and micro-averaged F1 on the test split.
for metric_value in (
    multilabel_confusion_matrix(y_test, predicted_mlp),
    accuracy_score(y_test, predicted_mlp),
    f1_score(y_test, predicted_mlp, average='micro'),
):
    print(metric_value)

In [None]:
# Score the refitted `mlp_model`. The probabilities are recomputed here because the
# existing `y_pred_proba_mlp` was produced by the grid-search estimator, not by
# `mlp_model`, and would report the wrong model's AUC.
y_pred_proba_mlp = mlp_model.predict_proba(X_test)

# One AUC per label column, then their unweighted mean.
auc_scores_mlp = [
    roc_auc_score(y_test.iloc[:, label_idx], y_pred_proba_mlp[:, label_idx])
    for label_idx in range(y_test.shape[1])
]

average_auc = sum(auc_scores_mlp) / len(auc_scores_mlp)
auc_scores_mlp, average_auc

In [None]:
# Cross-validated (cv=5) ROC AUC of the final network on the training split.
scores_cv_mlp = cross_val_score(
    estimator=mlp_model,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)
mean_auc_cv_mlp = scores_cv_mlp.mean()
print("Mean AUC Score:", mean_auc_cv_mlp)

In [None]:
# Neural-network ROC curves, one panel per vaccine.
# The curves must come from the network's own probabilities: the notebook-level
# `y_score` holds the decision tree's scores, so plotting it here would draw the
# wrong model's curves under the ANN title.
mlp_test_scores = mlp_model.predict_proba(X_test)  # indexed as one column per label

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Artificial Neural Network Receiver Operating Characteristic Curves')

roc_panels = (('h1n1_vaccine', 'H1N1'), ('seasonal_vaccine', 'Seasonal'))
for panel_idx, (label_column, panel_title) in enumerate(roc_panels):
    panel = ax[panel_idx]
    label_scores = mlp_test_scores[:, panel_idx]
    fpr, tpr, thresholds = roc_curve(y_test[label_column], label_scores)
    # AUC is computed from the same scores as the curve so label and line always agree.
    label_auc = roc_auc_score(y_test[label_column], label_scores)
    panel.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(panel_title, label_auc))
    panel.plot([0, 1], [0, 1], linestyle='--')  # chance-level reference
    panel.set_ylabel('True Positive Rate')
    panel.set_xlabel('False Positive Rate')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')

Trained an ANN model, again performed hyperparameter tuning, and output the mean AUC score. Lastly, we plotted the ROC curves, with their AUC scores, for both vaccines.

## Alexandros Michaelides' Approach

### Data Analysis

In [None]:
# Work on independent copies so this approach cannot modify the shared frames.
features_df_alex = features_df.copy()
labels_df_alex = labels_df.copy()
labels_df_alex

In [None]:
# Attach the labels to the features (aligned on the index), then check the Spearman
# rank correlation between H1N1 concern and H1N1 vaccine uptake.
df_combined_alex = features_df_alex.join(labels_df_alex)
df_combined_alex[['h1n1_concern', 'h1n1_vaccine']].corr(method='spearman')

In [None]:
# Spearman rank correlation between perceived H1N1 vaccine effectiveness and H1N1 uptake.
df_combined_alex[['opinion_h1n1_vacc_effective', 'h1n1_vaccine']].corr(method='spearman')

In [None]:
# Spearman rank correlation between perceived seasonal vaccine effectiveness and seasonal uptake.
df_combined_alex[['opinion_seas_vacc_effective', 'seasonal_vaccine']].corr(method='spearman')

In [None]:
# Behavioural columns, correlated (Spearman) against each other and both vaccine labels.
behavioural_cols = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
]
corr_columns_alex = behavioural_cols + ['h1n1_vaccine', 'seasonal_vaccine']
df_combined_alex[corr_columns_alex].corr(method='spearman')

In [None]:
# Annotated heatmap of the same Spearman correlation matrix.
spearman_corr_alex = df_combined_alex[behavioural_cols + ['h1n1_vaccine','seasonal_vaccine']].corr(method='spearman')
sns.heatmap(spearman_corr_alex, annot=True)

Performed some correlations with a relevant heatmap to help us further with the report

In [None]:
# Within each race group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_race = (
    df_combined_alex.groupby('race')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_race = (
    df_combined_alex.groupby('race')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='race', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_race, ax=axs[0])
sns.barplot(x='race', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_race, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Class balance of each label: number of rows per value for both vaccines.
labels_df_alex['h1n1_vaccine'].value_counts(), labels_df_alex['seasonal_vaccine'].value_counts()

In [None]:
# df_combined_alex['employment_occupation'].isnull().value_counts()

In [None]:
# Within each large-gatherings group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_large_gatherings = (
    df_combined_alex.groupby('behavioral_large_gatherings')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_large_gatherings = (
    df_combined_alex.groupby('behavioral_large_gatherings')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='behavioral_large_gatherings', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_large_gatherings, ax=axs[0])
sns.barplot(x='behavioral_large_gatherings', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_large_gatherings, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# Within each touch-face group, the fraction of respondents with each vaccine outcome.
proportions_h1n1_touch_face = (
    df_combined_alex.groupby('behavioral_touch_face')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_touch_face = (
    df_combined_alex.groupby('behavioral_touch_face')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='behavioral_touch_face', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_touch_face, ax=axs[0])
sns.barplot(x='behavioral_touch_face', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_touch_face, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# For each household_children value, the fraction of respondents with each vaccine outcome.
proportions_h1n1_hh_children = (
    df_combined_alex.groupby('household_children')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_hh_children = (
    df_combined_alex.groupby('household_children')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='household_children', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_hh_children, ax=axs[0])
sns.barplot(x='household_children', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_hh_children, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

In [None]:
# For each household_adults value, the fraction of respondents with each vaccine outcome.
proportions_h1n1_hh_adults = (
    df_combined_alex.groupby('household_adults')['h1n1_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
proportions_seasonal_hh_adults = (
    df_combined_alex.groupby('household_adults')['seasonal_vaccine']
    .value_counts(normalize=True)
    .rename('proportion')
    .reset_index()
)
fig, axs = plt.subplots(2)
sns.barplot(x='household_adults', y='proportion', hue='h1n1_vaccine', data=proportions_h1n1_hh_adults, ax=axs[0])
sns.barplot(x='household_adults', y='proportion', hue='seasonal_vaccine', data=proportions_seasonal_hh_adults, ax=axs[1])
# Lay out only after both axes are drawn so axis labels and legends are accounted for.
fig.tight_layout()

Plotted some barplots to show the relationship and compare different columns with both vaccine uptakes

### Pre-Processing

In [None]:
from sklearn.impute import SimpleImputer

# Fill every missing value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(df_combined_alex)

# Rebuilt from the raw array with only the column names passed, so the new frame
# gets a default integer index rather than df_combined_alex's index.
df_combined_imputed = pd.DataFrame(imputed_values, columns=df_combined_alex.columns)

# Confirm no column still contains a missing value.
df_combined_imputed.isnull().any()

In [None]:
# One line per column: "<name>: <distinct values>".
for column in df_combined_imputed.columns:
    distinct_values = df_combined_imputed[column].unique()
    print(f"{column}: {distinct_values}")

In [None]:
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder()

# String-valued categorical columns to expand into one indicator column per category.
cols = [
    'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
    'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation',
]

# The encoder's output is converted to a dense array and wrapped in a frame whose
# column names come from the encoder.
onehot_values = oh_encoder.fit_transform(df_combined_imputed[cols]).toarray()
onehot_frame = pd.DataFrame(onehot_values, columns=oh_encoder.get_feature_names_out(cols))

# Swap the original categorical columns for their indicator columns.
df_combined_encoded = pd.concat(
    [df_combined_imputed.drop(columns=cols), onehot_frame],
    axis=1,
)
df_combined_encoded

In [None]:
# Cast every column of the encoded frame to integers.
df_combined_encoded = df_combined_encoded.astype(int)

Using a simple imputer to fill any NaN values with the mode, and a one-hot encoder to encode the categorical (non-numeric) columns

In [None]:
from sklearn.model_selection import train_test_split

# Separate the two vaccine label columns from the feature columns.
target_columns_alex = ['h1n1_vaccine', 'seasonal_vaccine']
y_combined = df_combined_encoded[target_columns_alex]
X_combined = df_combined_encoded.drop(columns=target_columns_alex)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the k features with the highest chi-squared score.
# NOTE(review): the selector is fitted on all rows, before the train/test split that
# follows, so held-out rows influence which features are chosen.
k = 30
skb = SelectKBest(score_func=chi2, k=k)
X_k_best = skb.fit_transform(X_combined, y_combined)

# Translate the selected column positions back into column names.
k_best_indices = skb.get_support(indices=True)
k_best_features = list(X_combined.columns[k_best_indices])
k_best_features

Feature selection with our chosen train and test dataset

### Models

In [None]:
# from sklearn.metrics import roc_auc_score
# def my_custom_roc_auc_func(estimator, X, y=None):
#     y_score = np.transpose([y_pred[:, 1] for y_pred in estimator.predict_proba(X)])
#     return roc_auc_score(y, y_score, average='macro')

In [None]:
# from sklearn.model_selection import cross_val_score
# def cross_validate_model(model, X, y):
#     scores = cross_val_score(model, X, y, cv=5, scoring=my_custom_roc_auc_func)
#     print("Cross-validation scores:", scores)
#     print("Mean auc:", scores.mean())

In [None]:
# Hold out 25% of the rows for testing (fixed seed); stratify on the label frame so the
# split is stratified with respect to both vaccine columns.
# Note: this rebinds X_train / X_test / y_train / y_test used by the earlier approach.
X_train, X_test, y_train, y_test = train_test_split(X_k_best, y_combined, test_size=0.25, random_state=42, stratify=y_combined)
# Shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# y_train['h1n1_vaccine'].value_counts(), y_train['seasonal_vaccine'].value_counts()

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

# Hyper-parameter grid for the random forest.
# max_features uses 'sqrt': the former 'auto' alias (which meant sqrt for classifiers)
# is no longer accepted by recent scikit-learn releases and makes every fit fail.
param_grid = {
    'n_estimators': [5, 10, 20, 50, 100, 150, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'random_state': [42]
}

rf_model = RandomForestClassifier()
# cv=5 grid search scored with the macro ROC AUC scorer.
grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, scoring=my_custom_roc_auc_func)

grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_
best_parameters = grid_search.best_params_

y_pred_proba = best_estimator.predict_proba(X_test)

# predict_proba yields one array per label; column 1 of each is the positive-class probability.
y_score = np.transpose([y_pred[:, 1] for y_pred in y_pred_proba])
y_auc = roc_auc_score(y_test, y_score, average='macro')

print("Best Parameters:", best_parameters)
print("AUC Scores:", y_auc)

In [None]:
# rf_model = RandomForestClassifier(**best_parameters)
# Final forest with explicitly chosen hyper-parameters and a fixed seed.
# max_features='sqrt' replaces the former 'auto' alias (same meaning for classifiers),
# which recent scikit-learn releases reject.
rf_model = RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4, min_samples_split=10, n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Macro ROC AUC of the final random forest on the held-out test split.
print("RF Model AUC on test set:", my_custom_roc_auc_func(rf_model, X_test, y_test))

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score

# Hard predictions of the random forest on the test split.
predicted_rf = rf_model.predict(X_test)

# Per-label confusion matrices, accuracy, and micro-averaged F1.
for metric_value in (
    multilabel_confusion_matrix(y_test, predicted_rf),
    accuracy_score(y_test, predicted_rf),
    f1_score(y_test, predicted_rf, average='micro'),
):
    print(metric_value)

In [None]:
from sklearn.metrics import roc_curve, auc

predict_prob = rf_model.predict_proba(X_test)

# One positive-class probability column per label, then a separate AUC for each label
# (average=None returns the per-label scores instead of their mean).
y_score_rf = np.column_stack(
    [label_proba[:, 1] for label_proba in rf_model.predict_proba(X_test)]
)
auc_scores_rf = roc_auc_score(y_test, y_score_rf, average=None)
auc_scores_rf

In [None]:
# Random-forest ROC curves, one panel per vaccine; the dashed diagonal is the
# chance-level reference line.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Random Forest Receiver Operating Characteristic Curves')

roc_panels = (('h1n1_vaccine', 'H1N1'), ('seasonal_vaccine', 'Seasonal'))
for panel_idx, (label_column, panel_title) in enumerate(roc_panels):
    panel = ax[panel_idx]
    fpr, tpr, thresholds = roc_curve(y_test[label_column], y_score_rf[:, panel_idx])
    panel.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(panel_title, auc_scores_rf[panel_idx]))
    panel.plot([0, 1], [0, 1], linestyle='--')
    panel.set_ylabel('True Positive Rate')
    panel.set_xlabel('False Positive Rate')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')

In [None]:
# Cross-validated mean AUC of the final random forest on the training split.
cross_validate_model(rf_model, X_train, y_train)

Trained a Random Forest model with hyperparameter tuning and plotted the ROC curves, with their AUC scores, for both vaccines.

#### Naive Bayes

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import CategoricalNB

# The `estimator__` prefix routes each parameter to the CategoricalNB wrapped
# inside MultiOutputClassifier.
param_grid_nb = dict(
    estimator__alpha=[0.1, 1.0, 10.0],
    estimator__fit_prior=[True, False],
)

nb_model = CategoricalNB()
# The wrapper fits one Naive Bayes model per label column.
grid_search_nb = GridSearchCV(
    estimator=MultiOutputClassifier(nb_model),
    param_grid=param_grid_nb,
    scoring=my_custom_roc_auc_func,
    cv=5,
)

grid_search_nb.fit(X_train, y_train)

best_estimator_nb, best_parameters_nb = grid_search_nb.best_estimator_, grid_search_nb.best_params_

y_pred_proba_nb = best_estimator_nb.predict_proba(X_test)

# One positive-class probability column per label.
y_score_nb = np.column_stack(
    [label_proba[:, 1] for label_proba in best_estimator_nb.predict_proba(X_test)]
)
y_auc_nb = roc_auc_score(y_test, y_score_nb, average='macro')

print("Best Parameters:", best_parameters_nb)
print("AUC Scores:", y_auc_nb)

In [None]:
# Final Naive Bayes model: one CategoricalNB per label, configured with the
# hyper-parameters selected by the grid search. Building it with bare defaults would
# silently discard the tuning result.
nb_model = MultiOutputClassifier(CategoricalNB())
# best_parameters_nb keys carry the `estimator__` prefix, so set_params on the
# wrapper forwards them to the inner CategoricalNB.
nb_model.set_params(**best_parameters_nb)
nb_model.fit(X_train, y_train)

In [None]:
# Macro ROC AUC of the Naive Bayes model on the held-out test split.
# The printed label names the model actually being scored (it previously said "RF").
print("NB Model AUC on test set:", my_custom_roc_auc_func(nb_model, X_test, y_test))

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score

# Hard predictions of the Naive Bayes model on the test split.
predicted_nb = nb_model.predict(X_test)

# Per-label confusion matrices, accuracy, and micro-averaged F1.
for metric_value in (
    multilabel_confusion_matrix(y_test, predicted_nb),
    accuracy_score(y_test, predicted_nb),
    f1_score(y_test, predicted_nb, average='micro'),
):
    print(metric_value)

In [None]:
from sklearn.metrics import roc_curve, auc

predict_prob = nb_model.predict_proba(X_test)

# One positive-class probability column per label, then a separate AUC for each label
# (average=None returns the per-label scores instead of their mean).
y_score_nb = np.column_stack(
    [label_proba[:, 1] for label_proba in nb_model.predict_proba(X_test)]
)
auc_scores_nb = roc_auc_score(y_test, y_score_nb, average=None)
auc_scores_nb

In [None]:
# Naive Bayes ROC curves, one panel per vaccine; the dashed diagonal is the
# chance-level reference line.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle('Naïve Bayes Receiver Operating Characteristic Curves')

roc_panels = (('h1n1_vaccine', 'H1N1'), ('seasonal_vaccine', 'Seasonal'))
for panel_idx, (label_column, panel_title) in enumerate(roc_panels):
    panel = ax[panel_idx]
    fpr, tpr, thresholds = roc_curve(y_test[label_column], y_score_nb[:, panel_idx])
    panel.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(panel_title, auc_scores_nb[panel_idx]))
    panel.plot([0, 1], [0, 1], linestyle='--')
    panel.set_ylabel('True Positive Rate')
    panel.set_xlabel('False Positive Rate')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')

In [None]:
# Cross-validated mean AUC of the Naive Bayes model on the training split.
cross_validate_model(nb_model, X_train, y_train)

Trained a Naïve Bayes model with hyperparameter tuning. Calculated the ROC-AUC scores and plotted the ROC curves for both vaccines.

### Submission Format Code/ This was submitted to the competition webpage of the obtained dataset

In [None]:
# Score the competition test set with the random forest.
df_test_set_features = pd.read_csv('test_set_features.csv', index_col='respondent_id')

# Fill missing values with the most frequent value of each column, learned from the
# TRAINING features so the test set does not define its own fill values.
# assumes the test file has the same feature columns as features_df_alex — a KeyError
# on the column selection below will flag any mismatch.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(features_df_alex)
df_test_set_features = pd.DataFrame(
    imputer.transform(df_test_set_features[features_df_alex.columns]),
    columns=features_df_alex.columns,
    index=df_test_set_features.index,  # keep respondent_id so rows stay identifiable
)

cols = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

# Reuse the one-hot encoder already fitted on the training data (`oh_encoder`), so the
# test set gets exactly the training indicator columns even if some category is absent
# from the test file.
df_test_set_onehot = pd.DataFrame(
    oh_encoder.transform(df_test_set_features[cols]).toarray(),
    columns=oh_encoder.get_feature_names_out(cols),
    index=df_test_set_features.index,
)
df_test_set_encoded = pd.concat([df_test_set_features.drop(columns=cols), df_test_set_onehot], axis=1)

# Same feature subset and integer casting as the training matrix. A plain array is
# passed because the forest was fitted on the array returned by SelectKBest.
test_set_prob = rf_model.predict_proba(df_test_set_encoded[k_best_features].astype(int).to_numpy())
# predict_proba yields one array per label: index 0 is H1N1, index 1 is seasonal.
test_set_prob_h1n1 = test_set_prob[0]
test_set_prob_seasonal = test_set_prob[1]

In [None]:
# df_submission = pd.DataFrame(test_set_prob_h1n1[:, 1], columns=['h1n1_vaccine'])
# df_submission = pd.concat([df_submission, pd.DataFrame(test_set_prob_seasonal[:, 1], columns=['seasonal_vaccine'])], axis=1)
# df_submission.index += 26707
# df_submission.index.name = 'respondent_id'
# df_submission.to_csv('rf_best_probs.csv')

In [None]:
# Inspect the random forest's class probabilities for the H1N1 label on the test set.
test_set_prob_h1n1

In [None]:
# Build the competition submission from the neural network (`mlp_model`).
df_test_set_features = pd.read_csv('test_set_features.csv', index_col='respondent_id')

# The 15 features the network was trained on, in training column order.
k_best_features_test = ['h1n1_concern',
 'h1n1_knowledge',
 'behavioral_face_mask',
 'doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'health_worker',
 'opinion_h1n1_vacc_effective',
 'opinion_h1n1_risk',
 'opinion_h1n1_sick_from_vacc',
 'opinion_seas_vacc_effective',
 'opinion_seas_risk',
 'age_group',
 'rent_or_own',
 'household_children']

# Categorical columns that were label-encoded before the network was trained.
label_encoded_cols = ['age_group', 'race', 'sex', 'education', 'income_poverty',
                      'marital_status', 'rent_or_own', 'employment_status',
                      'hhs_geo_region', 'census_msa']

# Impute BEFORE encoding. If encoding ran first, a missing category would be turned
# into an ordinary integer code and the imputer would no longer see it as missing.
imputer = SimpleImputer(strategy='most_frequent')
df_test_set_encoded_le = pd.DataFrame(
    imputer.fit_transform(df_test_set_features[k_best_features_test]),
    columns=k_best_features_test,
    index=df_test_set_features.index,  # keep respondent_id for the submission file
)

# Label-encode only the categorical columns that are actually fed to the model.
# NOTE(review): the encoder is re-fitted on the test data; LabelEncoder assigns codes in
# sorted order, so this matches the training codes only when both sets contain the
# same categories — persisting the training encoders would remove that assumption.
label_encoder = LabelEncoder()
for categorical_column in k_best_features_test:
    if categorical_column in label_encoded_cols:
        df_test_set_encoded_le[categorical_column] = label_encoder.fit_transform(
            df_test_set_encoded_le[categorical_column]
        )

# A plain float array is passed because the network was fitted on the array returned
# by SelectKBest.
test_set_prob_mlp = mlp_model.predict_proba(
    df_test_set_encoded_le[k_best_features_test].astype(float).to_numpy()
)

# The network's probabilities are indexed one column per label, in label order:
# column 0 is h1n1_vaccine and column 1 is seasonal_vaccine. (Both submission columns
# previously used column 1, so the H1N1 column contained seasonal probabilities.)
df_submission_mlp = pd.DataFrame(
    {
        'h1n1_vaccine': test_set_prob_mlp[:, 0],
        'seasonal_vaccine': test_set_prob_mlp[:, 1],
    },
    # Use the real respondent ids instead of a hard-coded starting offset.
    index=df_test_set_features.index,
)
df_submission_mlp.index.name = 'respondent_id'
df_submission_mlp.to_csv('ann_best_probs.csv')
# df_submission_mlp