# Using RandomForestClassifier to estimate the probability of adverse events for each COVID-19 vaccine

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

# Read the VAERS vaccine/symptom CSV into a DataFrame, then preview its first rows
vaxsymp_csv_path = '../Resources/Data/VAERSvaxsymptoms.csv'
vaxsymp = pd.read_csv(vaxsymp_csv_path)
vaxsymp.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_NAME,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,0,916600,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Dysphagia,Epiglottitis,0,0,0
1,1,916601,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Anxiety,Dyspnoea,0,0,0
2,2,916602,COVID19,PFIZER\BIONTECH,COVID19 (COVID19 (PFIZER-BIONTECH)),Chest discomfort,Dysphagia,Pain in extremity,Visual impairment,0
3,3,916603,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Dizziness,Fatigue,Mobility decreased,0,0
4,4,916604,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Injection site erythema,Injection site pruritus,Injection site swelling,Injection site warmth,0


In [3]:
# Remove the "Unnamed: 0" column (pandas' placeholder name for a CSV column
# that has no header) from the loaded frame.
# Reassigning with errors="ignore" makes the cell idempotent: it does nothing
# if the column is already gone, so it can stay enabled and survives
# Restart & Run All instead of being run once and then commented out.
vaxsymp = vaxsymp.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Preview the first rows again to check the frame's current column layout
vaxsymp.head()

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_NAME,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,916600,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Dysphagia,Epiglottitis,0,0,0
1,916601,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Anxiety,Dyspnoea,0,0,0
2,916602,COVID19,PFIZER\BIONTECH,COVID19 (COVID19 (PFIZER-BIONTECH)),Chest discomfort,Dysphagia,Pain in extremity,Visual impairment,0
3,916603,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Dizziness,Fatigue,Mobility decreased,0,0
4,916604,COVID19,MODERNA,COVID19 (COVID19 (MODERNA)),Injection site erythema,Injection site pruritus,Injection site swelling,Injection site warmth,0


In [5]:
# Collect, in column order, the name of every column whose dtype is "object"
vaxsymp_cat = [
    column for column, dtype in vaxsymp.dtypes.items() if dtype == "object"
]

# Count the distinct values held by each of those columns
vaxsymp[vaxsymp_cat].nunique()

VAX_TYPE        1
VAX_MANU        4
VAX_NAME        4
SYMPTOM1    40103
SYMPTOM2    36205
SYMPTOM3    30205
SYMPTOM4    24847
SYMPTOM5    20025
dtype: int64

In [7]:
# Check the unique value counts to see if binning is required.
# The call is left enabled so the table displayed under this cell is
# regenerated whenever the notebook is re-run from a fresh kernel.
vaxsymp["VAX_MANU"].value_counts()

MODERNA                 177058
PFIZER\BIONTECH         166125
JANSSEN                  40523
UNKNOWN MANUFACTURER       935
Name: VAX_MANU, dtype: int64

# Using OneHotEncoder to convert the categorical columns into 0/1 indicator features

In [8]:
# One-hot encode the categorical columns.
# The SYMPTOM* columns each hold tens of thousands of distinct values, so the
# encoded matrix is kept sparse: a dense float array with that many columns
# for every row would need far more memory than is practical.
# Using the encoder's default (sparse) output also avoids the `sparse=`
# keyword, which newer scikit-learn releases renamed to `sparse_output=`.
enc = OneHotEncoder()
encoded_matrix = enc.fit_transform(vaxsymp[vaxsymp_cat])

# Recover the generated column names. `get_feature_names` was superseded by
# `get_feature_names_out`; fall back to the old method on older scikit-learn.
if hasattr(enc, "get_feature_names_out"):
    encoded_columns = enc.get_feature_names_out(vaxsymp_cat)
else:
    encoded_columns = enc.get_feature_names(vaxsymp_cat)

# Wrap the sparse matrix in a DataFrame without densifying it. The frame gets
# a default 0..n-1 index, the same as wrapping a plain array would.
encode_df = pd.DataFrame.sparse.from_spmatrix(encoded_matrix, columns=encoded_columns)
encode_df.head()

Unnamed: 0,VAX_TYPE_COVID19,VAX_MANU_JANSSEN,VAX_MANU_MODERNA,VAX_MANU_PFIZER\BIONTECH,VAX_MANU_UNKNOWN MANUFACTURER,VAX_NAME_COVID19 (COVID19 (JANSSEN)),VAX_NAME_COVID19 (COVID19 (MODERNA)),VAX_NAME_COVID19 (COVID19 (PFIZER-BIONTECH)),VAX_NAME_COVID19 (COVID19 (UNKNOWN)),"SYMPTOM1_5-hydroxyindolacetic acid, Dyspnoea, Full blood count, Thyroxine",...,"SYMPTOM5_X-ray limb, nan",SYMPTOM5_X-ray normal,SYMPTOM5_X-ray of pelvis and hip,SYMPTOM5_X-ray of pelvis and hip abnormal,SYMPTOM5_X-ray of pelvis and hip normal,SYMPTOM5_X-ray with contrast lower gastrointestinal tract,"SYMPTOM5_X-ray, nan",SYMPTOM5_Xanthopsia,SYMPTOM5_Yawning,SYMPTOM5_Yellow skin
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Merge one-hot encoded features and drop the originals.
# The merge pairs rows by index label on both sides.
# NOTE: this cell reassigns `vaxsymp`, so it is not safe to run twice without
# re-running the cells that load the data first.
vaxsymp = vaxsymp.merge(encode_df, left_index=True, right_index=True)

# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# was deprecated by pandas and is rejected by pandas 2.x.
vaxsymp = vaxsymp.drop(columns=vaxsymp_cat)
vaxsymp.head()

# Old code below (these cells still reference `loans_df` and the loan-status columns, not the VAERS data)

In [7]:
# NOTE(review): `loans_df` is not defined in any cell shown above; this cell
# still targets the loan-status data and needs adapting before it can run here.

# Features are everything except the two loan-status indicator columns;
# the target is the "fully paid" indicator.
X = loans_df.drop(
    columns=["Loan_Status_Fully_Paid", "Loan_Status_Not_Paid"]
)
y = loans_df.Loan_Status_Fully_Paid

# Stratified train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# Fit the scaler on the training partition only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [8]:
# Build a 128-estimator random forest with a fixed seed and train it on the
# scaled training data in one step (`fit` returns the fitted estimator).
rf_model = RandomForestClassifier(random_state=78, n_estimators=128).fit(
    X_train_scaled, y_train
)

# Predict on the scaled test data and report the accuracy
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.849


In [9]:
# Deep neural net: two ReLU hidden layers feeding a single sigmoid output unit
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12
number_input_features = len(X_train_scaled[0])  # length of one scaled training row

# Declare the whole layer stack up front instead of adding layers one by one
nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Binary cross-entropy loss with the Adam optimizer; track accuracy
nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for 50 epochs on the scaled training data
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Score the trained network on the scaled test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
285/285 - 0s - loss: 0.3893 - accuracy: 0.8461
Loss: 0.38925760984420776, Accuracy: 0.8461453914642334
