# Preprocessing Data

In [None]:
# Make Google Drive available to this Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [None]:
# Load TB_Data_2000_2022_total_count.csv from the mounted Drive and display it
tb_csv_path = "/content/drive/My Drive/project4/resources/TB_Data_2000_2022_total_count.csv"
df = pd.read_csv(filepath_or_buffer=tb_csv_path)
df

Unnamed: 0,Indicator,Year,Geography,FIPS,Cases,Rate per 100000
0,Tuberculosis,2000,Alabama,1,310,7.0
1,Tuberculosis,2001,Alabama,1,264,5.9
2,Tuberculosis,2002,Alabama,1,233,5.2
3,Tuberculosis,2003,Alabama,1,258,5.7
4,Tuberculosis,2004,Alabama,1,211,4.7
...,...,...,...,...,...,...
1168,Tuberculosis,2018,Wyoming,56,1,0.2
1169,Tuberculosis,2019,Wyoming,56,1,0.2
1170,Tuberculosis,2020 (COVID-19 Pandemic),Wyoming,56,0,0.0
1171,Tuberculosis,2021,Wyoming,56,3,0.5


In [None]:
# Inspect the dtype pandas inferred for each column of the aggregate case count dataset.
# Columns reported as `object` were not parsed as numbers and must be cleaned
# before they can be used numerically.
df.dtypes

Indicator           object
Year                object
Geography           object
FIPS                 int64
Cases               object
Rate per 100000    float64
dtype: object

In [None]:
# The 2020 rows carry an annotated label in the Year column; collapse it to the plain year string
annotated_2020 = '2020 (COVID-19 Pandemic)'
needs_fix = df['Year'] == annotated_2020
df.loc[needs_fix, 'Year'] = '2020'

In [None]:
# Remove any commas from the Year and Cases values, then cast both columns to integers
int_columns = ['Year', 'Cases']
without_commas = df[int_columns].replace(',', '', regex=True)
df[int_columns] = without_commas.astype(int)

In [None]:
# divide the years into pre- and post-covid

# first create a function to categorize pre-/post-covid
def covid(year, first_covid_year=2020):
    """Label a year as falling before or from the start of the COVID-19 period.

    Parameters
    ----------
    year : int
        Calendar year to classify.
    first_covid_year : int, default 2020
        First year that is labelled 'Post'. The default reproduces the
        original hard-coded cutoff, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    str
        'Pre' when ``year < first_covid_year``, otherwise 'Post'.
    """
    return 'Pre' if year < first_covid_year else 'Post'

# Label every row by running covid() on its Year value, then display the frame
df['COVID'] = [covid(row_year) for row_year in df['Year']]
df

Unnamed: 0,Indicator,Year,Geography,FIPS,Cases,Rate per 100000,COVID
0,Tuberculosis,2000,Alabama,1,310,7.0,Pre
1,Tuberculosis,2001,Alabama,1,264,5.9,Pre
2,Tuberculosis,2002,Alabama,1,233,5.2,Pre
3,Tuberculosis,2003,Alabama,1,258,5.7,Pre
4,Tuberculosis,2004,Alabama,1,211,4.7,Pre
...,...,...,...,...,...,...,...
1168,Tuberculosis,2018,Wyoming,56,1,0.2,Pre
1169,Tuberculosis,2019,Wyoming,56,1,0.2,Pre
1170,Tuberculosis,2020,Wyoming,56,0,0.0,Post
1171,Tuberculosis,2021,Wyoming,56,3,0.5,Post


In [None]:
# Remove the Indicator, FIPS and per-100000 rate columns before encoding
unused_columns = ['Indicator', 'FIPS', 'Rate per 100000']
df = df.drop(unused_columns, axis=1)

In [None]:
# One-hot encode the text columns (Geography and COVID) as 0/1 integer indicators
# with 'pd.get_dummies'; the integer Year and Cases columns pass through unchanged
numeric = pd.get_dummies(df, columns=['Geography', 'COVID'], dtype=int)
numeric

Unnamed: 0,Year,Cases,Geography_Alabama,Geography_Alaska,Geography_Arizona,Geography_Arkansas,Geography_California,Geography_Colorado,Geography_Connecticut,Geography_Delaware,...,Geography_Texas,Geography_Utah,Geography_Vermont,Geography_Virginia,Geography_Washington,Geography_West Virginia,Geography_Wisconsin,Geography_Wyoming,COVID_Post,COVID_Pre
0,2000,310,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2001,264,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2002,233,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2003,258,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2004,211,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,2018,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1169,2019,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1170,2020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1171,2021,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [None]:
# filter pre- and post-COVID data
pre_covid_data = numeric[numeric['COVID_Pre'] == 1]
post_covid_data = numeric[numeric['COVID_Post'] == 1]

# Columns that must not be used as model features:
#   - 'Cases' is the prediction target.
#   - 'COVID_Pre' / 'COVID_Post' are constant inside each subset, so they carry
#     no information. Worse, a scaler fitted on pre-COVID rows cannot rescale a
#     zero-variance column: the indicators reach the network as 0 during
#     training but as -1 / +1 on post-COVID rows, so evaluation would run
#     through first-layer weights that never received a gradient.
non_feature_columns = ['Cases', 'COVID_Pre', 'COVID_Post']

# pre-COVID data: features (Year + state indicators) and target
X_pre_covid = pre_covid_data.drop(columns=non_feature_columns)
y_pre_covid = pre_covid_data['Cases']

# post-COVID data: same feature columns, same target
X_post_covid = post_covid_data.drop(columns=non_feature_columns)
y_post_covid = post_covid_data['Cases']

In [None]:
# Hold out part of each period for testing; the fixed seed keeps both splits repeatable
split_seed = 1

# pre-COVID: training and testing partitions
(X_train_pre_covid,
 X_test_pre_covid,
 y_train_pre_covid,
 y_test_pre_covid) = train_test_split(X_pre_covid, y_pre_covid, random_state=split_seed)

# post-COVID: training and testing partitions
(X_train_post_covid,
 X_test_post_covid,
 y_train_post_covid,
 y_test_post_covid) = train_test_split(X_post_covid, y_post_covid, random_state=split_seed)

In [None]:
# Create a StandardScaler and fit it on the pre-COVID training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_pre_covid)

# Apply that single fitted scaler to every partition, pre- and post-COVID alike
X_train_pre_covid_scaled, X_test_pre_covid_scaled = (
    X_scaler.transform(part) for part in (X_train_pre_covid, X_test_pre_covid)
)
X_train_post_covid_scaled, X_test_post_covid_scaled = (
    X_scaler.transform(part) for part in (X_train_post_covid, X_test_post_covid)
)

# Compile, Train and Evaluate the Model

In [None]:
# define the model

# Seed Python, NumPy and TensorFlow so that weight initialisation (and the
# shuffling performed later by fit) is repeatable from run to run.
tf.keras.utils.set_random_seed(1)

# one network input per scaled feature column
input_features = X_train_pre_covid_scaled.shape[1]
nodes1 = 100
nodes2 = 100

nn_model = tf.keras.models.Sequential()

# first hidden layer (tanh); also declares the input width
nn_model.add(tf.keras.layers.Dense(units=nodes1, input_dim=input_features, activation='tanh'))

# second hidden layer (ReLU)
nn_model.add(tf.keras.layers.Dense(units=nodes2, activation='relu'))

# output layer: a single unit with no activation, i.e. an unbounded
# real-valued prediction of the case count
nn_model.add(tf.keras.layers.Dense(units=1))

# check the structure of the model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               5500      
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 15701 (61.33 KB)
Trainable params: 15701 (61.33 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# compile the model
# The target (Cases) is a continuous count, so this is a regression problem.
# 'accuracy' is a classification metric and is not meaningful here (it reports
# 0.0), so track mean absolute error instead, which is expressed in cases.
# NOTE: evaluate() consequently returns [loss (MSE), mean absolute error].
nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

In [None]:
# Fit the network on the scaled pre-COVID training partition
n_epochs = 50
fit_model = nn_model.fit(
    x=X_train_pre_covid_scaled,
    y=y_train_pre_covid,
    epochs=n_epochs,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# evaluate the model on the held-out post-COVID test data
# return_dict=True keys every value by the metric name Keras reports, so the
# printed labels always describe what was actually measured instead of relying
# on a hard-coded "Accuracy" label.
eval_metrics = nn_model.evaluate(
    X_test_post_covid_scaled,
    y_test_post_covid,
    verbose=2,
    return_dict=True,
)

# Keep the original variable names for any later cell that reads them:
# the first entry is the loss, the second is the single compiled metric.
model_loss, model_accuracy = eval_metrics.values()
print(", ".join(f"{name}: {value}" for name, value in eval_metrics.items()))

2/2 - 0s - loss: 9227.4463 - accuracy: 0.0000e+00 - 237ms/epoch - 119ms/step
Loss: 9227.4462890625, Accuracy: 0.0
