# Preprocessing Data

In [1]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [3]:
# read in the TB_Data_2000_2022_total_count.csv
df = pd.read_csv("/content/drive/My Drive/project4/resources/TB_Data_2000_2022_total_count.csv")
df

Unnamed: 0,Indicator,Year,Geography,FIPS,Cases,Rate per 100000
0,Tuberculosis,2000,Alabama,1,310,7.0
1,Tuberculosis,2001,Alabama,1,264,5.9
2,Tuberculosis,2002,Alabama,1,233,5.2
3,Tuberculosis,2003,Alabama,1,258,5.7
4,Tuberculosis,2004,Alabama,1,211,4.7
...,...,...,...,...,...,...
1168,Tuberculosis,2018,Wyoming,56,1,0.2
1169,Tuberculosis,2019,Wyoming,56,1,0.2
1170,Tuberculosis,2020 (COVID-19 Pandemic),Wyoming,56,0,0.0
1171,Tuberculosis,2021,Wyoming,56,3,0.5


In [4]:
# check the data types in the aggregate case count dataset
df.dtypes

Indicator           object
Year                object
Geography           object
FIPS                 int64
Cases               object
Rate per 100000    float64
dtype: object

In [5]:
# update "2020 (COVID-19 Pandemic)" under the Year column to just the year
df.loc[df['Year'] == '2020 (COVID-19 Pandemic)', 'Year'] = '2020'

In [6]:
# convert the Year and Cases columns' data type to integer
df[['Year','Cases']] = df.replace(',', '', regex=True)[['Year','Cases']].astype(int)

In [7]:
# divide the years into pre- and post-covid

# first create a function to categorize pre-/post-covid
def covid(year):
    if year < 2020:
        return 'Pre-COVID'
    else:
        return 'Post-COVID'

# apply the function to the Year column using the apply() function
df['Year'] = df['Year'].apply(covid)
df

Unnamed: 0,Indicator,Year,Geography,FIPS,Cases,Rate per 100000
0,Tuberculosis,Pre-COVID,Alabama,1,310,7.0
1,Tuberculosis,Pre-COVID,Alabama,1,264,5.9
2,Tuberculosis,Pre-COVID,Alabama,1,233,5.2
3,Tuberculosis,Pre-COVID,Alabama,1,258,5.7
4,Tuberculosis,Pre-COVID,Alabama,1,211,4.7
...,...,...,...,...,...,...
1168,Tuberculosis,Pre-COVID,Wyoming,56,1,0.2
1169,Tuberculosis,Pre-COVID,Wyoming,56,1,0.2
1170,Tuberculosis,Post-COVID,Wyoming,56,0,0.0
1171,Tuberculosis,Post-COVID,Wyoming,56,3,0.5


In [8]:
# drop the Indicator, FIPS and Rate columns
df = df.drop(columns=['Indicator', 'FIPS', 'Rate per 100000'])

In [9]:
# convert the categorical data (Year and State) to numeric with 'pd.get_dummies'
numeric = pd.get_dummies(df, dtype=int)
numeric.head()

Unnamed: 0,Cases,Year_Post-COVID,Year_Pre-COVID,Geography_Alabama,Geography_Alaska,Geography_Arizona,Geography_Arkansas,Geography_California,Geography_Colorado,Geography_Connecticut,...,Geography_South Dakota,Geography_Tennessee,Geography_Texas,Geography_Utah,Geography_Vermont,Geography_Virginia,Geography_Washington,Geography_West Virginia,Geography_Wisconsin,Geography_Wyoming
0,310,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,264,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,233,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,258,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,211,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# filter pre- and post-COVID data
pre_covid_data = numeric[numeric['Year_Pre-COVID'] == 1]
post_covid_data = numeric[numeric['Year_Post-COVID'] == 1]

# pre-COVID data
X_pre_covid = pre_covid_data.drop(['Cases'], axis=1)
y_pre_covid = pre_covid_data['Cases']

# post-COVID data
X_post_covid = post_covid_data.drop(['Cases'], axis=1)
y_post_covid = post_covid_data['Cases']

In [16]:
# split pre-COVID data into training and testing datasets
X_train_pre_covid, X_test_pre_covid, y_train_pre_covid, y_test_pre_covid = train_test_split(X_pre_covid, y_pre_covid, random_state=1)

# split post-COVID data into training and testing datasets
X_train_post_covid, X_test_post_covid, y_train_post_covid, y_test_post_covid = train_test_split(X_post_covid, y_post_covid, random_state=1)

In [17]:
# create a StandardScaler instances
scaler = StandardScaler()

# fit the StandardScaler on pre-COVID (training) data
X_scaler = scaler.fit(X_train_pre_covid)

# transform the training and testing data using the fitted scaler
X_train_pre_covid_scaled = scaler.transform(X_train_pre_covid)
X_test_pre_covid_scaled = scaler.transform(X_test_pre_covid)
X_train_post_covid_scaled = scaler.transform(X_train_post_covid)
X_test_post_covid_scaled = scaler.transform(X_test_post_covid)

# Compile, Train and Evaluate the Model

In [42]:
# define the model
input_features = len(X_train_pre_covid_scaled[0])
nodes1 = 2
# nodes2 = 1

nn_model = tf.keras.models.Sequential()

# first hidden layer
nn_model.add(tf.keras.layers.Dense(units=nodes1, input_dim=input_features, activation='relu'))

# second hidden layer
# nn_model.add(tf.keras.layers.Dense(units=nodes2, activation='relu'))

# output layer
nn_model.add(tf.keras.layers.Dense(units=1))

# check the structure of the model
nn_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 2)                 108       
                                                                 
 dense_17 (Dense)            (None, 1)                 3         
                                                                 
Total params: 111 (444.00 Byte)
Trainable params: 111 (444.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
# compile the model
nn_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [44]:
# train the model
fit_model = nn_model.fit(X_train_pre_covid_scaled, y_train_pre_covid, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [32]:
# evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_post_covid_scaled,y_test_post_covid,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 22582.3223 - accuracy: 0.0000e+00 - 108ms/epoch - 54ms/step
Loss: 22582.322265625, Accuracy: 0.0
