# Data wrangling

Lecture 4: Tuberculosis in the United States

In [32]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt

# Ignore warnings
# NOTE: this silences every warning for the rest of the session (including
# pandas deprecation notices); it is kept to keep the lecture output clean.
import warnings
warnings.filterwarnings("ignore")

# Set style for seaborn plots.
# One call configures everything: a bare sns.set() resets the theme to its
# defaults (which discarded the earlier set_style('dark') request), and
# sns.color_palette(..., as_cmap=True) only returns a colormap object without
# changing any plotting defaults.
sns.set(style='dark', palette='viridis', context='talk')

# Compact display of numpy arrays and pandas frames
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

# Stop scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)

In [33]:
# First look at the raw CDC file with default parsing.
# The path uses a forward slash: '\c' is not a valid string escape, and a
# backslash separator only resolves on Windows.
tb = pd.read_csv('data/cdc_tuberculosis.csv')
tb

Unnamed: 0.1,Unnamed: 0,No. of TB cases,Unnamed: 2,Unnamed: 3,TB incidence,Unnamed: 5,Unnamed: 6
0,U.S. jurisdiction,2019,2020,2021,2019.00,2020.00,2021.00
1,Total,8900,7173,7860,2.71,2.16,2.37
2,Alabama,87,72,92,1.77,1.43,1.83
3,Alaska,58,58,58,7.91,7.92,7.92
4,Arizona,183,136,129,2.51,1.89,1.77
...,...,...,...,...,...,...,...
48,Virginia,191,169,161,2.23,1.96,1.86
49,Washington,221,163,199,2.90,2.11,2.57
50,West Virginia,9,13,7,0.50,0.73,0.39
51,Wisconsin,51,35,66,0.88,0.59,1.12


# Lecture 4. Data wrangling

---

## 1. Field names

Several unnamed columns

In [34]:
# Use the second line of the file (row index 1) as the header row.
# The path uses a forward slash: '\c' is not a valid string escape, and a
# backslash separator only resolves on Windows.
tb = pd.read_csv('data/cdc_tuberculosis.csv', header = 1) # row index = 1
tb

Unnamed: 0,U.S. jurisdiction,2019,2020,2021,2019.1,2020.1,2021.1
0,Total,8900,7173,7860,2.71,2.16,2.37
1,Alabama,87,72,92,1.77,1.43,1.83
2,Alaska,58,58,58,7.91,7.92,7.92
3,Arizona,183,136,129,2.51,1.89,1.77
4,Arkansas,64,59,69,2.12,1.96,2.28
...,...,...,...,...,...,...,...
47,Virginia,191,169,161,2.23,1.96,1.86
48,Washington,221,163,199,2.90,2.11,2.57
49,West Virginia,9,13,7,0.50,0.73,0.39
50,Wisconsin,51,35,66,0.88,0.59,1.12


Wait...but now we can't differentiate between the "Number of TB cases" and "TB incidence" year columns. pandas has tried to make our lives easier by automatically adding ".1" to the latter columns, but this doesn't help us as humans understand the data.

We can do this manually with `df.rename()` ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html?highlight=rename#pandas.DataFrame.rename)):

In [35]:
# Descriptive names for the ambiguous year headers: the plain years are the
# case counts, and the ".1" duplicates generated by pandas are the incidences.
rename_dict = {
    **{year: f'TB cases {year}' for year in ('2019', '2020', '2021')},
    **{f'{year}.1': f'TB incidence {year}' for year in ('2019', '2020', '2021')},
}

# Apply the mapping to the column labels; rename returns a new DataFrame
tb = tb.rename(rename_dict, axis="columns")
tb

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021
0,Total,8900,7173,7860,2.71,2.16,2.37
1,Alabama,87,72,92,1.77,1.43,1.83
2,Alaska,58,58,58,7.91,7.92,7.92
3,Arizona,183,136,129,2.51,1.89,1.77
4,Arkansas,64,59,69,2.12,1.96,2.28
...,...,...,...,...,...,...,...
47,Virginia,191,169,161,2.23,1.96,1.86
48,Washington,221,163,199,2.90,2.11,2.57
49,West Virginia,9,13,7,0.50,0.73,0.39
50,Wisconsin,51,35,66,0.88,0.59,1.12


---

## 2. Granularity

You might already be wondering: What's up with that first record?

Row 0 is what we call a **rollup record**, or summary record. It's often useful when displaying tables to humans. The **granularity** of record 0 (Totals) vs the rest of the records (States) is different.

Okay, EDA step two. How was the rollup record aggregated?

Let's check if Total TB cases is the sum of all state TB cases. If we sum over all rows, we should get **2x** the total cases in each of our TB cases by year (why?).

In [36]:
# Preview the first rows; row 0 is the 'Total' rollup record
tb.head()

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021
0,Total,8900,7173,7860,2.71,2.16,2.37
1,Alabama,87,72,92,1.77,1.43,1.83
2,Alaska,58,58,58,7.91,7.92,7.92
3,Arizona,183,136,129,2.51,1.89,1.77
4,Arkansas,64,59,69,2.12,1.96,2.28


In [37]:
tb.sum(axis = 0) # incorrect: the TB cases columns hold strings, so sum() concatenates them

U.S. jurisdiction    TotalAlabamaAlaskaArizonaArkansasCaliforniaCol...
TB cases 2019        8,9008758183642,111666718245583029973261085237...
TB cases 2020        7,1737258136591,706525417194122219282169239376...
TB cases 2021        7,8609258129691,750585443194992281064255127494...
TB incidence 2019                                               109.94
TB incidence 2020                                                93.09
TB incidence 2021                                               102.94
dtype: object

In [38]:
# Confirm that tb itself is a pandas DataFrame
type(tb)

pandas.core.frame.DataFrame

In [39]:
# Inspect the storage type of every column
tb.dtypes

U.S. jurisdiction     object
TB cases 2019         object
TB cases 2020         object
TB cases 2021         object
TB incidence 2019    float64
TB incidence 2020    float64
TB incidence 2021    float64
dtype: object

Looks like those commas are causing all TB cases to be read as the `object` datatype, or **storage type** (close to the Python string datatype), so pandas is concatenating strings instead of adding integers.

Fortunately `read_csv` also has a `thousands` parameter (for what it's worth, I didn't know this beforehand--I [googled](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) this):

In [40]:
# improve readability: chaining method calls with outer parentheses/line breaks
# thousands=',' makes read_csv parse values such as "8,900" as numbers.
# The path uses a forward slash: '\c' is not a valid string escape, and a
# backslash separator only resolves on Windows.
tb = (
    pd.read_csv("data/cdc_tuberculosis.csv", header = 1, thousands = ',')
    .rename(columns = rename_dict)
)
tb

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021
0,Total,8900,7173,7860,2.71,2.16,2.37
1,Alabama,87,72,92,1.77,1.43,1.83
2,Alaska,58,58,58,7.91,7.92,7.92
3,Arizona,183,136,129,2.51,1.89,1.77
4,Arkansas,64,59,69,2.12,1.96,2.28
...,...,...,...,...,...,...,...
47,Virginia,191,169,161,2.23,1.96,1.86
48,Washington,221,163,199,2.90,2.11,2.57
49,West Virginia,9,13,7,0.50,0.73,0.39
50,Wisconsin,51,35,66,0.88,0.59,1.12


In [41]:
tb.sum(axis = 0) # incorrect: the 'Total' rollup row is included, so each TB cases sum is double the true total

U.S. jurisdiction    TotalAlabamaAlaskaArizonaArkansasCaliforniaCol...
TB cases 2019                                                    17800
TB cases 2020                                                    14346
TB cases 2021                                                    15720
TB incidence 2019                                               109.94
TB incidence 2020                                                93.09
TB incidence 2021                                               102.94
dtype: object

State level granularity aka excluding the total row

In [42]:
# Skip the rollup record in position 0 and keep every row after it
state_tb = tb.iloc[1:]
state_tb

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021
1,Alabama,87,72,92,1.77,1.43,1.83
2,Alaska,58,58,58,7.91,7.92,7.92
3,Arizona,183,136,129,2.51,1.89,1.77
4,Arkansas,64,59,69,2.12,1.96,2.28
5,California,2111,1706,1750,5.35,4.32,4.46
...,...,...,...,...,...,...,...
47,Virginia,191,169,161,2.23,1.96,1.86
48,Washington,221,163,199,2.90,2.11,2.57
49,West Virginia,9,13,7,0.50,0.73,0.39
50,Wisconsin,51,35,66,0.88,0.59,1.12


---

## 3. Gather census data

U.S. Census population estimates [source](https://www.census.gov/data/tables/time-series/demo/popest/2010s-state-total.html) (2019), [source](https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-total.html) (2020-2021).

Running the below cells cleans the data. We encourage you to closely explore the CSV and study these lines after lecture...

There are a few new methods here:
* `df.convert_dtypes()` ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.convert_dtypes.html)) converts each column to the best possible dtype that supports `pd.NA` (here, whole-number float columns become integers); it is out of scope for the class.
* `df.dropna()` ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)) will be explained in more detail next time.

In [43]:
# 2010s census population estimates, loaded and cleaned in a single chain
census_2010s = (
    pd.read_csv("data/nst-est2019-01.csv", header=3, thousands=",")
    .reset_index()
    .drop(columns=["index", "Census", "Estimates Base"])
    .rename(columns={"Unnamed: 0": "Geographic area"})
    # "smart" dtype conversion of every column, use at your own risk
    .convert_dtypes()
    # discard rows with missing values (introduced properly next time)
    .dropna()
)
census_2010s.head(n=2)

Unnamed: 0,Geographic area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,309321666,311556874,313830990,315993715,318301008,320635163,322941311,324985539,326687501,328239523
1,Northeast,55380134,55604223,55775216,55901806,56006011,56034684,56042330,56059240,56046620,55982803


In [44]:
# The area names contain '.' characters; strip them from both ends
census_2010s = census_2010s.assign(
    **{'Geographic area': census_2010s['Geographic area'].str.strip('.')}
)
census_2010s.head()

Unnamed: 0,Geographic area,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,309321666,311556874,313830990,315993715,318301008,320635163,322941311,324985539,326687501,328239523
1,Northeast,55380134,55604223,55775216,55901806,56006011,56034684,56042330,56059240,56046620,55982803
2,Midwest,66974416,67157800,67336743,67560379,67745167,67860583,67987540,68126781,68236628,68329004
3,South,114866680,116006522,117241208,118364400,119624037,120997341,122351760,123542189,124569433,125580448
4,West,72100436,72788329,73477823,74167130,74925793,75742555,76559681,77257329,77834820,78347268


In [45]:
# Load the 2020-2022 census population estimates and clean them in one chain
census_2020s = (
    pd.read_csv("data/NST-EST2022-POP.csv", header=3, thousands=',')
    .rename(columns={'Unnamed: 0': 'Geographic area'})
    .drop(columns=["Unnamed: 1"])
    .dropna()
    # the area names contain '.' characters; strip them from both ends
    .assign(**{'Geographic area': lambda frame: frame['Geographic area'].str.strip('.')})
)
census_2020s

Unnamed: 0,Geographic area,2020,2021,2022
0,United States,331511512.00,332031554.00,333287557.00
1,Northeast,57448898.00,57259257.00,57040406.00
2,Midwest,68961043.00,68836505.00,68787595.00
3,South,126450613.00,127346029.00,128716192.00
4,West,78650958.00,78589763.00,78743364.00
...,...,...,...,...
52,Washington,7724031.00,7740745.00,7785786.00
53,West Virginia,1791420.00,1785526.00,1775156.00
54,Wisconsin,5896271.00,5880101.00,5892539.00
55,Wyoming,577605.00,579483.00,581381.00


---

## 4. Join data

Time to `merge`! Here I use the DataFrame method `df1.merge(right=df2, ...)` on DataFrame `df1` ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html)). Contrast this with the function `pd.merge(left=df1, right=df2, ...)` ([documentation](https://pandas.pydata.org/docs/reference/api/pandas.merge.html?highlight=pandas%20merge#pandas.merge)). Feel free to use either.

In [46]:
# Join the TB table to both census tables on the jurisdiction name.
# merge defaults to an inner join, so only names found in every table remain.
merged = (
    tb
    .merge(census_2010s, left_on='U.S. jurisdiction', right_on='Geographic area')
    .merge(census_2020s, left_on='U.S. jurisdiction', right_on='Geographic area')
)

merged

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021,Geographic area_x,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Geographic area_y,2020,2021,2022
0,Alabama,87,72,92,1.77,1.43,1.83,Alabama,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185,Alabama,5031362.00,5049846.00,5074296.00
1,Alaska,58,58,58,7.91,7.92,7.92,Alaska,713910,722128,730443,737068,736283,737498,741456,739700,735139,731545,Alaska,732923.00,734182.00,733583.00
2,Arizona,183,136,129,2.51,1.89,1.77,Arizona,6407172,6472643,6554978,6632764,6730413,6829676,6941072,7044008,7158024,7278717,Arizona,7179943.00,7264877.00,7359197.00
3,Arkansas,64,59,69,2.12,1.96,2.28,Arkansas,2921964,2940667,2952164,2959400,2967392,2978048,2989918,3001345,3009733,3017804,Arkansas,3014195.00,3028122.00,3045637.00
4,California,2111,1706,1750,5.35,4.32,4.46,California,37319502,37638369,37948800,38260787,38596972,38918045,39167117,39358497,39461588,39512223,California,39501653.00,39142991.00,39029342.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46,Virginia,191,169,161,2.23,1.96,1.86,Virginia,8023699,8101155,8185080,8252427,8310993,8361808,8410106,8463587,8501286,8535519,Virginia,8636471.00,8657365.00,8683619.00
47,Washington,221,163,199,2.90,2.11,2.57,Washington,6742830,6826627,6897058,6963985,7054655,7163657,7294771,7423362,7523869,7614893,Washington,7724031.00,7740745.00,7785786.00
48,West Virginia,9,13,7,0.50,0.73,0.39,West Virginia,1854239,1856301,1856872,1853914,1849489,1842050,1831023,1817004,1804291,1792147,West Virginia,1791420.00,1785526.00,1775156.00
49,Wisconsin,51,35,66,0.88,0.59,1.12,Wisconsin,5690475,5705288,5719960,5736754,5751525,5760940,5772628,5790186,5807406,5822434,Wisconsin,5896271.00,5880101.00,5892539.00


Let's use a for-loop and Python format strings to compute TB incidence for all years. Python f-strings are just used for the purposes of this demo, but they're handy to know when you explore data beyond this course ([Python documentation](https://docs.python.org/3/tutorial/inputoutput.html)).

In [47]:
# incidence = cases / population * 100,000, one new column per year
for year in (2019, 2020, 2021):
    merged[f"recompute incidence {year}"] = (
        merged[f"TB cases {year}"] / merged[str(year)] * 100_000
    )
merged

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021,Geographic area_x,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Geographic area_y,2020,2021,2022,recompute incidence 2019,recompute incidence 2020,recompute incidence 2021
0,Alabama,87,72,92,1.77,1.43,1.83,Alabama,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185,Alabama,5031362.00,5049846.00,5074296.00,1.77,1.43,1.82
1,Alaska,58,58,58,7.91,7.92,7.92,Alaska,713910,722128,730443,737068,736283,737498,741456,739700,735139,731545,Alaska,732923.00,734182.00,733583.00,7.93,7.91,7.90
2,Arizona,183,136,129,2.51,1.89,1.77,Arizona,6407172,6472643,6554978,6632764,6730413,6829676,6941072,7044008,7158024,7278717,Arizona,7179943.00,7264877.00,7359197.00,2.51,1.89,1.78
3,Arkansas,64,59,69,2.12,1.96,2.28,Arkansas,2921964,2940667,2952164,2959400,2967392,2978048,2989918,3001345,3009733,3017804,Arkansas,3014195.00,3028122.00,3045637.00,2.12,1.96,2.28
4,California,2111,1706,1750,5.35,4.32,4.46,California,37319502,37638369,37948800,38260787,38596972,38918045,39167117,39358497,39461588,39512223,California,39501653.00,39142991.00,39029342.00,5.34,4.32,4.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46,Virginia,191,169,161,2.23,1.96,1.86,Virginia,8023699,8101155,8185080,8252427,8310993,8361808,8410106,8463587,8501286,8535519,Virginia,8636471.00,8657365.00,8683619.00,2.24,1.96,1.86
47,Washington,221,163,199,2.90,2.11,2.57,Washington,6742830,6826627,6897058,6963985,7054655,7163657,7294771,7423362,7523869,7614893,Washington,7724031.00,7740745.00,7785786.00,2.90,2.11,2.57
48,West Virginia,9,13,7,0.50,0.73,0.39,West Virginia,1854239,1856301,1856872,1853914,1849489,1842050,1831023,1817004,1804291,1792147,West Virginia,1791420.00,1785526.00,1775156.00,0.50,0.73,0.39
49,Wisconsin,51,35,66,0.88,0.59,1.12,Wisconsin,5690475,5705288,5719960,5736754,5751525,5760940,5772628,5790186,5807406,5822434,Wisconsin,5896271.00,5880101.00,5892539.00,0.88,0.59,1.12


This is a little unwieldy. We could either drop the unneeded columns now, or just merge on smaller census DataFrames. Let's do the latter.

In [48]:
# Second attempt at the merge: bring in only the population columns we need
# and drop the duplicated key column after each join.
pop_2019 = census_2010s[["Geographic area", "2019"]]
pop_2020_2021 = census_2020s[["Geographic area", "2020", "2021"]]

tb_census = (
    tb
    .merge(pop_2019, left_on="U.S. jurisdiction", right_on="Geographic area")
    .drop(columns="Geographic area")
    .merge(pop_2020_2021, left_on="U.S. jurisdiction", right_on="Geographic area")
    .drop(columns="Geographic area")
)
tb_census

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021,2019,2020,2021
0,Alabama,87,72,92,1.77,1.43,1.83,4903185,5031362.00,5049846.00
1,Alaska,58,58,58,7.91,7.92,7.92,731545,732923.00,734182.00
2,Arizona,183,136,129,2.51,1.89,1.77,7278717,7179943.00,7264877.00
3,Arkansas,64,59,69,2.12,1.96,2.28,3017804,3014195.00,3028122.00
4,California,2111,1706,1750,5.35,4.32,4.46,39512223,39501653.00,39142991.00
...,...,...,...,...,...,...,...,...,...,...
46,Virginia,191,169,161,2.23,1.96,1.86,8535519,8636471.00,8657365.00
47,Washington,221,163,199,2.90,2.11,2.57,7614893,7724031.00,7740745.00
48,West Virginia,9,13,7,0.50,0.73,0.39,1792147,1791420.00,1785526.00
49,Wisconsin,51,35,66,0.88,0.59,1.12,5822434,5896271.00,5880101.00


Recomputing incidence:
TB incidence is computed as “Cases per 100,000 persons using mid-year population estimates from the U.S. Census Bureau.”

In [49]:
# TB cases / (population / 100000) = TB cases / population * 100000
for year in (2019, 2020, 2021):
    tb_census[f'recomputed_incidence_{year}'] = (
        tb_census[f'TB cases {year}'] / tb_census[str(year)] * 100_000
    )

tb_census

Unnamed: 0,U.S. jurisdiction,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021,2019,2020,2021,recomputed_incidence_2019,recomputed_incidence_2020,recomputed_incidence_2021
0,Alabama,87,72,92,1.77,1.43,1.83,4903185,5031362.00,5049846.00,1.77,1.43,1.82
1,Alaska,58,58,58,7.91,7.92,7.92,731545,732923.00,734182.00,7.93,7.91,7.90
2,Arizona,183,136,129,2.51,1.89,1.77,7278717,7179943.00,7264877.00,2.51,1.89,1.78
3,Arkansas,64,59,69,2.12,1.96,2.28,3017804,3014195.00,3028122.00,2.12,1.96,2.28
4,California,2111,1706,1750,5.35,4.32,4.46,39512223,39501653.00,39142991.00,5.34,4.32,4.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46,Virginia,191,169,161,2.23,1.96,1.86,8535519,8636471.00,8657365.00,2.24,1.96,1.86
47,Washington,221,163,199,2.90,2.11,2.57,7614893,7724031.00,7740745.00,2.90,2.11,2.57
48,West Virginia,9,13,7,0.50,0.73,0.39,1792147,1791420.00,1785526.00,0.50,0.73,0.39
49,Wisconsin,51,35,66,0.88,0.59,1.12,5822434,5896271.00,5880101.00,0.88,0.59,1.12


In [50]:
# Summary statistics of the numeric columns, to compare reported and recomputed incidence
tb_census.describe()

Unnamed: 0,TB cases 2019,TB cases 2020,TB cases 2021,TB incidence 2019,TB incidence 2020,TB incidence 2021,2019,2020,2021,recomputed_incidence_2019,recomputed_incidence_2020,recomputed_incidence_2021
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,174.51,140.65,154.12,2.1,1.78,1.97,6436069.08,6500225.73,6510422.63,2.1,1.78,1.97
std,341.74,271.06,286.78,1.5,1.34,1.48,7360660.47,7408168.46,7394300.08,1.5,1.34,1.47
min,1.0,0.0,2.0,0.17,0.0,0.21,578759.0,577605.0,579483.0,0.17,0.0,0.21
25%,25.5,29.0,23.0,1.29,1.21,1.23,1789606.0,1820311.0,1844920.0,1.3,1.21,1.23
50%,70.0,67.0,69.0,1.8,1.52,1.7,4467673.0,4507445.0,4506589.0,1.81,1.52,1.69
75%,180.5,139.0,150.0,2.58,1.99,2.22,7446805.0,7451987.0,7502811.0,2.58,1.99,2.22
max,2111.0,1706.0,1750.0,7.91,7.92,7.92,39512223.0,39501653.0,39142991.0,7.93,7.91,7.9
