In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Plot appearance: default figure size, then seaborn's theme and the 'talk' context.
plt.rcParams.update({'figure.figsize': (12, 9)})

sns.set()
sns.set_context(context='talk')
# Console/table display settings for numpy arrays and pandas frames.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

# pandas option keys are dotted: the key is 'display.float_format'.
# An underscore-joined key matches no registered option and raises OptionError.
pd.set_option('display.float_format', '{:.2f}'.format)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# read_html returns a list with one DataFrame per HTML table found at the URL;
# keep the full list in `tbls` and look at the first table.
tbls = pd.read_html(
    io="https://www.cdc.gov/mmwr/volumes/71/wr/mm7112a1.htm?s_cid=mm7112a1_w#T1_down"
)
df = tbls[0]
df

# Print the first four lines of the raw file exactly as text (newlines included).
with open("data/cdc_tuberculosis.csv", mode="r") as f:
    for i, row in enumerate(f):
        print(row)
        if i == 3:
            break

# Same peek, but print repr(row) so whitespace and newline characters are visible.
with open("data/cdc_tuberculosis.csv", mode="r") as f:
    for i, row in enumerate(f):
        print(repr(row))
        if i == 3:
            break

# Alternative peek: read every line into a list, then slice off the first four.
# (The `for` header needs its trailing colon; without it the cell is a SyntaxError.)
with open("data/cdc_tuberculosis.csv", "r") as f:
    for row in f.readlines()[:4]:
        print(repr(row))

# Alternative peek: zip the file iterator with range(4) so iteration stops after
# four lines. Reads the same .csv file as the other cells in this notebook.
with open("data/cdc_tuberculosis.csv", "r") as f:
    for _, row in zip(range(4), f):
        print(repr(row))

# First attempt: parse with pandas defaults (the first file line becomes the header).
tb_df = pd.read_csv("data/cdc_tuberculosis.csv")
tb_df

# Second attempt: use the second file line (0-based row 1) as the header row.
tb_df = pd.read_csv(filepath_or_buffer="data/cdc_tuberculosis.csv", header=1)
tb_df

# Map the parsed year columns to descriptive names: the plain year labels become
# "TB cases <year>" and the ".1"-suffixed duplicates become "TB incidence <year>".
rename_dict = {
    f"{year}{suffix}": f"TB {measure} {year}"
    for suffix, measure in (("", "cases"), (".1", "incidence"))
    for year in (2019, 2020, 2021)
}

# Apply the descriptive column names.
tb_df = tb_df.rename(mapper=rename_dict, axis="columns")
tb_df

tb_df.head(n=5)

# Drop the row labelled 0 before aggregating.
tb_df.drop(index=0)

tb_df.drop(index=0).sum()

# Inspect the column dtypes.
tb_df.dtypes

# Re-read the file, telling pandas that ',' is a thousands separator, then apply
# the same descriptive column names.
tb_df = pd.read_csv("data/cdc_tuberculosis.csv", header=1, thousands=',')
tb_df = tb_df.rename(columns=rename_dict)
tb_df

tb_df.drop(index=0).sum()

tb_df.head(n=1)

# Load the 2010s census estimates; the header is on file row 3 (0-based) and
# numbers use ',' as a thousands separator.
census_2010s_df = pd.read_csv("data/nst-est2019-01.csv", header=3, thousands=",")
census_2010s_df

# Clean up the frame:
#  - pandas labels a header cell with no name as "Unnamed: <position>" (colon,
#    then one space), so the key must be exactly "Unnamed: 0"; a misspelled key
#    makes rename() a silent no-op and the "Geographic Area" lookup below fails.
#  - drop the two non-yearly columns, switch to nullable dtypes, and remove rows
#    containing missing values.
census_2010s_df = (
    census_2010s_df
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .drop(columns=["Census", "Estimates Base"])
    .convert_dtypes()
    .dropna()
)
census_2010s_df



# Remove leading/trailing '.' characters from the area names.
census_2010s_df['Geographic Area'] = census_2010s_df['Geographic Area'].str.strip('.')
census_2010s_df



# Load the 2020s census estimates; the header is on file row 3 (0-based) and
# numbers use ',' as a thousands separator.
census_2020s_df = pd.read_csv("data/NST-EST2022-POP.csv", header=3, thousands=",")

# pandas labels header cells with no name as "Unnamed: <position>" (no space
# before the colon). With any other spelling, drop() raises KeyError and
# rename() silently does nothing.
census_2020s_df = (
    census_2020s_df
    .drop(columns=["Unnamed: 1"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()
    .dropna()
)

# Remove leading/trailing '.' characters from the area names.
census_2020s_df['Geographic Area'] = census_2020s_df['Geographic Area'].str.strip('.')
census_2020s_df

# Show the last two rows of each frame, one rich display per frame.
display(
    tb_df.tail(2),
    census_2010s_df.tail(2),
    census_2020s_df.tail(2),
)



In [None]:
# Join the TB table to both census tables on the jurisdiction / area name,
# keeping every column from all three frames.
tb_census_df = pd.merge(
    pd.merge(
        tb_df,
        census_2010s_df,
        left_on="U.S. jurisdiction",
        right_on="Geographic Area",
    ),
    census_2020s_df,
    left_on="U.S. jurisdiction",
    right_on="Geographic Area",
)
tb_census_df.tail()

In [None]:
# Same join, but bring in only the population columns that are needed and drop
# the redundant "Geographic Area" key column after each merge.
tb_census_df = (
    pd.merge(
        tb_df,
        census_2010s_df[["Geographic Area", "2019"]],
        left_on="U.S. jurisdiction",
        right_on="Geographic Area",
    )
    .drop(columns=["Geographic Area"])
)
tb_census_df = (
    pd.merge(
        tb_census_df,
        census_2020s_df[["Geographic Area", "2020", "2021"]],
        left_on="U.S. jurisdiction",
        right_on="Geographic Area",
    )
    .drop(columns=["Geographic Area"])
)
tb_census_df.tail()

In [None]:
# Incidence per 100,000 people for 2019: cases divided by the "2019" column,
# scaled by 100,000.
tb_census_df["recompute incidence 2019"] = (
    tb_census_df["TB cases 2019"]
    .div(tb_census_df["2019"])
    .mul(100_000)
)
tb_census_df

In [None]:
# Repeat the per-100,000 calculation for each of the three years.
for year in range(2019, 2022):
    tb_census_df[f"recompute incidence {year}"] = (
        tb_census_df[f"TB cases {year}"]
        .div(tb_census_df[str(year)])
        .mul(100_000)
    )
tb_census_df

In [None]:
# Summary statistics for the merged frame's columns.
tb_census_df.describe()

In [None]:
# Move the jurisdiction names into the index so later joins can align on it.
# NOTE: this consumes the "U.S. jurisdiction" column, so the cell is meant to run once.
tb_df = tb_df.set_index(keys="U.S. jurisdiction")
tb_df

In [None]:
# Move the area names into the index so later joins can align on it.
# NOTE: this consumes the "Geographic Area" column, so the cell is meant to run once.
census_2010s_df = census_2010s_df.set_index(keys="Geographic Area")
census_2010s_df

In [None]:
# Move the area names into the index so later joins can align on it.
# NOTE: this consumes the "Geographic Area" column, so the cell is meant to run once.
census_2020s_df = census_2020s_df.set_index(keys="Geographic Area")
census_2020s_df

In [None]:
# Show the first rows of the TB frame to check its index labels.
tb_df.head()

In [None]:
# Display the 2010s census frame to compare its index labels with tb_df's.
census_2010s_df

In [None]:
# Relabel the 'United States' index entry as 'Total'.
# Reassign instead of using inplace=True so the frame object displayed by
# earlier cells is not mutated behind the reader's back.
census_2010s_df = census_2010s_df.rename(index={'United States': 'Total'})
census_2010s_df

In [None]:
# Relabel the 'United States' index entry as 'Total'.
# Reassign instead of using inplace=True so the frame object displayed by
# earlier cells is not mutated behind the reader's back.
census_2020s_df = census_2020s_df.rename(index={'United States': 'Total'})
census_2020s_df

In [None]:
# Join on the shared index labels, pulling in only the needed population columns.
tb_census_df = pd.merge(
    pd.merge(
        tb_df,
        census_2010s_df[["2019"]],
        left_index=True,
        right_index=True,
    ),
    census_2020s_df[["2020", "2021"]],
    left_index=True,
    right_index=True,
)
tb_census_df

In [None]:
# Incidence per 100,000 people for each year: cases / population * 100000.
for year in range(2019, 2022):
    tb_census_df[f"recompute incidence {year}"] = (
        tb_census_df[f"TB cases {year}"]
        .div(tb_census_df[str(year)])
        .mul(100000)
    )
tb_census_df

In [None]:
# Display the merged frame, including the recomputed incidence columns.
tb_census_df

In [None]:
# Recomputed 2020 incidence for the row labelled 'Total'.
incidence_2020 = tb_census_df['recompute incidence 2020'].loc['Total']
incidence_2020

In [None]:
# Recomputed 2021 incidence for the row labelled 'Total'.
incidence_2021 = tb_census_df['recompute incidence 2021'].loc['Total']
incidence_2021

In [None]:
# Percent change in the 'Total' incidence from 2020 to 2021.
difference = (
    ((incidence_2021 - incidence_2020) / incidence_2020) * 100
)
difference