In [1]:
 # Dependencies
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import tabula #read tables in PDFs
from sqlalchemy import create_engine, inspect

## Ohio income tax by county

In [2]:
# Extract the county income-tax table from the remote annual-report PDF.
# Table title: "Individual Income Tax: Taxable Year 2018 IT-1040 Returns, by County"
tax_ohio_url = (
    "https://tax.ohio.gov/static/communications/publications/"
    "annual_reports/2020annualreport.pdf"
)
# PDF page on which the table is printed
income_tax_page_num = 165

# tabula returns a list of DataFrames, one per table detected on the page
income_tax_table = tabula.read_pdf(
    tax_ohio_url,
    pages=income_tax_page_num,
)

In [3]:
# Lift the column-width limit so the long, merged text cells are shown in full
pd.set_option('display.max_colwidth', None)

# The page yields a list of tables; the first entry is the one to clean
income_pdf_df = income_tax_table[0]

# Show the raw extraction before any cleaning
income_pdf_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Table 21,,,,
1,"Individual Income Tax: Taxable Year 2018 IT-1040 Returns, by County",,,,
2,Number of Federal Adjusted Ohio Income Tax,Number of,,Federal Adjusted,Ohio Income Tax
3,County Returns Gross Income Liability County,Returns,,Gross Income,Liability
4,"Adams 10,801 $510,636,078 $9,137,991 Lucas",192188,$,11495307630,"$253,783,325"
5,"Allen 47,553 $2,652,560,940 $53,819,629 Madison",19265,$,1239937113,"$26,196,454"
6,"Ashland 24,343 $1,283,000,065 $24,930,240 Mahoning",105957,$,5954785375,"$121,834,737"
7,"Ashtabula 42,985 $2,059,532,429 $38,304,615 Marion",26974,$,1322750746,"$24,802,115"
8,"Athens 22,539 $1,167,323,875 $23,709,431 Medina",91606,$,6924402532,"$163,550,630"
9,"Auglaize 22,427 $1,371,905,585 $29,294,222 Meigs",8936,$,418893584,"$7,759,649"


In [4]:
# change column names
# The real header is spread over two extracted rows (positions 2 and 3);
# join them cell by cell to get one label per column.
header_names = income_pdf_df.iloc[2] + " " + income_pdf_df.iloc[3]

# remove unnecessary headings (title rows 0-1 and header rows 2-3).
# drop() returns a new frame, so renaming its columns afterwards leaves the
# raw table held in income_tax_table untouched.
income_pdf_df = income_pdf_df.drop([0, 1, 2, 3])

# the column that only holds the "$" sign has no header text (NaN);
# name it "Dollar"
income_pdf_df.columns = header_names.fillna('Dollar')

# remove dollar signs and thousands separators from the cell contents.
# A raw-string character class is used: '\$' and '\,' in a normal string
# literal are invalid escape sequences and trigger a warning on current
# Python versions.
income_pdf_df = income_pdf_df.replace(r'[$,]', '', regex=True)

# the dataframe after the above process
income_pdf_df.head()

Unnamed: 0,Number of Federal Adjusted Ohio Income Tax County Returns Gross Income Liability County,Number of Returns,Dollar,Federal Adjusted Gross Income,Ohio Income Tax Liability
4,Adams 10801 510636078 9137991 Lucas,192188,,11495307630,253783325
5,Allen 47553 2652560940 53819629 Madison,19265,,1239937113,26196454
6,Ashland 24343 1283000065 24930240 Mahoning,105957,,5954785375,121834737
7,Ashtabula 42985 2059532429 38304615 Marion,26974,,1322750746,24802115
8,Athens 22539 1167323875 23709431 Medina,91606,,6924402532,163550630


In [5]:
# prepare a data frame to split column values
# The first column holds the whole left half of the table (plus the
# right-hand county name) as one string. Rename it by position rather than by
# its generated header text: that text depends on the exact whitespace the
# PDF extraction produced, and a mismatch would silently skip the rename.
left_half_column = income_pdf_df.columns[0]
income_pdf_split = income_pdf_df.rename(columns={left_half_column: 'County'}).copy()

# split the first column content on every single space; consecutive spaces in
# the text therefore show up as empty columns in the result
income_left_df = income_pdf_split['County'].str.split(" ", expand=True)
income_left_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
4,Adams,10801,,510636078,,9137991,Lucas,
5,Allen,47553,,2652560940,,53819629,Madison,
6,Ashland,24343,,1283000065,,24930240,Mahoning,
7,Ashtabula,42985,,2059532429,,38304615,Marion,
8,Athens,22539,,1167323875,,23709431,Medina,


In [6]:
# Labels for the split tokens that carry the left-hand county's figures
left_column_names = {
    0: "county_name",
    1: "num_of_returns",
    3: "fed_agi",
    5: "oh_income_tax_liability",
}

income_left_df = (
    income_left_df
    # tokens 2 and 4 are the empty strings produced by doubled spaces
    .drop(columns=[2, 4])
    # missing tokens become a single blank so later string joins do not yield NaN
    .replace(np.nan, ' ', regex=True)
    .rename(columns=left_column_names)
)

# the first half of the table, cleaned (columns 6 and 7 still hold the
# right-hand county name tokens)
income_left_df.head()

Unnamed: 0,county_name,num_of_returns,fed_agi,oh_income_tax_liability,6,7
4,Adams,10801,510636078,9137991,Lucas,
5,Allen,47553,2652560940,53819629,Madison,
6,Ashland,24343,1283000065,24930240,Mahoning,
7,Ashtabula,42985,2059532429,38304615,Marion,
8,Athens,22539,1167323875,23709431,Medina,


In [7]:
# drop the first column to get the second half of the dataframe
income_right_df = income_pdf_split.drop(["County"], axis=1)

# The right-hand county name is what follows the left-hand figures in the
# split text: token 6, plus token 7 when the name has two words.
# Join the tokens with a space and strip the padding. Plain concatenation
# glued two-word names together (e.g. "Van" + "Wert" -> "VanWert") and left a
# trailing blank on single-word names ("Lucas ").
income_right_df["county_name"] = (
    income_left_df[6]
    .str.cat(income_left_df[7], sep=" ", na_rep="")
    .str.strip()
)

# join "Dollar" and "Federal Adjusted Gross Income" and store it as "fed_agi"
income_right_df["fed_agi"] = (
    income_right_df["Dollar"] + income_right_df["Federal Adjusted Gross Income"]
)

# rename the columns to match the left half
income_right_df = income_right_df.rename(
    columns={
        "Number of Returns": "num_of_returns",
        "Ohio Income Tax Liability": "oh_income_tax_liability",
    }
)

# keep only rows with at least three non-missing values, i.e. discard the
# rows where the right half of the table is empty
income_right_df = income_right_df.dropna(thresh=3)

# the second half of the dataframe is cleaned as shown below
income_right_df.head()

Unnamed: 0,num_of_returns,Dollar,Federal Adjusted Gross Income,oh_income_tax_liability,county_name,fed_agi
4,192188,,11495307630,253783325,Lucas,11495307630
5,19265,,1239937113,26196454,Madison,1239937113
6,105957,,5954785375,121834737,Mahoning,5954785375
7,26974,,1322750746,24802115,Marion,1322750746
8,91606,,6924402532,163550630,Medina,6924402532


In [15]:
# choose the desired columns from the first and second half dataframes
income_columns = ["county_name", "num_of_returns", "fed_agi", "oh_income_tax_liability"]
numeric_columns = income_columns[1:]

income_top_df = income_left_df[income_columns].copy()
income_bottom_df = income_right_df[income_columns].copy()

# concat the two halves into one dataframe based on column names and reset the index
income_tax_df = pd.concat([income_top_df, income_bottom_df]).reset_index(drop=True)

# change datatype to numeric
income_tax_df[numeric_columns] = income_tax_df[numeric_columns].apply(pd.to_numeric)

# drop the "other", county-total and state-total rows.
# They are identified by label instead of by position: removing a fixed
# number of trailing rows also deletes a real county whenever fewer summary
# rows survive the earlier cleaning (the previous result stopped at "Wood").
# NOTE(review): assumes every summary label contains "other" or "total" and
# that no county name does - confirm against the PDF table.
is_summary_row = income_tax_df["county_name"].str.contains(
    r"other|total", case=False, na=False
)
income_tax_df = income_tax_df[~is_summary_row].reset_index(drop=True)

# the income_tax_df is ready to be loaded into a database
income_tax_df

Unnamed: 0,county_name,num_of_returns,fed_agi,oh_income_tax_liability
0,Adams,10801,5.106361e+08,9137991
1,Allen,47553,2.652561e+09,53819629
2,Ashland,24343,1.283000e+09,24930240
3,Ashtabula,42985,2.059532e+09,38304615
4,Athens,22539,1.167324e+09,23709431
...,...,...,...,...
83,Washington,27418,1.559358e+09,31323545
84,Wayne,52909,3.067494e+09,60785462
85,Williams,17991,9.102328e+08,17207810
86,Wood,61628,4.210164e+09,95595090


In [16]:
# Create database connection
# Connection settings come from environment variables so credentials do not
# have to be edited into (and committed with) the notebook. Each fallback
# reproduces the previous hard-coded local setup:
#   HOUSING_DB_USER, HOUSING_DB_PASSWORD, HOUSING_DB_HOST,
#   HOUSING_DB_PORT, HOUSING_DB_NAME
db_user = os.environ.get("HOUSING_DB_USER", "postgres")
db_password = os.environ.get("HOUSING_DB_PASSWORD", "postgres")
db_host = os.environ.get("HOUSING_DB_HOST", "localhost")
db_port = os.environ.get("HOUSING_DB_PORT", "5433")
db_name = os.environ.get("HOUSING_DB_NAME", "housing_db")

# URL-encode user and password so characters such as "@" or "/" cannot break
# the connection URL
rds_connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [17]:
# Confirm tables
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list the tables of the default schema.
inspect(engine).get_table_names()

['income_tax', 'census', 'schools', 'property_tax']

In [18]:
# Write the cleaned county rows into the existing income_tax table.
# The DataFrame index is not stored, and rows are appended to what is already there.
# NOTE(review): because of 'append', running this cell again inserts the same
# rows a second time - confirm whether the table should be emptied first.
income_tax_df.to_sql(
    name='income_tax',
    con=engine,
    if_exists='append',
    index=False,
)

In [19]:
# Read the income_tax table back and preview its first rows to verify the load
pd.read_sql_query(
    'select * from income_tax',
    con=engine,
).head()

Unnamed: 0,county_name,num_of_returns,fed_agi,oh_income_tax_liability
0,Adams,10801.0,510636100.0,9137991.0
1,Allen,47553.0,2652561000.0,53819629.0
2,Ashland,24343.0,1283000000.0,24930240.0
3,Ashtabula,42985.0,2059532000.0,38304615.0
4,Athens,22539.0,1167324000.0,23709431.0
