In [1]:
 # Dependencies
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
# `tabula` must come from the `tabula-py` distribution (pip install tabula-py);
# a different package that also installs a `tabula` module has no `read_pdf`.
import tabula  # read tables in PDFs
from sqlalchemy import create_engine, inspect

## Ohio Property Tax Information

In [2]:
# Real Property Tax, Tax Year 2019: taxable values, effective tax rates,
# taxes charged and property tax relief, by county (dollars in thousands).
# The table is read from two pages of the 2020 annual report PDF on tax.ohio.gov,
# one tabula call per page, in page order.
tax_ohio_url = "https://tax.ohio.gov/static/communications/publications/annual_reports/2020annualreport.pdf"
property_tax_a_num, property_tax_b_num = 184, 185
property_tax_a_table, property_tax_b_table = (
    tabula.read_pdf(tax_ohio_url, pages=page_number)
    for page_number in (property_tax_a_num, property_tax_b_num)
)

AttributeError: module 'tabula' has no attribute 'read_pdf'

In [3]:
# Show full cell contents: lift the display limit on column width
pd.options.display.max_colwidth = None

# The parsed result is a list of DataFrames; keep the first one
property_tax_a_df = property_tax_a_table[0]

In [4]:
# Clean the first parsed table and give it the final snake_case column names.

# remove dollar sign (raw string: '\$' in a plain literal is an invalid escape sequence)
property_tax_a_df = property_tax_a_df.replace(r'\$', '', regex=True)

# replace NaN values in the dataframe with a single space.
# The fill value must stay ' ': it becomes part of the combined header labels
# built below (e.g. "Unnamed: 0   County"), which the rename mapping matches on.
property_tax_a_df = property_tax_a_df.replace(np.nan, ' ', regex=True)

# the header is spread over the parsed column label and the first two rows;
# join the three pieces with spaces to form one label per column
property_tax_a_df.columns = (
    property_tax_a_df.columns
    + " " + property_tax_a_df.iloc[0]
    + " " + property_tax_a_df.iloc[1]
)

# drop the first two rows, which are now part of the column labels
property_tax_a_df = property_tax_a_df.drop([0, 1])

# combined header label -> final column name
# (the name `proterty_a_dict` is kept as-is because a later cell reuses it)
proterty_a_dict = {
    "Unnamed: 0   County": "county_name",
    "Unnamed: 1 Class 1 Taxable Value": "class_1_taxable_value",
    "Unnamed: 2 Class 2 Taxable Value": "class_2_taxable_value",
    "Unnamed: 3   Total Taxable Value": "total_taxable_value",
    "Class 1 Effective Rate": "class_1_effective_rate",
    "Class 2 Effective Rate": "class_2_effective_rate",
    "Unnamed: 5   Taxes Charged": "taxes_charged",
    "Unnamed: 7 Non-business credit": "non_business_credit",
    "Unnamed: 8 Owner-Occupied credit": "owner_occupied_credit",
    "Homestead Exemption Reduction": "exemption_reduction",
    "Unnamed: 10   Net Taxes Charged": "net_taxes_charged",
    "Net Effective Tax Rate": "net_effective_tax_rate",
}
# reassign instead of inplace=True so the step is explicit
property_tax_a_df = property_tax_a_df.rename(columns=proterty_a_dict)

# keep only the renamed columns, in mapping order, for the later concat
# (an explicit list is passed rather than a dict_values view)
property_tax_a_df = property_tax_a_df[list(proterty_a_dict.values())]
property_tax_a_df.head()

Unnamed: 0,county_name,class_1_taxable_value,class_2_taxable_value,total_taxable_value,class_1_effective_rate,class_2_effective_rate,taxes_charged,non_business_credit,owner_occupied_credit,exemption_reduction,net_taxes_charged,net_effective_tax_rate
2,Adams,355123,100650,"4 55,773",43.11,45.2,19859,1373,85,6 30,17771,38.99
3,Allen,1494981,434513,1929495,51.19,59.19,102245,7016,1149,2805,91275,47.31
4,Ashland,853241,167477,1020717,47.52,57.45,50172,3856,622,1509,44185,43.29
5,Ashtabula,1435239,316619,1751858,55.98,67.5,101717,6542,909,3837,90429,51.62
6,Athens,772410,228892,1001302,55.81,57.87,56356,3654,502,1497,50703,50.64


In [5]:
# Clean the second parsed table so it lines up with the first one.

# change display option on column width
pd.set_option('display.max_colwidth', None)
property_tax_b_df = property_tax_b_table[0]

# replace NaN values with empty strings
property_tax_b_df = property_tax_b_df.replace(np.nan, '', regex=True)

# The parsed column labels of this table are turned back into an ordinary row
# (row 0), and the 24 columns are numbered 1..24 so they can be picked by position.
tax_column_df = pd.DataFrame([property_tax_b_df.columns], columns=np.arange(1, 25, 1))
property_tax_b_df.columns = np.arange(1, 25, 1)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
property_tax_b_df = pd.concat([tax_column_df, property_tax_b_df], ignore_index=True)

# select the 12 wanted columns and give them the same names as the first table
property_tax_b_df = property_tax_b_df[[1, 4, 6, 8, 9, 10, 12, 14, 17, 21, 23, 24]]
property_tax_b_df.columns = list(proterty_a_dict.values())

# drop the unwanted rows (index labels 43 and 44)
property_tax_b_df = property_tax_b_df.drop([43, 44])
property_tax_b_df.head()

Unnamed: 0,county_name,class_1_taxable_value,class_2_taxable_value,total_taxable_value,class_1_effective_rate,class_2_effective_rate,taxes_charged,non_business_credit,owner_occupied_credit,exemption_reduction,net_taxes_charged,net_effective_tax_rate
0,Logan,1088303,221461,1309764,46.65,51.74,62225,4301,4 89,1132,56303,42.99
1,Lorain,5849903,1293943,7143846,62.58,64.69,449791,33288,6360,10970,399172,55.88
2,Lucas,5527264,1983451,7510715,78.87,94.04,622481,38074,7649,17116,559642,74.51
3,Madison,983346,138627,1121972,44.79,55.49,51740,3772,5 61,901,46506,41.45
4,Mahoning,2992463,944980,3937443,66.94,74.97,271147,17378,3405,10712,239653,60.87


In [6]:
# Combine both halves of the table and convert the value columns to numbers.

# concat two tables into one
property_tax_df = pd.concat([property_tax_a_df, property_tax_b_df]).reset_index(drop=True)

# remove commas everywhere (raw string: '\,' in a plain literal is an invalid escape sequence)
property_tax_df = property_tax_df.replace(r'\,', '', regex=True)

# columns holding numeric values (everything except county_name); listed once and reused
numeric_columns = [
    'class_1_taxable_value', 'class_2_taxable_value', 'total_taxable_value',
    'class_1_effective_rate', 'class_2_effective_rate', 'taxes_charged',
    'non_business_credit', 'owner_occupied_credit', 'exemption_reduction',
    'net_taxes_charged', 'net_effective_tax_rate',
]

# remove spaces in between digits (the parsed text contains values such as "4 55,773")
property_tax_df[numeric_columns] = property_tax_df[numeric_columns].replace(r"\ ", "", regex=True)

# change values to numeric datatypes
property_tax_df[numeric_columns] = property_tax_df[numeric_columns].apply(pd.to_numeric)

# output the final dataframe to be loaded into the SQL database
property_tax_df

Unnamed: 0,county_name,class_1_taxable_value,class_2_taxable_value,total_taxable_value,class_1_effective_rate,class_2_effective_rate,taxes_charged,non_business_credit,owner_occupied_credit,exemption_reduction,net_taxes_charged,net_effective_tax_rate
0,Adams,355123,100650,455773,43.11,45.20,19859,1373,85,630,17771,38.99
1,Allen,1494981,434513,1929495,51.19,59.19,102245,7016,1149,2805,91275,47.31
2,Ashland,853241,167477,1020717,47.52,57.45,50172,3856,622,1509,44185,43.29
3,Ashtabula,1435239,316619,1751858,55.98,67.50,101717,6542,909,3837,90429,51.62
4,Athens,772410,228892,1001302,55.81,57.87,56356,3654,502,1497,50703,50.64
...,...,...,...,...,...,...,...,...,...,...,...,...
83,Washington,1010659,253105,1263764,41.54,48.08,54154,3759,561,1689,48144,38.10
84,Wayne,1955533,457980,2413513,51.16,68.34,131348,9052,1466,3041,117790,48.80
85,Williams,633158,131285,764443,51.62,61.69,40784,2901,410,1222,36250,47.42
86,Wood,2429647,727720,3157368,62.22,75.43,206067,12536,2057,3630,187844,59.49


In [7]:
# validate datatypes: display the dtype of every column of the combined frame
# so the result of the numeric conversion can be checked by eye
property_tax_df.dtypes

county_name                object
class_1_taxable_value       int64
class_2_taxable_value       int64
total_taxable_value         int64
class_1_effective_rate    float64
class_2_effective_rate    float64
taxes_charged               int64
non_business_credit         int64
owner_occupied_credit       int64
exemption_reduction         int64
net_taxes_charged           int64
net_effective_tax_rate    float64
dtype: object

In [8]:
# Create database connection
# `engine` is used by the cells below, so it has to be defined here for the
# notebook to run top to bottom. Connection settings are read from environment
# variables so that credentials are never stored in the notebook:
#   HOUSING_DB_USER, HOUSING_DB_PASSWORD, HOUSING_DB_HOST, HOUSING_DB_PORT, HOUSING_DB_NAME
# Change them based on your local situation; the defaults below are the
# user / host / port / database from the previously commented-out local setup.
db_user = os.environ.get("HOUSING_DB_USER", "postgres")
db_password = os.environ.get("HOUSING_DB_PASSWORD")
db_host = os.environ.get("HOUSING_DB_HOST", "localhost")
db_port = os.environ.get("HOUSING_DB_PORT", "5433")
db_name = os.environ.get("HOUSING_DB_NAME", "housing_db")

# URL-quote the password so characters such as '@' or ':' cannot break the URL;
# leave the password part out entirely when none is configured.
db_auth = db_user if db_password is None else f"{db_user}:{quote_plus(db_password)}"
engine = create_engine(f"postgresql://{db_auth}@{db_host}:{db_port}/{db_name}")

In [9]:
# Confirm tables: list the table names visible through the engine.
# Uses the inspector API because Engine.table_names() is deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
inspect(engine).get_table_names()

  engine.table_names()


['property_tax']

In [10]:
# Write the combined dataframe into the `property_tax` table.
# Rows are appended to the existing table and the dataframe index is not written.
property_tax_df.to_sql(
    "property_tax",
    engine,
    if_exists="append",
    index=False,
)

In [11]:
# Confirm data has been added by querying the property_tax table
# and showing its first rows
pd.read_sql_query(
    sql='select * from property_tax',
    con=engine,
).head()

Unnamed: 0,county_name,class_1_taxable_value,class_2_taxable_value,total_taxable_value,class_1_effective_rate,class_2_effective_rate,taxes_charged,non_business_credit,owner_occupied_credit,exemption_reduction,net_taxes_charged,net_effective_tax_rate
0,Adams,355123.0,100650.0,455773.0,43.11,45.2,19859.0,1373.0,85.0,630.0,17771.0,38.99
1,Allen,1494981.0,434513.0,1929495.0,51.19,59.19,102245.0,7016.0,1149.0,2805.0,91275.0,47.31
2,Ashland,853241.0,167477.0,1020717.0,47.52,57.45,50172.0,3856.0,622.0,1509.0,44185.0,43.29
3,Ashtabula,1435239.0,316619.0,1751858.0,55.98,67.5,101717.0,6542.0,909.0,3837.0,90429.0,51.62
4,Athens,772410.0,228892.0,1001302.0,55.81,57.87,56356.0,3654.0,502.0,1497.0,50703.0,50.64
