In [None]:
import pandas as pd
import os
# Directory holding the previously scraped OARSYSTEM data (read later in the notebook).
OLD_PATH = "/data/gentrification/buffalo_oarsystem_data"
# Working directory for every artifact this notebook reads or produces.
DATA_DIR = "/data/gentrification/LATEST"

# Open Data Buffalo export (CSV download endpoint), loaded into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of chunk-by-chunk.
OPEN_DATA_URL = 'https://data.buffalony.gov/api/views/kckn-jafw/rows.csv?accessType=DOWNLOAD'
open_data_df = pd.read_csv(OPEN_DATA_URL, low_memory=False)

In [None]:
# Execute the helper notebook in this kernel so that whatever it defines becomes
# available here. NOTE(review): the helper functions called in later cells are
# presumably defined there -- they are not defined anywhere in this notebook.
%run util_datacollection.ipynb

# 1. Find all SBLs (unique parcel identifiers) in the Erie County data

In [None]:
import string

# Directory where the per-SBL owner-history HTML pages are stored.
ALL_SBL_FILE = os.path.join(DATA_DIR, "owner_history_htmls")
# Single-character values to iterate over: the digits 0-9 followed by A-Z.
possibilities = list(string.digits + string.ascii_uppercase)

# The directory must be checked BEFORE it is listed: os.listdir raises
# FileNotFoundError on a missing path, which previously made the scraping
# branch below unreachable on a first run.
if os.path.exists(ALL_SBL_FILE):
    lists_of_sbls_htmls = os.listdir(ALL_SBL_FILE)
    # Recover each SBL from its file name: keep the text before the first '.',
    # then map '#' back to '.' and '+' back to '/'.
    all_sbls = [
        x.strip().split('.')[0].replace('#', '.').replace('+', '/')
        for x in lists_of_sbls_htmls
    ]
    print("ALREADY RAN THIS STEP!")
else:
    # Nothing on disk yet; keep the name defined so later cells can rely on it.
    lists_of_sbls_htmls = []
    all_sbls = []
    for letter in possibilities:
        # Per the original author's note, grab_source creates the directory and
        # saves the HTML files into it; its return value is accumulated as SBLs.
        all_sbls += grab_source(letter, path=DATA_DIR)
   

# 2. Download and compile all owner pages

In [None]:
csv_file_name = 'owner_history.csv'
csv_owner_history_file_path = os.path.join(DATA_DIR, csv_file_name)

if os.path.exists(csv_owner_history_file_path):
    # The compiled CSV is already on disk, so the conversion is skipped.
    print('Owner page history has been made')
else:
    # Convert the downloaded HTML pages to CSV. Per the original author's note,
    # the returned list holds the SBLs that gave a server error 500.
    list_of_failed_sbls = html_csv_converter(ALL_SBL_FILE, csv_file_name)

In [None]:
# Load the compiled owner history, supplying the column names explicitly.
owner_history_df = pd.read_csv(
    csv_owner_history_file_path,
    names=[
        'Owner',
        'Owner Name',
        'Book-Page/Date',
        'Book-Page/Date identification',
        'SBL',
    ],
)
# Clean the frame in place. Per the original author's note, noisy or corrupted
# entries (e.g. NaN) are replaced with information from the Open Data Buffalo frame.
owner_history_error_reformat(owner_history_df, open_data_df, inplace=True)

# 3. Scrape the considerations page using the Book-Page column from the owner history 

## The link can be found at http://ecclerk.erie.gov/or_wb1/

In [None]:
ALL_CONSIDERATIONS_FILE = os.path.join(DATA_DIR, "considerations_htmls")

if os.path.exists(ALL_CONSIDERATIONS_FILE):
    # The output directory is already present, so the scrape is skipped.
    print("CONSIDERATIONS HAVE BEEN SCRAPED")
else:
    # Per the original author's note, this creates a new directory called
    # considerations_htmls and stores the scraped considerations pages in it.
    scrape_considerations_page(owner_history_df, DATA_DIR)


# 4. Concatenate these into a single CSV

In [None]:
file_name = 'complete_search_parcel.csv'
path_joined_concatenated = os.path.join(DATA_DIR, file_name)

# Build the combined CSV only when it is not already on disk.
combined_csv_missing = not os.path.exists(path_joined_concatenated)
if combined_csv_missing:
    # Produces "complete_search_parcel.csv"; the return value is kept as the
    # list of failed deeds.
    list_of_failed_deeds = combine_considerations_owner_history(DATA_DIR, file_name)

print("CONSIDERATIONS AND OWNER HISTORY CSVS HAVE BEEN CONCATENATED")
complete_dataframe = pd.read_csv(path_joined_concatenated)
complete_dataframe

# 5. Link to location data with Open Data Buffalo



In [None]:
# Combine the concatenated parcel records with the Open Data Buffalo frame;
# the result is expected to carry a 'Location' column (read in the next cell).
locations_link = link_locations(complete_dataframe,open_data_df)
# Bare last expression so the notebook renders the resulting frame.
locations_link

# 6. Fix location strings using OARSYSTEM data

In [None]:
# Report how many rows still have no location before the fix is applied.
print(f"Number of Locations we are missing prior: {locations_link['Location'].isna().sum()}")

# Previously scraped OARSYSTEM tax data; the aim is to take ADDRESS from it.
oarsystem_data_path = os.path.join(OLD_PATH, 'buffalo_oarsystem_CSVs/combined_tax_data.csv')
oarsystem_df = pd.read_csv(oarsystem_data_path)
# Rename the key column to lower-case 'sbl'.
oarsystem_df = oarsystem_df.rename(columns={'SBL': 'sbl'})
oarsystem_df

# The address is taken from OARSYSTEM because the real property tax parcel data does not include information such as the ZIP code of an SBL.

In [None]:
# Fill in the missing locations, using the data under OLD_PATH.
fixed_dataframe = batch_geoencode_missing_location(locations_link, OLD_PATH)

print('-------------------------------------------------------------------------')
# Count of rows that still have no location after the fix.
print(f"Number of missing locations now.. : {fixed_dataframe['Location'].isna().sum()}")
fixed_dataframe

In [None]:
# Two-stage post-processing: fix_dates runs first, then its result is passed
# to add_multiple_doc_types_column.
new_dataframe = add_multiple_doc_types_column(
    fix_dates(fixed_dataframe)
)

# 7. Filter out homes from apartments 

In [None]:

# Cache file for the filtered result; note it is relative to the current
# working directory, not DATA_DIR.
filtered_data_path = 'filtered_data.csv'

if not os.path.exists(filtered_data_path):
    # No cache yet: run the filter and persist the filtered frame for next time.
    filtered_datav, failed_files, anomolies = filter_apartments(new_dataframe, DATA_DIR)
    filtered_datav.to_csv(filtered_data_path, index=False)
else:
    # Cache hit: only the filtered frame is restored; failed_files and
    # anomolies are not defined on this path.
    print('filtered data already exists')
    filtered_datav = pd.read_csv(filtered_data_path)