In [108]:
# HELPERS
import importlib
importlib.reload(helpers)

# OPERATING SYSTEM STUFF
import os
import io
import gc

# DATA SCIENCE
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# API STUFF
import xlrd
import requests
import json

# SQL
from sqlalchemy import create_engine, text, String, Integer, Float, Boolean, MetaData, Table, select
from sqlalchemy.exc import ProgrammingError # ProgrammingError catches SQL write exceptions
from sqlalchemy.sql import and_

# GEOCODING
from geopy.geocoders import GoogleV3

# CONFIGURATION FILES
import config
pd.set_option('display.float_format', '{:.6f}'.format)

# OTHER
from tqdm.notebook import tqdm

In [109]:
# FUN SQLALCHEMY CODE

# Show databases
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW DATABASES;"))
#     databases = [row[0] for row in result]
#     print(databases)

# Show tables
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW TABLES;"))
#     tables = [row[0] for row in result]
#     print(tables)

# Show all the building classes
# This won't work unless the DataFrame 'combined' has been set up
# sorted_building_classes = sorted(
#     combined["BUILDING CLASS CATEGORY"].unique(),
#     key=lambda x: int(x.split(" ")[0])
# )

# Show columns from table
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW COLUMNS FROM geocodes;"))
#     tables = [(row[0],row[1]) for row in result]
#     print(tables)

# Show data from table
# with engine.connect() as connection:
#     result = connection.execute(text("SELECT * FROM geocodes LIMIT 1;"))
#     tables = [(row[0],row[1],row[2],row[3],row[4],row[5],row[6],
#                row[7]
#               ) for row in result]
#     print(tables)

# Verify primary keys
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW KEYS FROM geocodes;"))
#     primary_key_column = result.fetchone()
#     if primary_key_column:
#         print("Primary key column:", primary_key_column['Column_name'])
#     else:
#         print("No primary key defined for the table.")

In [110]:
# CREATE DATABASE CONNECTIONS

# Silence random errors
os.environ['SQLALCHEMY_WARN_20'] = '0'
os.environ['SQLALCHEMY_SILENCE_UBER_WARNING'] = '1'

# Database params & credentials
username = 'root'
password = 'rootpassword'
hostname = 'db'
database_name = 'database_1'

# Create database connection
engine = create_engine(f'mysql+pymysql://{username}:{password}@{hostname}')

# Create database and tables
try:
    with engine.connect() as connection:
        connection.execute(text(f'CREATE DATABASE {database_name};'))
except ProgrammingError:
    pass

# Reset connection to connect to specific database
engine = create_engine(f'mysql+pymysql://{username}:{password}@{hostname}/{database_name}')

In [111]:
# DATABASE RESETS

# This resets the 'geocodes' table from a .csv in the folder 'project'
# DataFrame: 'geocodes_reset_df'
# SQL Table: 'geocodes'
# geocodes_reset_df = pd.read_csv('geocodes_export_backup.csv')
# geocodes_reset_df['PRIMARY_KEY'] = geocodes_reset_df['BOROUGH'] + '_' + geocodes_reset_df['ADDRESS']
# geocodes_reset_df.to_sql('geocodes', con=engine, index=False, if_exists='replace')

# This resets the 'geocodes' tabke from a .csv in the folder 'project'
# while dropping the last 10 entries...
# DataFrame: 'geocodes_reset_df'
# SQL Table: 'geocodes'
geocodes_reset_df = pd.read_csv('geocodes_export_backup.csv')
geocodes_reset_df.drop(geocodes_reset_df.tail(10).index, inplace=True)
# geocodes_reset_df

In [112]:
geocodes_reset_df.to_sql(geocodes_sql_table_name,
                             con=engine,
                             index=False,
                             if_exists='replace')

In [113]:
# CREATE DATABASES

# -------------- CREATE `geocodes` TABLE ---------------
geocodes_sql_table_name = 'geocodes'

with engine.connect() as connection:
    # Uses the reset data
    geocodes_reset_df.to_sql(geocodes_sql_table_name,
                             con=engine,
                             index=False,
                             if_exists='replace')

    try: # This will only work if there is not already a column called 'PRIMARY_KEY'
        connection.execute( # Set primary key
            text(
                f'ALTER TABLE {geocodes_sql_table_name} ADD COLUMN PRIMARY_KEY VARCHAR(255)'
            )
        )
        print(f"Column PRIMARY_KEY created in database '{database_name}'.")
    except:
        print(f"Column PRIMARY_KEY already exists in database '{database_name}'.")
    
    try:
        connection.execute( # Set the values of the primary keys
            text(
                f'UPDATE {geocodes_sql_table_name} SET PRIMARY_KEY = CONCAT(`BOROUGH`, \'_\', `ADDRESS`)'
            )
        )
        print(f"PRIMARY_KEY column values set in database '{database_name}'.")
    except:
        print(f"PRIMARY_KEY column values set error in database '{database_name}'.") 


# 2.) -------------- CREATE `cat_map` TABLE ---------------
mapping_list = [(k, v) for k, vals in helpers.mapping.items() for v in vals]
mapping_df = pd.DataFrame(
    mapping_list,
    columns=['ZILLOW CATEGORY', 'BUILDING CLASS CATEGORY']
)

# Put the map into a SQL table. Why? Not sure. Might need it later!
mapping_df.to_sql('cat_map', con=engine, index=False, if_exists='replace')

Column PRIMARY_KEY created in database 'database_1'.
PRIMARY_KEY column values set in database 'database_1'.


In [114]:
# Create an empty array that will hold our NYC Housing DataFrames
data = []

# Pull data from the NYC website
for url in helpers.dataURLs:
    # Read Excel file and skip the first 4 rows
    df = pd.read_excel(url, skiprows=4, engine="openpyxl")
    data.append(df)

In [115]:
# Combine the dataframes from the nyc housing website
combined = pd.concat(data, ignore_index=True)

# Rename the 'BOROUGH' column to 'BOROUGH CODE'
combined = combined.rename(columns={'BOROUGH': 'BOROUGH CODE'})

# Define the mapping for borough codes to borough names
borough_mapping = {1: 'MANHATTAN', 2: 'BRONX', 3: 'BROOKLYN', 4: 'QUEENS', 5: 'STATEN ISLAND'}

# Create a new 'BOROUGH' column based on 'BOROUGH CODE'
borough = combined['BOROUGH CODE'].map(borough_mapping)

# Insert the new 'BOROUGH' column into the DataFrame right after the 'BOROUGH CODE' column
combined.insert(loc=1, column='BOROUGH', value=borough)

# Remove rows that contain the string 'N/A'
combined = combined[~combined['ADDRESS'].str.contains('N/A')]

In [116]:
# Write the contents of `combined` to the `sales` SQL table...
combined.to_sql('sales', con=engine, index=False, if_exists='replace')

In [117]:
# CREATE GEOCODING TABLE & COPY TO SQL TABLE
# Template Pandas DataFrame that we use to build the SQL database
# using the 'geocoding_data_types_df' data structure we defined above
geocodes = pd.DataFrame(columns=helpers.geocoding_data_types_df)

In [118]:
# Create a table of the geographic information from `combined`
geocodingTable = combined[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']].copy()
geocodingTable['LATITUDE'] = None
geocodingTable['LONGITUDE'] = None
geocodingTable['GEOCODING ERR'] = False
geocodingTable['PRIMARY_KEY'] = geocodingTable['BOROUGH'] + '_' + geocodingTable['ADDRESS']

In [119]:
# Load 'geocodes' SQL table into a DataFrame

geocodes_table_response = pd.read_sql_query(f"SELECT * FROM {geocodes_sql_table_name}", engine)


In [124]:
# FIND rows in NYC not in our 'geocodes table
missing_rows = geocodingTable[~geocodingTable['PRIMARY_KEY'].isin(geocodes_table_response['PRIMARY_KEY'])]

In [125]:
# Geocode the rows missing from the SQL table `geocodes`
tqdm.pandas()
missing_rows = missing_rows.progress_apply(helpers.geolocate, axis=1)

  0%|          | 0/9 [00:00<?, ?it/s]

In [128]:
# Set the index on the dataframe so that we ensure we don't have duplicates..
missing_rows.set_index('PRIMARY_KEY', inplace=True)

KeyError: "None of ['PRIMARY_KEY'] are in the columns"

In [130]:
# Add the missing rows back to the SQL table
missing_rows.to_sql('geocodes', con=engine, if_exists='append', index=True)

In [129]:
missing_rows

Unnamed: 0_level_0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,ADDRESS,LATITUDE,LONGITUDE,GEOCODING ERR
PRIMARY_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
STATEN ISLAND_59 PHEASANT LANE,5,STATEN ISLAND,WOODROW,59 PHEASANT LANE,40.532981,-74.222449,False
STATEN ISLAND_60 PHEASANT LANE,5,STATEN ISLAND,WOODROW,60 PHEASANT LANE,40.532613,-74.222587,False
STATEN ISLAND_377 ENGLEWOOD AVENUE,5,STATEN ISLAND,WOODROW,377 ENGLEWOOD AVENUE,40.532316,-74.222781,False
STATEN ISLAND_40 HERRICK AVENUE,5,STATEN ISLAND,WOODROW,40 HERRICK AVENUE,40.530672,-74.219239,False
STATEN ISLAND_40 HERRICK AVENUE,5,STATEN ISLAND,WOODROW,40 HERRICK AVENUE,40.530672,-74.219239,False
STATEN ISLAND_104 GLADWIN STREET,5,STATEN ISLAND,WOODROW,104 GLADWIN STREET,40.531902,-74.222495,False
STATEN ISLAND_96 LENEVAR AVENUE,5,STATEN ISLAND,WOODROW,96 LENEVAR AVENUE,40.538871,-74.20938,False
STATEN ISLAND_401 BLOOMINGDALE ROAD,5,STATEN ISLAND,WOODROW,401 BLOOMINGDALE ROAD,40.53478,-74.217978,False
STATEN ISLAND_3120 ARTHUR KILL ROAD,5,STATEN ISLAND,WOODROW,3120 ARTHUR KILL ROAD,40.543765,-74.233477,False


In [131]:
# Now we can test to see if our append worked
geocodes_table_response = pd.read_sql_query(f"SELECT * FROM {geocodes_sql_table_name}", engine)
missing_rows = geocodingTable[~geocodingTable['PRIMARY_KEY'].isin(geocodes_table_response['PRIMARY_KEY'])]
missing_rows

Unnamed: 0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,ADDRESS,LATITUDE,LONGITUDE,GEOCODING ERR,PRIMARY_KEY


In [None]:
# YES! IT DID!

In [None]:
# DANGEROUS! HIGH API USAGE!

# tqdm.pandas()
# missing_rows = missing_rows.progress_apply(helpers.geolocate, axis=1)
# missing_rows.to_csv('geocodes_export.csv', index=False)

In [None]:
# Create the SQLAlchemy engine
engine = create_engine(f'mysql+pymysql://{username}:{password}@{hostname}/{database_name}')
# Call the function to print the SQL table
helpers.print_sql_table(engine, 'geocodes')

In [None]:
missing_rows.to_csv('geocodes_export_next.csv', index=False)

In [None]:
# Outdated
# with engine.connect() as connection:
#     connection.execute(text("ALTER TABLE geocodes ADD PRIMARY_KEY VARCHAR(255)"))
#     connection.execute(text("UPDATE geocodes SET PRIMARY_KEY = CONCAT(`BOROUGH CODE`, '_', `BOROUGH`, '_', `ADDRESS`)"))


In [None]:
text(f'UPDATE {geocodes_sql_table_name} SET PRIMARY_KEY = CONCAT([BOROUGH CODE], '_', [BOROUGH], '_', [ADDRESS])')


In [66]:
with engine.connect() as connection:
    result = connection.execute(text("SELECT * FROM geocodes LIMIT 2;"))
    tables = [(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7]) for row in result]
    print(tables)

[(1, 'MANHATTAN', 'ALPHABET CITY', '347 EAST 4TH STREET', 40.7216651, -73.97831219999998, 0, 'MANHATTAN_347 EAST 4TH STREET'), (1, 'MANHATTAN', 'ALPHABET CITY', '19 AVENUE D', 40.7206751, -73.97849839999998, 0, 'MANHATTAN_19 AVENUE D')]
