In [199]:
# OPERATING SYSTEM STUFF
import os
import io
import gc

# DATA SCIENCE
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# API STUFF
import xlrd
import requests

# SQL
from sqlalchemy import create_engine, text
from sqlalchemy.exc import ProgrammingError # ProgrammingError catches SQL write exceptions
from sqlalchemy import String, Integer, Float

# GEOCODING
from geopy.geocoders import GoogleV3

# CONFIGURATION FILES
import config
pd.set_option('display.float_format', '{:.6f}'.format)

In [200]:
os.environ['SQLALCHEMY_WARN_20'] = '0'
os.environ['SQLALCHEMY_SILENCE_UBER_WARNING'] = '1'

# Database params & credentials
username = 'root'
password = 'rootpassword'
hostname = 'db'
database_name = 'database_1'

# Create database connection
engine = create_engine(f'mysql+pymysql://{username}:{password}@{hostname}')

try:
    with engine.connect() as connection:
        connection.execute(text(f'CREATE DATABASE {database_name};'))
except ProgrammingError:
    pass

engine = create_engine(f'mysql+pymysql://{username}:{password}@{hostname}/{database_name}')

# Show databases
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW DATABASES;"))
#     databases = [row[0] for row in result]
#     print(databases)

In [201]:
# Generate mapping between NYC data and Zillow categories.

# Show all the building classes
# This won't work unless the DataFrame 'combined' has been set up
# sorted_building_classes = sorted(
#     combined["BUILDING CLASS CATEGORY"].unique(),
#     key=lambda x: int(x.split(" ")[0])
# )

# Define the mappings
mapping = {
    "Single-family home": ['01 ONE FAMILY DWELLINGS'],
    "Multi-family home": [
        '03 THREE FAMILY DWELLINGS',
        '07 RENTALS - WALKUP APARTMENTS',
        '08 RENTALS - ELEVATOR APARTMENTS',
        '14 RENTALS - 4-10 UNIT'
    ],
    "Apartment": [
        '07 RENTALS - WALKUP APARTMENTS',
        '08 RENTALS - ELEVATOR APARTMENTS',
        '09 COOPS - WALKUP APARTMENTS',
        '10 COOPS - ELEVATOR APARTMENTS'
    ],
    "Condo": [
        '04 TAX CLASS 1 CONDOS',
        '12 CONDOS - WALKUP APARTMENTS',
        '13 CONDOS - ELEVATOR APARTMENTS',
        '15 CONDOS - 2-10 UNIT RESIDENTIAL',
        '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT'
    ],
    "Co-op": [
        '09 COOPS - WALKUP APARTMENTS',
        '10 COOPS - ELEVATOR APARTMENTS',
        '17 CONDO COOPS'
    ],
    "Duplex": ['02 TWO FAMILY DWELLINGS'],
    "Townhouse": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS'
    ],
    "Brownstone": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS'
    ],
    "Row house": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS'
    ],
}

# Flatten the mapping dictionary to create a dataframe
mapping_list = [(k, v) for k, vals in mapping.items() for v in vals]
mapping_df = pd.DataFrame(
    mapping_list,
    columns=['ZILLOW CATEGORY', 'BUILDING CLASS CATEGORY']
)

In [202]:
mapping_df.to_sql('cat_map', con=engine, index=False, if_exists='replace')

#Show tables
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW TABLES;"))
#     tables = [row[0] for row in result]
#     print(tables)

#Show columns from table
with engine.connect() as connection:
    result = connection.execute(text("SHOW COLUMNS FROM geocodes;"))
    tables = [(row[0],row[1]) for row in result]
    print(tables)

[('BOROUGH CODE', 'int(11)'), ('BOROUGH', 'varchar(25)'), ('NEIGHBORHOOD', 'varchar(100)'), ('ADDRESS', 'varchar(255)'), ('LATITUDE', 'float'), ('LONGITUDE', 'float')]


In [203]:
# URL Schema
# [Manhattan, Bronx, Brooklyn, Queens, Staten Island]

dataURLs = [
    'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/'
    'rollingsales_manhattan.xlsx',
    'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/'
    'rollingsales_bronx.xlsx',
    'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/'
    'rollingsales_brooklyn.xlsx',
    'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/'
    'rollingsales_queens.xlsx',
    'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/'
    'rollingsales_statenisland.xlsx'
]
# Create an empty array that will hold our NYC Housing DataFrames
data = []

# Pull data from the NYC website
for url in dataURLs:
    # Read Excel file and skip the first 4 rows
    df = pd.read_excel(url, skiprows=4, engine="openpyxl")
    data.append(df)

In [204]:
# Combine the dataframes from the nyc housing website
combined = pd.concat(data, ignore_index=True)

# Add borough names

# Rename the 'BOROUGH' column to 'BOROUGH CODE'
combined = combined.rename(columns={'BOROUGH': 'BOROUGH CODE'})
# Define the mapping for borough codes to borough names
borough_mapping = {1: 'MANHATTAN', 2: 'BRONX', 3: 'BROOKLYN', 4: 'QUEENS', 5: 'STATEN ISLAND'}
# Create a new 'BOROUGH' column based on 'BOROUGH CODE'
borough = combined['BOROUGH CODE'].map(borough_mapping)
# Insert the new 'BOROUGH' column into the DataFrame right after the 'BOROUGH CODE' column
combined.insert(loc=1, column='BOROUGH', value=borough)

# Removing bad rows
combined = combined[~combined['ADDRESS'].str.contains('N/A')]

In [205]:
combined

Unnamed: 0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,1,MANHATTAN,ALPHABET CITY,01 ONE FAMILY DWELLINGS,1,374,46,,A4,347 EAST 4TH STREET,...,1.000000,0.000000,1.000000,2116.000000,4400.000000,1900.000000,1,A4,399000,2022-09-29
1,1,MANHATTAN,ALPHABET CITY,02 TWO FAMILY DWELLINGS,1,372,36,,S2,19 AVENUE D,...,2.000000,1.000000,3.000000,826.000000,2481.000000,1900.000000,1,S2,1,2023-01-20
2,1,MANHATTAN,ALPHABET CITY,02 TWO FAMILY DWELLINGS,1,377,1,,S2,110 AVENUE C,...,2.000000,1.000000,3.000000,1503.000000,2790.000000,1901.000000,1,S2,2999999,2022-09-15
3,1,MANHATTAN,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,373,16,,C1,326 EAST 4TH STREET,...,10.000000,0.000000,10.000000,2204.000000,8625.000000,1899.000000,2,C1,16800000,2022-08-04
4,1,MANHATTAN,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,373,17,,C1,328 EAST 4TH STREET,...,10.000000,0.000000,10.000000,2204.000000,8625.000000,1900.000000,2,C1,16800000,2022-08-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84382,5,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7351,11,,B2,40 HERRICK AVENUE,...,2.000000,0.000000,2.000000,4000.000000,2250.000000,2000.000000,1,B2,0,2022-07-21
84383,5,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,1,,B2,104 GLADWIN STREET,...,2.000000,0.000000,2.000000,4345.000000,2200.000000,2001.000000,1,B2,0,2023-01-27
84387,5,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,96 LENEVAR AVENUE,...,0.000000,0.000000,0.000000,2644.000000,0.000000,,1,V0,1116638,2022-12-19
84389,5,STATEN ISLAND,WOODROW,21 OFFICE BUILDINGS,4,7011,1,,O7,401 BLOOMINGDALE ROAD,...,0.000000,6.000000,6.000000,9683.000000,8395.000000,2000.000000,4,O7,1600000,2022-07-18


In [206]:
# Write the contents of `combined` to the `sales` SQL table...
combined.to_sql('sales', con=engine, index=False, if_exists='replace')

In [207]:
# CREATE GEOCODING TABLE & COPY TO SQL TABLE

# Table name in the SQL database
geocodes_sql_table = 'geocodes'

data_types_df = {
    'BOROUGH CODE': int,
    'BOROUGH': str,
    'NEIGHBORHOOD': str,
    'ADDRESS': str,
    'LATITUDE': float,
    'LONGITUDE': float
}

data_types_sqlalchemy = {
    'BOROUGH CODE': Integer,
    'BOROUGH': String(25),
    'NEIGHBORHOOD': String(100),
    'ADDRESS': String(255),
    'LATITUDE': Float,
    'LONGITUDE': Float
}

try:
    # Template Pandas DataFrame that we use to build the SQL database
    geocodes = pd.DataFrame(columns=data_types_df)
    # Create the table & confirm
    geocodes.to_sql(geocodes_sql_table, engine, index=False, dtype=data_types_sqlalchemy)
    print(f"Table '{geocodes_sql_table}' created in database '{database_name}'.")
except ValueError:
    print(f"Table '{geocodes_sql_table}' already exists in database '{database_name}'.")

Table 'geocodes' already exists in database 'database_1'.


In [208]:
# Create a table of geographic information from `combined`
geocodingTable = combined[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']].copy()
geocodingTable['LATITUDE'] = None
geocodingTable['LONGITUDE'] = None

In [209]:
query = f"SELECT * FROM {geocodes_sql_table}" 

# Execute the query and load the result into a DataFrame
df11 = pd.read_sql_query(query, engine)
df11 = geocodingTable[:-5].copy() # Fictional

In [210]:
mask = geocodingTable[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']].isin(
    df11[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']]
)
matching_rows = mask.all(axis=1)
missing_rows = geocodingTable[~matching_rows]

In [211]:
missing_rows

Unnamed: 0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,ADDRESS,LATITUDE,LONGITUDE
84382,5,STATEN ISLAND,WOODROW,40 HERRICK AVENUE,,
84383,5,STATEN ISLAND,WOODROW,104 GLADWIN STREET,,
84387,5,STATEN ISLAND,WOODROW,96 LENEVAR AVENUE,,
84389,5,STATEN ISLAND,WOODROW,401 BLOOMINGDALE ROAD,,
84390,5,STATEN ISLAND,WOODROW,3120 ARTHUR KILL ROAD,,


In [188]:
# Initialize a geopy Google API query object
geolocator = GoogleV3(api_key=config.GOOGLE_API_KEY)

# Define a geolocator function w/ the object
def geolocate(address):
    location = geolocator.geocode(address)
    if location:
        return location.latitude, location.longitude
    else:
        return None, None

# Apply geolocator function to all rows in missing_rows DataFrame
full_address = missing_rows[['ADDRESS', 'NEIGHBORHOOD', 'BOROUGH']].apply(lambda x: ', '.join(x) + ', New York City', axis=1)
missing_rows['LATITUDE'], missing_rows['LONGITUDE'] = zip(*full_address.apply(geolocate))
missing_rows

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,ADDRESS,LATITUDE,LONGITUDE
84386,5,STATEN ISLAND,WOODROW,N/A LENEVAR AVENUE,40.535148,-74.207975
84387,5,STATEN ISLAND,WOODROW,96 LENEVAR AVENUE,40.538871,-74.20938
84388,5,STATEN ISLAND,WOODROW,N/A ENGLEWOOD AVENUE,40.53248,-74.225382
84389,5,STATEN ISLAND,WOODROW,401 BLOOMINGDALE ROAD,40.53478,-74.217978
84390,5,STATEN ISLAND,WOODROW,3120 ARTHUR KILL ROAD,40.543765,-74.233477


In [1]:
combined

NameError: name 'combined' is not defined

In [12]:
mapping_df

Unnamed: 0,ZILLOW CATEGORY,BUILDING CLASS CATEGORY
0,Single-family home,01 ONE FAMILY DWELLINGS
1,Multi-family home,03 THREE FAMILY DWELLINGS
2,Multi-family home,07 RENTALS - WALKUP APARTMENTS
3,Multi-family home,08 RENTALS - ELEVATOR APARTMENTS
4,Multi-family home,14 RENTALS - 4-10 UNIT
5,Apartment,07 RENTALS - WALKUP APARTMENTS
6,Apartment,08 RENTALS - ELEVATOR APARTMENTS
7,Apartment,09 COOPS - WALKUP APARTMENTS
8,Apartment,10 COOPS - ELEVATOR APARTMENTS
9,Condo,04 TAX CLASS 1 CONDOS
