In [61]:
# OPERATING SYSTEM STUFF
import os
import io
import gc

# DATA SCIENCE
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# API STUFF
import xlrd
import requests

# SQL
from sqlalchemy import create_engine, text
from sqlalchemy.exc import ProgrammingError # ProgrammingError catches SQL write exceptions

# GEOCODING
from geopy.geocoders import GoogleV3

# CONFIGURATION FILES
import config
# Format floats with six digits after the decimal point when pandas displays them
pd.set_option('display.float_format', '{:.6f}'.format)

In [43]:
# START SQL DATABASE (disabled draft; the active database-setup cell appears later in this notebook)
# import os
# os.environ['SQLALCHEMY_WARN_20'] = '0'

# engine = create_engine('mysql+pymysql://root:rootpassword@db')

# with engine.connect() as connection:
#     connection.execute(text("CREATE DATABASE new_database;"))

# SHOW DATABASES
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW DATABASES;"))
#     databases = [row[0] for row in result]
#     print(databases)


In [44]:
# URL schema: one rolling-sales workbook per borough, all under the same folder.
# Order: Manhattan, Bronx, Brooklyn, Queens, Staten Island
_ROLLING_SALES_ROOT = 'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales'
_BOROUGH_SLUGS = ('manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland')

dataURLs = [f'{_ROLLING_SALES_ROOT}/rollingsales_{slug}.xlsx' for slug in _BOROUGH_SLUGS]

In [45]:
# PULL DATA FROM NYC WEBSITE
# One DataFrame per workbook URL, in the same order as dataURLs. The first
# 4 rows of each sheet are skipped (presumably preamble above the header
# row — verify against the workbooks).
data = [
    pd.read_excel(workbook_url, skiprows=4, engine="openpyxl")
    for workbook_url in dataURLs
]

In [46]:
# Borough code -> borough name lookup
borough_mapping = dict(enumerate(
    ['MANHATTAN', 'BRONX', 'BROOKLYN', 'QUEENS', 'STATEN ISLAND'],
    start=1,
))

# Stack the per-borough frames into one table with a fresh index, and rename
# the source 'BOROUGH' column to 'BOROUGH CODE' so the name is free for the
# borough-name column added below
combined = (
    pd.concat(data, ignore_index=True)
    .rename(columns={'BOROUGH': 'BOROUGH CODE'})
)

# Translate each code to its name, then place the new column at position 1
# (directly after 'BOROUGH CODE' when that is the first column)
borough = combined['BOROUGH CODE'].map(borough_mapping)
combined.insert(1, 'BOROUGH', borough)

In [47]:
# GENERATE MAPPING BETWEEN NYC DATA AND ZILLOW CATEGORIES.

def _leading_class_number(category):
    """Return the integer before the first space of a building class label."""
    return int(category.split(" ")[0])


# Distinct "BUILDING CLASS CATEGORY" labels, ordered by their leading number
sorted_building_classes = sorted(
    combined["BUILDING CLASS CATEGORY"].unique(),
    key=_leading_class_number,
)

# Building class groups that appear under more than one Zillow category
_ONE_OR_TWO_FAMILY = ['01 ONE FAMILY DWELLINGS',
                      '02 TWO FAMILY DWELLINGS']
_RENTAL_APARTMENTS = ['07 RENTALS - WALKUP APARTMENTS',
                      '08 RENTALS - ELEVATOR APARTMENTS']
_COOP_APARTMENTS = ['09 COOPS - WALKUP APARTMENTS',
                    '10 COOPS - ELEVATOR APARTMENTS']

# Zillow category -> list of NYC "BUILDING CLASS CATEGORY" labels.
# A building class may be listed under several Zillow categories.
mapping = {
    "Single-family home": ['01 ONE FAMILY DWELLINGS'],
    "Multi-family home": ['03 THREE FAMILY DWELLINGS',
                          *_RENTAL_APARTMENTS,
                          '14 RENTALS - 4-10 UNIT'],
    "Apartment": [*_RENTAL_APARTMENTS, *_COOP_APARTMENTS],
    "Condo": ['04 TAX CLASS 1 CONDOS',
              '12 CONDOS - WALKUP APARTMENTS',
              '13 CONDOS - ELEVATOR APARTMENTS',
              '15 CONDOS - 2-10 UNIT RESIDENTIAL',
              '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT'],
    "Co-op": [*_COOP_APARTMENTS, '17 CONDO COOPS'],
    "Duplex": ['02 TWO FAMILY DWELLINGS'],
    "Townhouse": list(_ONE_OR_TWO_FAMILY),
    "Brownstone": list(_ONE_OR_TWO_FAMILY),
    "Row house": list(_ONE_OR_TWO_FAMILY),
}

# Expand the dictionary into one (Zillow category, building class) pair per row
mapping_list = []
for zillow_category, building_classes in mapping.items():
    for building_class in building_classes:
        mapping_list.append((zillow_category, building_class))

mapping_df = pd.DataFrame(mapping_list, columns=['ZILLOW CATEGORY', 'BUILDING CLASS CATEGORY'])

In [62]:
# START GEOCODER SQL DATABASE
# NOTE(review): SQLAlchemy appears to read SQLALCHEMY_WARN_20 when the package
# is first imported, so setting it after the import cell may have no effect —
# confirm before relying on it.
os.environ['SQLALCHEMY_WARN_20'] = '0'

# The connection URL can be overridden through the GEOCODER_DB_URL environment
# variable so real credentials never need to be written into the notebook.
# The fallback is the original hard-coded URL, kept for backward compatibility.
engine = create_engine(
    os.environ.get('GEOCODER_DB_URL', 'mysql+pymysql://root:rootpassword@db')
)

# IF NOT EXISTS keeps this cell safe to re-run without catching (and silently
# discarding) every ProgrammingError: an already-existing database is a no-op,
# while genuine SQL errors now surface instead of being swallowed.
with engine.connect() as connection:
    connection.execute(text("CREATE DATABASE IF NOT EXISTS new_database;"))

In [None]:
# Let's try writing to this database and putting it on the docker volume.

In [None]:
# Run SHOW DATABASES and print the first column of every returned row
with engine.connect() as connection:
    result = connection.execute(text("SHOW DATABASES;"))
    databases = []
    for row in result:
        databases.append(row[0])
    print(databases)

In [13]:
# Copy of the location columns from `combined`, with LATITUDE / LONGITUDE
# columns added and initialised to None
geocodingTable = combined[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']].assign(
    LATITUDE=None,
    LONGITUDE=None,
)

In [14]:
# Create the geopy GoogleV3 geocoder, using the API key stored in the config module
google_api_key = config.GOOGLE_API_KEY
geolocator = GoogleV3(api_key=google_api_key)

# Geocoding helper built on the module-level `geolocator` object
def geolocate(address):
    """Look up `address` with the module-level `geolocator`.

    Returns a ``(latitude, longitude)`` tuple taken from the geocoder's
    result, or ``(None, None)`` when the geocoder returns a falsy result.
    """
    match = geolocator.geocode(address)
    if not match:
        return None, None
    return match.latitude, match.longitude

# Geocode only the rows labelled 0 through 4 (.loc slicing includes the end
# label, so this is five rows)
def _compose_address(row):
    """Join the row's values with ', ' and append ', New York City'."""
    return ', '.join(row) + ', New York City'


full_address = geocodingTable.loc[:4, ['ADDRESS', 'NEIGHBORHOOD', 'BOROUGH']].apply(_compose_address, axis=1)

# geolocate returns one (latitude, longitude) pair per address; split the pairs
# into two parallel sequences and write each into its own column
coordinate_pairs = full_address.apply(geolocate)
latitudes, longitudes = zip(*coordinate_pairs)
geocodingTable.loc[:4, 'LATITUDE'] = latitudes
geocodingTable.loc[:4, 'LONGITUDE'] = longitudes

In [41]:
# Display the geocoding table; in the output below only rows 0-4 have coordinates filled in
geocodingTable

Unnamed: 0,BOROUGH CODE,BOROUGH,NEIGHBORHOOD,ADDRESS,LATITUDE,LONGITUDE
0,1,MANHATTAN,ALPHABET CITY,347 EAST 4TH STREET,40.721665,-73.978312
1,1,MANHATTAN,ALPHABET CITY,19 AVENUE D,40.720675,-73.978498
2,1,MANHATTAN,ALPHABET CITY,110 AVENUE C,40.724210,-73.978491
3,1,MANHATTAN,ALPHABET CITY,326 EAST 4TH STREET,40.721688,-73.979215
4,1,MANHATTAN,ALPHABET CITY,328 EAST 4TH STREET,40.721631,-73.979227
...,...,...,...,...,...,...
84386,5,STATEN ISLAND,WOODROW,N/A LENEVAR AVENUE,,
84387,5,STATEN ISLAND,WOODROW,96 LENEVAR AVENUE,,
84388,5,STATEN ISLAND,WOODROW,N/A ENGLEWOOD AVENUE,,
84389,5,STATEN ISLAND,WOODROW,401 BLOOMINGDALE ROAD,,
