In [18]:
# OPERATING SYSTEM STUFF
import io
import gc

# CONFIGURATION FILES
import config

# DATA SCIENCE
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# API STUFF
import xlrd
import requests

# SQL
from sqlalchemy import create_engine, text

# GEOCODING
from geopy.geocoders import GoogleV3

In [3]:
# START SQL DATABASE
# import os
# os.environ['SQLALCHEMY_WARN_20'] = '0'

# engine = create_engine('mysql+pymysql://root:rootpassword@db')

# with engine.connect() as connection:
#     connection.execute(text("CREATE DATABASE new_database;"))

# SHOW DATABASES
# with engine.connect() as connection:
#     result = connection.execute(text("SHOW DATABASES;"))
#     databases = [row[0] for row in result]
#     print(databases)


In [5]:
# URL Schema
# One rolling-sales workbook per borough; the file-name slug is the only part
# of the URL that varies. Order of the resulting list:
# [Manhattan, Bronx, Brooklyn, Queens, Staten Island]

dataURLs = [
    f'https://www.nyc.gov/assets/finance/downloads/pdf/rolling_sales/rollingsales_{slug}.xlsx'
    for slug in ('manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland')
]

In [6]:
# PULL DATA FROM NYC WEBSITE
# Download every workbook listed in dataURLs and parse it into a DataFrame,
# keeping the frames in the same order as the URLs. The first 4 rows of each
# sheet are skipped (presumably preamble above the header row — confirm
# against the workbook), and openpyxl is used as the Excel reader.
data = [
    pd.read_excel(url, skiprows=4, engine="openpyxl")
    for url in dataURLs
]

In [23]:
# Lookup table from borough code to borough name
borough_mapping = {
    1: 'MANHATTAN',
    2: 'BRONX',
    3: 'BROOKLYN',
    4: 'QUEENS',
    5: 'STATEN ISLAND',
}

# Stack the per-borough frames into a single table with a fresh 0..n-1 index,
# then free up the 'BOROUGH' name by relabelling the source column as
# 'BOROUGH CODE'.
combined = (
    pd.concat(data, ignore_index=True)
    .rename(columns={'BOROUGH': 'BOROUGH CODE'})
)

# Translate each code to its name; codes missing from the table become NaN
borough = combined['BOROUGH CODE'].map(borough_mapping)

# Place the names at column position 1
# (assumes 'BOROUGH CODE' is the first column — TODO confirm)
combined.insert(1, 'BOROUGH', borough)

In [9]:
# GENERATE MAPPING BETWEEN NYC DATA AND ZILLOW CATEGORIES.

# Distinct labels found in the "BUILDING CLASS CATEGORY" column
unique_building_classes = combined["BUILDING CLASS CATEGORY"].unique()

# Order a copy of those labels by the integer value of the text that precedes
# the first space in each label (e.g. the "01" in "01 ONE FAMILY DWELLINGS")
sorted_building_classes = list(unique_building_classes)
sorted_building_classes.sort(key=lambda label: int(label.partition(" ")[0]))

# Zillow category -> NYC "BUILDING CLASS CATEGORY" labels that fall under it.
# The relation is many-to-many: one NYC label may be listed under several
# Zillow categories.
mapping = {
    "Single-family home": [
        '01 ONE FAMILY DWELLINGS',
    ],
    "Multi-family home": [
        '03 THREE FAMILY DWELLINGS',
        '07 RENTALS - WALKUP APARTMENTS',
        '08 RENTALS - ELEVATOR APARTMENTS',
        '14 RENTALS - 4-10 UNIT',
    ],
    "Apartment": [
        '07 RENTALS - WALKUP APARTMENTS',
        '08 RENTALS - ELEVATOR APARTMENTS',
        '09 COOPS - WALKUP APARTMENTS',
        '10 COOPS - ELEVATOR APARTMENTS',
    ],
    "Condo": [
        '04 TAX CLASS 1 CONDOS',
        '12 CONDOS - WALKUP APARTMENTS',
        '13 CONDOS - ELEVATOR APARTMENTS',
        '15 CONDOS - 2-10 UNIT RESIDENTIAL',
        '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT',
    ],
    "Co-op": [
        '09 COOPS - WALKUP APARTMENTS',
        '10 COOPS - ELEVATOR APARTMENTS',
        '17 CONDO COOPS',
    ],
    "Duplex": [
        '02 TWO FAMILY DWELLINGS',
    ],
    "Townhouse": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS',
    ],
    "Brownstone": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS',
    ],
    "Row house": [
        '01 ONE FAMILY DWELLINGS',
        '02 TWO FAMILY DWELLINGS',
    ],
}

# Expand the dict into one (zillow category, NYC label) pair per row,
# preserving dict order and the order of labels within each list
mapping_list = [
    (zillow_category, nyc_category)
    for zillow_category, nyc_categories in mapping.items()
    for nyc_category in nyc_categories
]
mapping_df = pd.DataFrame(
    mapping_list,
    columns=['ZILLOW CATEGORY', 'BUILDING CLASS CATEGORY'],
)

In [20]:
# Google geocoding client (geopy's GoogleV3). The API key is read from the
# imported `config` module rather than being written into the notebook.
geolocator = GoogleV3(api_key=config.GOOGLE_API_KEY)

def geolocate(address):
    """Geocode a free-text address into a ``(latitude, longitude)`` pair.

    Parameters
    ----------
    address : str
        Address text handed to ``geolocator.geocode``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the location the geocoder
        returns, or ``(None, None)`` when the address is missing
        (``None`` / NaN / ``pd.NA``), blank, or the geocoder finds no match.

    Notes
    -----
    Exceptions raised by ``geolocator.geocode`` are not caught here and
    propagate to the caller.
    """
    # Missing or blank addresses cannot be geocoded. Return early so that no
    # request is made for them (e.g. empty cells when this is applied to a
    # whole DataFrame column).
    if isinstance(address, str):
        if not address.strip():
            return None, None
    elif address is None or pd.isna(address):
        return None, None

    location = geolocator.geocode(address)
    if not location:
        return None, None
    return location.latitude, location.longitude

# Disabled batch step: would run geolocate on every value in the ADDRESS column
# and unpack the resulting (latitude, longitude) tuples into two new columns.
# NOTE(review): this passes the bare ADDRESS value, without the borough/city
# suffix used in the spot check below — confirm that is intended before enabling.
# combined['latitude'], combined['longitude'] = zip(*combined['ADDRESS'].apply(geolocate))

# Spot check: geocode a single street address qualified with borough and city.
geolocate('347 EAST 4TH STREET'+', Manhattan, New York City')


(40.7216651, -73.97831219999999)

In [16]:
# combined.to_csv('combined.csv', index=False)
