# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be considered "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

First import all the modules needed for the project 

In [1]:
import io
import json
import os
import pathlib
import urllib.parse
from typing import Text
from typing import Tuple

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import psycopg2
import requests
import sqlalchemy as db
from geoalchemy2 import Geometry
from IPython.display import display
from ipywidgets import interact, widgets,fixed
from psycopg2 import sql
from shapely.geometry import Point
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.orm import declarative_base

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Define the paths used to read and write the data files, and the directory where the database queries used later in the project will be saved.

In [2]:
# Locations of the local input files
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "ZIP_CODE_040114.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

# NYC Open Data settings: app token placeholder, base URL and dataset file names
NYC_DATA_APP_TOKEN = "FILL_ME_IN"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

# Database settings
DB_NAME = "FILL_ME_IN"
DB_USER = "FILL_ME_IN"
# The SQLAlchemy dialect is named "postgresql"; the old "postgres" alias is
# rejected by SQLAlchemy 1.4+ and makes create_engine() fail.
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# define directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure the QUERY_DIRECTORY exists
# exist_ok=True makes this safe to re-run without a separate existence check;
# parents=True also creates missing parent directories.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# store app_token
# The token is a credential, so it is read from the NYC_DATA_APP_TOKEN
# environment variable instead of being committed with the notebook.
# If the variable is unset, app_token is None; `requests` omits query
# parameters whose value is None, so downloads then run without a token.
app_token = os.environ.get("NYC_DATA_APP_TOKEN")

## Part 1: Data Preprocessing
In Part 1, we will download data, clean and filter for the relevant data, fill in missing data, and generate samples of these datasets.

Download and clean the zipcode, 311 (complaints), tree, and Zillow (rent) data files to prepare them for later use.

In [5]:
# load and clean zipcode data
def load_and_clean_zipcodes(zipcode_datafile: str) -> gpd.GeoDataFrame:
    """Read the zipcode boundary file into a GeoDataFrame.

    Args:
        zipcode_datafile: Path of the file handed to ``gpd.read_file``.

    Returns:
        The GeoDataFrame exactly as read; no columns are dropped or renamed here.
    """
    return gpd.read_file(zipcode_datafile)
# Attribute columns not needed downstream; only ZIPCODE and geometry are kept
zipcode_columns_to_drop = [
    'BLDGZIP', 'POPULATION', 'AREA', 'ST_FIPS', 'CTY_FIPS', 'URL',
    'SHAPE_AREA', 'SHAPE_LEN', 'PO_NAME', 'STATE', 'COUNTY',
]
geodf_zipcode_data = (
    load_and_clean_zipcodes('data/nyc_zipcodes/nyc_zipcodes.dbf')
    .drop(columns=zipcode_columns_to_drop)
    .to_crs(epsg=4326)  # reproject to EPSG:4326, the CRS used for the other geo frames
    .rename(columns={'ZIPCODE': 'zipcode'})
)


In [None]:
# load and clean 311_data
def download_and_clean_311_data() -> pd.DataFrame:
    """Download 311 complaints created on or after 2015-01-01 and save them as CSV.

    Sends a single GET request to the NYC Open Data 311 endpoint asking only
    for the columns needed later, parses the JSON body into a DataFrame and
    writes it to ``data/nyc_data_311.csv``.

    Returns:
        DataFrame with the columns created_date, complaint_type, descriptor,
        incident_zip, latitude and longitude as returned by the API.

    Raises:
        requests.HTTPError: If the response status is not 200. Previously the
            error was only printed and ``None`` was returned, which made the
            caller fail later with an unrelated AttributeError.
    """
    api_endpoint = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
    selected_columns = ['created_date', 'complaint_type', 'descriptor', 'incident_zip', 'latitude', 'longitude']
    params = {
        '$$app_token': app_token,
        '$where': 'created_date >= "2015-01-01T00:00:00.000"',
        '$select': ','.join(selected_columns),
        # Upper bound on the number of rows requested in this single call
        '$limit': 34856027,
    }
    response = requests.get(api_endpoint, params=params)
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Error: {response.status_code}, {response.text}", response=response
        )

    # Wrap the payload in StringIO: passing a literal JSON string to
    # pd.read_json is deprecated in recent pandas releases.
    nyc_data = pd.read_json(io.StringIO(response.text))

    # Make sure the output folder exists so the download is not lost on save
    output_file = pathlib.Path('data/nyc_data_311.csv')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    nyc_data.to_csv(output_file, index=False)
    return nyc_data


# Rows with any missing value (including coordinates or zipcode) are discarded
geodf_311_data = download_and_clean_311_data().dropna()
# Render incident_zip as text and cut anything after a '.', so a float-like
# value such as "10465.0" becomes "10465"
geodf_311_data['zipcode'] = geodf_311_data['incident_zip'].astype(str).str.split('.').str[0]
# points_from_xy builds all point geometries in one vectorised call; a
# row-by-row apply() creating Point objects is far slower on a frame this size
geodf_311_data['geometry'] = gpd.points_from_xy(geodf_311_data['longitude'], geodf_311_data['latitude'])
geodf_311_data = gpd.GeoDataFrame(geodf_311_data, geometry='geometry', crs='EPSG:4326')
geodf_311_data = (
    geodf_311_data
    .drop(columns=['incident_zip', 'descriptor', 'latitude', 'longitude'])
    .rename(columns={'created_date': 'date'})
)

In [None]:
# load and clean tree data
def download_and_clean_tree_data() -> gpd.GeoDataFrame:
    """Download the tree dataset as GeoJSON and save a CSV copy.

    Sends a single GET request to the NYC Open Data tree endpoint asking only
    for the columns needed later, loads the GeoJSON body with geopandas and
    writes the result to ``data/tree_data.csv``.

    Returns:
        GeoDataFrame with the requested columns tree_id, health, status,
        spc_common, zipcode, latitude and longitude.

    Raises:
        requests.HTTPError: If the response status is not 200. Previously the
            error was only printed and ``None`` was returned, which made the
            next cell fail with an unrelated error.
    """
    api_endpoint1 = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.geojson'
    selected_columns = ['tree_id', 'health', 'status', 'spc_common', 'zipcode', 'latitude', 'longitude']
    params = {
        '$$app_token': app_token,
        # Upper bound on the number of rows requested in this single call
        '$limit': 683788,
        '$select': ','.join(selected_columns),
    }
    response = requests.get(api_endpoint1, params=params)
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Error: {response.status_code}, {response.text}", response=response
        )

    # Hand geopandas a file-like object: read_file documents paths, URLs and
    # objects with read(), not a raw GeoJSON string.
    geodf_tree_data = gpd.read_file(io.BytesIO(response.content))

    # Save the GeoDataFrame to a CSV file (creating the folder if needed)
    output_file = pathlib.Path('data/tree_data.csv')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    geodf_tree_data.to_csv(output_file, index=False)

    return geodf_tree_data

# Call the function to download the tree data
geodf_tree_data = download_and_clean_tree_data()

In [15]:
# Build every point geometry in one vectorised call instead of a row-wise apply()
geodf_tree_data['geometry'] = gpd.points_from_xy(geodf_tree_data['longitude'], geodf_tree_data['latitude'])
geodf_tree_data = gpd.GeoDataFrame(geodf_tree_data, geometry='geometry', crs='EPSG:4326')
geodf_tree_data = geodf_tree_data.drop(columns=['latitude', 'longitude'])
geodf_tree_data['zipcode'] = geodf_tree_data['zipcode'].astype(str)
# columns= is required: rename() with a bare mapping targets the row index,
# which silently left the spc_common column un-renamed
geodf_tree_data = geodf_tree_data.rename(columns={"spc_common": "species"})

In [14]:
# load and clean zillow data
def load_and_clean_zillow_data(zillow_datafile='data/zillow_rent_data.csv') -> pd.DataFrame:
    """Load the Zillow rent CSV and keep only the rows for New York City.

    Args:
        zillow_datafile: Path of the CSV to read. Defaults to the location
            used throughout this notebook.

    Returns:
        A new DataFrame without the RegionID, SizeRank, RegionType and
        StateName columns, restricted to rows whose City is 'New York'.
        It is an independent copy, so callers can add columns to it without
        pandas raising SettingWithCopyWarning.
    """
    zillow_raw = pd.read_csv(zillow_datafile)
    zillow_trimmed = zillow_raw.drop(columns=['RegionID', 'SizeRank', 'RegionType', 'StateName'])
    return zillow_trimmed[zillow_trimmed['City'] == 'New York'].copy()

df_zillow_data = load_and_clean_zillow_data()
# After loading, every column that is not region metadata holds one month of
# rent. Selecting them by name is safer than a positional slice, which would
# silently shift if the CSV layout changed.
metadata_columns = ['RegionName', 'State', 'City', 'Metro', 'CountyName']
selected_columns = df_zillow_data.drop(columns=metadata_columns)
# rent_avg calculates average history rent in each zipcode. we use it in visualization 3
# assign() returns a new frame, so nothing is written into a filtered slice
df_zillow_data = (
    df_zillow_data
    .assign(rent_avg=selected_columns.mean(axis=1, skipna=True))
    .rename(columns={'RegionName': 'zipcode'})
    .astype({'zipcode': str})
    .drop(columns=['CountyName', 'State', 'City', 'Metro'])
)

Display the information and sample for each of the dataframes for our knowledge and understanding of the structures and features included in them

In [7]:
# Summarize the zipcode GeoDataFrame: columns, non-null counts and dtypes
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zipcode   263 non-null    object  
 1   geometry  263 non-null    geometry
dtypes: geometry(1), object(1)
memory usage: 4.2+ KB


In [8]:
# Show the first 5 rows of the zipcode GeoDataFrame
geodf_zipcode_data.head()

Unnamed: 0,zipcode,geometry
0,11436,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,11213,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,11212,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,11225,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,11218,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


In [12]:
# Summarize the 311 complaints frame: columns and dtypes
geodf_311_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23117461 entries, 0 to 23117460
Data columns (total 4 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   date            object
 1   complaint_type  object
 2   geometry        object
 3   zipcode         int64 
dtypes: int64(1), object(3)
memory usage: 705.5+ MB


In [13]:
# Show the first 5 rows of the 311 complaints frame
geodf_311_data.head()

Unnamed: 0,date,complaint_type,geometry,zipcode
0,2023-11-29T12:00:00.000,Derelict Vehicles,POINT (-73.8142801312359 40.817187332918586),10465
1,2023-11-29T12:00:00.000,Derelict Vehicles,POINT (-73.82745673428309 40.844792223265024),10461
2,2023-11-29T12:00:00.000,Derelict Vehicles,POINT (-73.95719798301853 40.70100122473231),11206
3,2023-11-29T12:00:00.000,Derelict Vehicles,POINT (-73.99160285570444 40.5959797557408),11214
4,2023-11-29T01:19:45.000,Noise - Helicopter,POINT (-73.960360425043 40.78063308264647),10028


In [16]:
# Summarize the tree GeoDataFrame: columns, non-null counts and dtypes
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 683788 entries, 0 to 683787
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   health    652172 non-null  object  
 1   zipcode   683788 non-null  object  
 2   tree_id   683788 non-null  int64   
 3   species   652169 non-null  object  
 4   status    683788 non-null  object  
 5   geometry  683788 non-null  geometry
dtypes: geometry(1), int64(1), object(4)
memory usage: 36.5+ MB


In [17]:
# Show the first 5 rows of the tree GeoDataFrame
geodf_tree_data.head()

Unnamed: 0,health,zipcode,tree_id,species,status,geometry
0,Fair,11375,180683,red maple,Alive,POINT (-73.84422 40.72309)
1,Fair,11357,200540,pin oak,Alive,POINT (-73.81868 40.79411)
2,Good,11211,204026,honeylocust,Alive,POINT (-73.93661 40.71758)
3,Good,11211,204337,honeylocust,Alive,POINT (-73.93446 40.71354)
4,Good,11215,189565,American linden,Alive,POINT (-73.97598 40.66678)


In [18]:
# Summarize the Zillow rent DataFrame: column count and dtypes
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 4 to 6721
Columns: 107 entries, zipcode to rent_avg
dtypes: float64(106), object(1)
memory usage: 122.3+ KB


In [19]:
# Show the first 5 rows of the Zillow rent DataFrame
df_zillow_data.head()

Unnamed: 0,zipcode,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,2015-08-31,2015-09-30,...,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30,rent_avg
4,11385,,2087.527084,,2149.924252,2166.263698,2148.992886,2190.098591,2264.966715,2297.900917,...,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783,2476.923402
6,11208,,,,,,,,,,...,2588.030194,2613.790654,2585.561351,2633.200754,2672.038493,2806.918757,2765.224364,2737.54747,2728.733333,2560.469803
12,11236,,,,,,,,,,...,,,,,,,,2285.460026,2362.5,2323.980013
13,10467,,,,,,,,,,...,2155.617718,2172.346611,2160.962748,2110.533203,2180.323655,2276.37229,2334.204728,2353.686402,2423.888889,1861.072982
14,11373,,,,,,,,,,...,2255.604528,2262.101623,2271.514956,2250.182334,2231.959479,2257.413993,2247.592851,2302.557354,2292.994444,2084.549636


## Part 2: Storing Data
In Part 2, we will take the datasets downloaded & cleaned from Part 1, and populate a PostgreSQL database with tables generated from the datasets.

Create a new database in PostgreSQL with which we'll load in our preprocessed datasets, and turn on the PostGIS extension

In [24]:
# Create the project database; createdb reports an error if it already exists
!createdb Julia-Joy1

createdb: error: database creation failed: ERROR:  database "Julia-Joy1" already exists


In [7]:
!psql --dbname Julia-Joy1 -c 'CREATE EXTENSION postgis;'

ERROR:  extension "postgis" already exists


In [5]:
import psycopg2
from psycopg2 import sql

def setup_new_postgis_database(username, db_name):
    """Enable the PostGIS extension in an existing database.

    Connects to ``db_name`` as ``username``, runs
    ``CREATE EXTENSION IF NOT EXISTS postgis`` and always closes the
    connection. The database itself is not created here; it must already
    exist.

    Args:
        username: PostgreSQL role used to connect.
        db_name: Name of the database to connect to.

    Raises:
        psycopg2.Error: If the connection or the statement fails.
    """
    connection = psycopg2.connect(user=username, dbname=db_name)
    try:
        # `with connection` commits on success and rolls back on error, but
        # does not close the connection; `with cursor` closes the cursor.
        with connection, connection.cursor() as cursor:
            cursor.execute("CREATE EXTENSION IF NOT EXISTS postgis;")
    finally:
        connection.close()

In [6]:
# Run the database setup for user 'ljy' against the 'Julia-Joy1' database
setup_new_postgis_database('ljy', 'Julia-Joy1')

### Creating Tables

In SQL, define SQL statements for four tables with desired keys and columns; then create and execute the sql schema file to create our four tables

In [22]:
# Zipcode boundaries: one polygon per row, stored in EPSG:4326
ZIPCODE_SCHEMA = """
CREATE TABLE IF NOT EXISTS zipcodes (
    id SERIAL PRIMARY KEY,
    zipcode VARCHAR(10),
    geometry GEOMETRY(POLYGON, 4326)
);
"""

# 311 complaints: one point per complaint
NYC_311_SCHEMA = """
CREATE TABLE IF NOT EXISTS complaints (
    id SERIAL PRIMARY KEY,
    date DATE,
    complaint_type VARCHAR(255),
    geometry GEOMETRY(POINT, 4326),
    zipcode VARCHAR
);
"""

# Trees: one point per tree. The column is named `zipcode` (not `zipcodes`)
# to match the tree GeoDataFrame and the index created on trees(zipcode);
# `species` is included because the GeoDataFrame carries that column too.
NYC_TREE_SCHEMA = """
CREATE TABLE IF NOT EXISTS trees (
    id SERIAL PRIMARY KEY,
    health VARCHAR,
    zipcode VARCHAR,
    tree_id INTEGER,
    species VARCHAR,
    status VARCHAR,
    geometry GEOMETRY(POINT, 4326)
);
"""

# Rents: average historical rent per zipcode
ZILLOW_SCHEMA = """
CREATE TABLE IF NOT EXISTS rents (
    id SERIAL PRIMARY KEY,
    zipcode VARCHAR(10),
    rent_avg NUMERIC
);
"""

In [23]:
# create that required schema.sql file
# The four CREATE TABLE statements are written back to back, in this order
schema_statements = (ZIPCODE_SCHEMA, NYC_311_SCHEMA, NYC_TREE_SCHEMA, ZILLOW_SCHEMA)
with open(DB_SCHEMA_FILE, "w") as schema_file:
    schema_file.writelines(schema_statements)

In [7]:
# execute the schema files to create tables
from sqlalchemy import text
from sqlalchemy import create_engine

# The password is deliberately not part of the URL, so no credential is stored
# in the notebook. When the URL carries none, libpq (used by psycopg2) falls
# back to the PGPASSWORD environment variable or the ~/.pgpass file.
DATABASE_URL = 'postgresql://ljy@localhost:5432/Julia-Joy1'
engine = create_engine(DATABASE_URL)

# engine.begin() opens a connection plus a transaction, commits when the block
# succeeds, rolls back on error, and returns the connection to the pool.
with engine.begin() as connection:
    for schema_statement in (ZIPCODE_SCHEMA, NYC_311_SCHEMA, NYC_TREE_SCHEMA, ZILLOW_SCHEMA):
        connection.execute(text(schema_statement))

### Add Data to Database

Connect to the created database and add data into it

In [25]:
from geoalchemy2 import Geometry

# Zipcode polygons; if_exists='replace' recreates the table on every run
geodf_zipcode_data.to_postgis("zipcodes", engine, index=False, if_exists='replace', dtype={'geometry': Geometry('POLYGON', srid=4326)})

# Both point datasets are put in EPSG:4326 and then written the same way
geodf_311_data = geodf_311_data.to_crs("EPSG:4326")
geodf_tree_data = geodf_tree_data.to_crs("EPSG:4326")
for table_name, point_frame in (("complaints", geodf_311_data), ("trees", geodf_tree_data)):
    point_frame.to_postgis(table_name, engine, if_exists="replace", index=False, dtype={"geometry": Geometry('POINT', 4326)})

# Rent data has no geometry, so plain to_sql is used; its return value is the cell output
df_zillow_data.to_sql("rents", engine, if_exists="replace", index=False)

145

In [26]:
# engine.begin() commits on exit. With a bare engine.connect() under
# SQLAlchemy 2.x nothing is committed, so the new indexes would be rolled
# back when the connection closes.
# IF NOT EXISTS lets the cell be re-run without "already exists" errors.
with engine.begin() as connection:
    connection.execute(text("CREATE INDEX IF NOT EXISTS idx_rent_zipcode ON rents (zipcode);"))
    connection.execute(text("CREATE INDEX IF NOT EXISTS idx_tree_zipcode ON trees (zipcode);"))
    connection.execute(text("CREATE INDEX IF NOT EXISTS idx_complaint_zipcode ON complaints (zipcode, date);"))