# NYC Apartment Search

## Setup

In [1]:
# Standard library imports
import json
import os
import io
import glob
import warnings
from datetime import datetime

# File and path handling
import pathlib

# HTTP and URL handling
import urllib.parse
import requests

# Data handling and analysis
import pandas as pd
import numpy as np
import geopandas as gpd
from geopandas.tools import sjoin

# Database and SQL handling
import psycopg2
import sqlalchemy as db
from sqlalchemy import create_engine, Column, Integer, String, Float, Date, text
from sqlalchemy.orm import declarative_base, sessionmaker

# Geometry and spatial analysis
import shapely
from shapely.geometry import Point, Polygon
from shapely import wkt
import geoalchemy2 as gdb
from geoalchemy2 import Geometry

# Visualization and plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.animation import FuncAnimation
import contextily as ctx
import seaborn as sns
from PIL import Image

# IPython and widgets
from IPython.display import Image as IPImage, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, IntSlider
from ipywidgets.embed import embed_minimal_html

# Warnings configuration
warnings.filterwarnings('ignore')

In [2]:
# Local input/output locations - the "data" directory is expected to exist already
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR.joinpath("zipcodes", "ZIP_CODE_040114.shp")
ZILLOW_DATA_FILE = DATA_DIR.joinpath("zillow_rent_data.csv")

# NYC Open Data: site root and the application token sent with each request
# NOTE(review): the token is committed in plain text - consider loading it from
# an environment variable instead.
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_APP_TOKEN = "UYsSh8MfAPVog5LPL1G3ySktk"

# CSV endpoints for the 311 (erm2-nwe9) and tree (5rq2-4hqu) datasets
url_311 = 'https://data.cityofnewyork.us/resource/erm2-nwe9.csv'
url_trees = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.csv'

# GeoJSON resource names for the same two dataset ids
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

# Name of the SQL schema file
DB_SCHEMA_FILE = "schema.sql"
# Directory where the DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [3]:
# Make sure QUERY_DIR exists. A single mkdir(exist_ok=True) call avoids the
# check-then-create race of a separate exists() test, and parents=True keeps
# this working if QUERY_DIR is ever changed to a nested path.
QUERY_DIR.mkdir(parents=True, exist_ok=True)

## Part 1: Data Preprocessing

### 1. Export Data

In [None]:
def download_nyc_geojson_data(url, app_token, filename, date_field, 
                              start_date, end_date, date_format="%Y-%m-%dT%H:%M:%S", limit=10000,
                              timeout=300):
    """
    Download rows from an NYC Open Data CSV endpoint for a date range and save them to a file.

    Despite the (historical) function name, the response body is treated as CSV text:
    the first page is written to the file including its header line, and every following
    page is appended with its header line removed. Pages are requested with the
    '$limit'/'$offset' query parameters until a page comes back with fewer than
    'limit' data rows.

    Parameters:
    - url (str): The URL endpoint for the API from which data is to be fetched.
    - app_token (str): Application token for API access.
    - filename (str or path-like): File where the downloaded data will be saved.
    - date_field (str): The field in the data used to filter by date.
    - start_date (datetime): The start date for the data query.
    - end_date (datetime): The end date for the data query.
    - date_format (str, optional): strftime format used to render both dates inside the
      '$where' clause. Defaults to "%Y-%m-%dT%H:%M:%S".
    - limit (int, optional): The maximum number of records to fetch per request. Defaults to 10000.
    - timeout (float or tuple, optional): Timeout in seconds passed to requests.get so a
      stalled connection cannot hang the download forever. Defaults to 300.

    Returns:
    None. The function writes the data to the specified file and prints a message if a
    request returns a non-200 status (the download stops at that point; anything already
    written is kept).

    Raises:
    - ValueError: if 'limit' is smaller than 1 (the paging loop could never advance).
    - requests.RequestException: network-level errors (including timeouts) are not caught.
    """
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")

    start_date_str = start_date.strftime(date_format)
    end_date_str = end_date.strftime(date_format)

    # Passing the query as a params dict lets requests URL-encode the spaces and
    # quotes in the $where clause instead of relying on a hand-built query string.
    params = {
        "$$app_token": app_token,
        "$where": f"{date_field} between '{start_date_str}' and '{end_date_str}'",
        # Offset paging is only reliable with a stable ordering; without it
        # consecutive pages may overlap or skip rows.
        "$order": ":id",
        "$limit": limit,
    }

    offset = 0
    first_batch = True
    while True:
        params["$offset"] = offset
        response = requests.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to download data at offset {offset}: Status code {response.status_code}")
            break

        # Split the header line from the data rows of this page.
        header, line_break, rows = response.text.partition('\n')
        if not line_break:
            # Not even one complete line came back: nothing to write, nothing more to fetch.
            break

        # Guarantee the page ends with a newline so the next appended page
        # cannot be glued onto the last row of this one.
        if rows and not rows.endswith('\n'):
            rows += '\n'

        # utf-8 + newline='' write the text exactly as received, independent of
        # the platform's default encoding and newline translation.
        if first_batch:
            # Only the first page keeps the column names.
            with open(filename, 'w', encoding='utf-8', newline='') as file:
                file.write(header + line_break + rows)
            first_batch = False
        elif rows:
            with open(filename, 'a', encoding='utf-8', newline='') as file:
                file.write(rows)

        # A short page means the date range has been exhausted.
        if rows.count('\n') < limit:
            break
        offset += limit

#### 1.1 Download 2015 tree data

In [None]:
# Export the 2015 tree data into DATA_DIR, next to the other input files
# (DATA_DIR / "tree_data.csv" is the same location as "data/tree_data.csv").
# NOTE(review): the query bounds are rendered as dates only (MM/DD/YYYY);
# confirm that rows stamped later in the day on 12/31/2015 are still matched.
download_nyc_geojson_data(
    url=url_trees,
    app_token=NYC_DATA_APP_TOKEN,
    filename=DATA_DIR / "tree_data.csv",
    date_field="created_at",
    start_date=datetime(2015, 1, 1),
    end_date=datetime(2015, 12, 31),
    date_format="%m/%d/%Y",
    limit=250000,
)

**<span style="color: red;">Test the function 1</span>**

Test for file existence: this test checks that the tree_data.csv file exists in the data directory after the download function has run.

In [None]:
# The tree download is expected to have left a regular file at data/tree_data.csv
assert pathlib.Path("data/tree_data.csv").is_file(), "File tree_data.csv does not exist"