 Data Preprocessing

In [8]:
import requests
import json
import pandas as pd
import geopandas as gpd
import os

In [9]:
# Download the raw NYC Open Data datasets and store each one as a CSV file in
# the local 'data' directory.

# The app token is read from the environment so the credential is not committed
# with the notebook. If NYC_OPEN_DATA_APP_TOKEN is unset, no token header is sent.
app_token = os.environ.get('NYC_OPEN_DATA_APP_TOKEN', '')
API_endpoint_list = [
    "https://data.cityofnewyork.us/resource/5rq2-4hqu.json",
    "https://data.cityofnewyork.us/resource/erm2-nwe9.json"
]

# Create a 'data' directory if it doesn't exist
data_directory = 'data'
os.makedirs(data_directory, exist_ok=True)

# Corresponding filenames for each endpoint, with path to 'data' directory
# (same order as API_endpoint_list).
file_names = [os.path.join(data_directory, "2015StreetTreesCensus_TREES.csv"),
              os.path.join(data_directory, "311_Service_Requests.csv")]

# Only attach the token header when a token is actually configured.
headers = {'X-App-Token': app_token} if app_token else {}

# NOTE(review): no $limit / paging parameters are sent, so each response
# presumably contains only the API's default page of rows — confirm that this
# is the intended sample size.

# Processing each API endpoint separately and saving as CSV in the 'data' folder
for endpoint, file_name in zip(API_endpoint_list, file_names):
    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(endpoint, headers=headers, timeout=60)
    except requests.RequestException as exc:
        # Report connection-level failures the same way as HTTP errors and
        # move on to the next endpoint.
        print(f"Error: {exc} from {endpoint}")
        continue

    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data)
        df.to_csv(file_name, index=False)
        print(f"Data from {endpoint} written to {file_name} in CSV format")
    else:
        print(f"Error: {response.status_code} from {endpoint}")


Data from https://data.cityofnewyork.us/resource/5rq2-4hqu.json written to data/2015StreetTreesCensus_TREES.csv in CSV format
Data from https://data.cityofnewyork.us/resource/erm2-nwe9.json written to data/311_Service_Requests.csv in CSV format


In [10]:
# Locations of the raw input files used by the cleaning cells below.

# Files that must already be present in ./data
shapefile_path = './data/nyc_zipcodes.shp'
rent_data_path = './data/zillow_rent_data.csv'

# Files written by the download cell
trees_census_path = './data/2015StreetTreesCensus_TREES.csv'
service_requests_path = './data/311_Service_Requests.csv'

In [11]:
# Cleaning Shapefiles of NYC's Zip Codes

# Load the shapefile with geopandas, using the shared path constant instead of
# repeating the literal path.
zipcodes_raw = gpd.read_file(shapefile_path)
# Keep only the zip code and its geometry. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a view of
# the raw data.
# NOTE(review): assumes the shapefile exposes 'ZIPCODE' and 'geometry' columns.
gdf = zipcodes_raw[['ZIPCODE', 'geometry']].copy()
# Ensure the ZIPCODE column is a zero-padded 5-character string for consistency
gdf['ZIPCODE'] = gdf['ZIPCODE'].astype(str).str.zfill(5)
# Reproject the geometries to EPSG:4326 (WGS84). to_crs transforms coordinates
# from the CRS stored in the shapefile; it does not merely relabel them.
gdf = gdf.to_crs(epsg=4326)
# Save the cleaned data back to a shapefile
gdf.to_file('./data/cleaned_nyc_zipcodes.shp')


In [12]:
# Cleaning Historical Monthly Average Rents by Zip Code from Zillow

rent_data_raw = pd.read_csv(rent_data_path)
# The right-most column is taken as the latest rent figure.
# NOTE(review): assumes the monthly columns are in chronological order — verify.
latest_rent_column = rent_data_raw.columns[-1]

# Build the cleaned frame with a non-mutating chain instead of renaming and
# assigning in place on a column subset of the raw frame.
rent_data_df = (
    rent_data_raw[['RegionName', latest_rent_column]]
    .rename(columns={'RegionName': 'zipcode', latest_rent_column: 'rent_amount'})
    # Zip codes are stored as zero-padded 5-character strings.
    .assign(zipcode=lambda frame: frame['zipcode'].astype(str).str.zfill(5))
)
rent_data_df.to_csv('./data/cleaned_zillow_rent_data.csv', index=False)



In [13]:
# Cleaning Historical Data from NYC Open Data on 311 Complaints
service_requests_raw = pd.read_csv(service_requests_path)

service_requests_df = (
    service_requests_raw[['unique_key', 'created_date', 'complaint_type', 'incident_zip']]
    # Drop rows without a zip code BEFORE the string conversion: astype(str)
    # turns a missing value into the text 'nan', which dropna would no longer
    # recognise (and which zfill would pad to '00nan').
    .dropna(subset=['incident_zip'])
    .assign(
        created_date=lambda frame: pd.to_datetime(frame['created_date']),
        # Zip codes may have been parsed as floats (e.g. 10001.0): keep the
        # part before the decimal point and zero-pad to 5 characters.
        incident_zip=lambda frame: (
            frame['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
        ),
    )
)
service_requests_df.to_csv('./data/cleaned_311_Service_Requests.csv', index=False)

In [14]:
# Cleaning the 2015 Tree Census
trees_census_raw = pd.read_csv(trees_census_path)

trees_census_df = (
    trees_census_raw[['tree_id', 'spc_common', 'health', 'zipcode']]
    .assign(
        # Zip codes are stored as zero-padded 5-character strings.
        zipcode=lambda frame: frame['zipcode'].astype(str).str.zfill(5),
        # Assign the filled column back explicitly. Calling
        # fillna(..., inplace=True) on a selected column is a chained in-place
        # update that pandas warns about and that does not modify the frame
        # under Copy-on-Write.
        health=lambda frame: frame['health'].fillna('Unknown'),
    )
)
trees_census_df.to_csv('./data/cleaned_2015StreetTreesCensus_TREES.csv', index=False)



Storing Data

Understanding Data

Visualizing Data