 Data Preprocessing

In [69]:
import requests
import json
import pandas as pd
import geopandas as gpd
import os

In [70]:
app_token = '2bSOlTJkWZ0e43SGvaNbY1sHz'
API_endpoint_list = [
    "https://data.cityofnewyork.us/resource/5rq2-4hqu.json?$limit=500",  
    "https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=500"
]


# Create a 'data' directory if it doesn't exist
data_directory = 'data'
os.makedirs(data_directory, exist_ok=True)

# Corresponding filenames for each endpoint, with path to 'data' directory
file_names = [os.path.join(data_directory, "2015StreetTreesCensus_TREES.csv"), 
              os.path.join(data_directory, "311_Service_Requests.csv")]

headers = {
    'X-App-Token': app_token
}

# Processing each API endpoint separately and saving as CSV in the 'data' folder
for i, endpoint in enumerate(API_endpoint_list):
    response = requests.get(endpoint, headers=headers)
    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data)
        df.to_csv(file_names[i], index=False)
        print(f"Data from {endpoint} written to {file_names[i]} in CSV format")
    else:
        print(f"Error: {response.status_code} from {endpoint}")


Data from https://data.cityofnewyork.us/resource/5rq2-4hqu.json?$limit=500 written to data/2015StreetTreesCensus_TREES.csv in CSV format
Data from https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=500 written to data/311_Service_Requests.csv in CSV format


In [71]:
service_requests_path = './data/311_Service_Requests.csv'
trees_census_path = './data/2015StreetTreesCensus_TREES.csv'
rent_data_path = './data/zillow_rent_data.csv'
shapefile_path = './data/nyc_zipcodes.shp'

In [72]:
# Cleaning Shapefiles of NYC’s Zip Codes

# Load the shapefile with geopandas
gdf = gpd.read_file('./data/nyc_zipcodes.shp')
# Assuming 'ZIPCODE' and 'geometry' are the necessary columns
gdf = gdf[['ZIPCODE', 'geometry']]
# Ensure the ZIPCODE column is a string for consistency
gdf['ZIPCODE'] = gdf['ZIPCODE'].astype(str).str.zfill(5)
# Set the CRS to a common SRID if needed (e.g., SRID 4326 for WGS84)
gdf = gdf.to_crs(epsg=4326)
# Save the cleaned data back to a shapefile
gdf.to_file('./data/cleaned_nyc_zipcodes.shp')


In [78]:
rent_data_df = pd.read_csv(rent_data_path)
rent_data_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2015-01-31,...,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,1606.206406,...,1994.653463,2027.438438,2042.237444,2049.325559,2016.531345,2023.438976,2031.558202,2046.144009,2053.486247,2055.771355
1,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,1257.814660,...,1749.697900,1738.217986,1747.305840,1758.407295,1758.891075,1762.980879,1771.751591,1779.338402,1795.384582,1799.631140
2,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,,...,1701.217520,1706.900064,1706.067787,1723.722320,1735.484670,1752.132904,1756.990323,1754.429516,1757.602011,1755.031490
3,93144,6,79936,zip,TX,TX,El Paso,"El Paso, TX",El Paso County,,...,1419.480272,1458.063897,1471.726681,1466.734658,1456.175660,1462.478506,1466.267391,1490.237063,1488.180414,1494.366097
4,62093,7,11385,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,,...,2935.808220,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6717,418163,30158,89158,zip,NV,NV,Las Vegas,"Las Vegas-Henderson-Paradise, NV",Clark County,,...,3281.330738,3509.210744,3407.499896,3438.041504,3436.371804,3524.703410,3426.708975,3412.249969,3310.302151,3448.166667
6718,72017,30490,32461,zip,FL,FL,Panama City Beach,"Crestview-Fort Walton Beach-Destin, FL",Walton County,,...,,,,,,,2583.675563,2590.977335,2639.938102,2702.500000
6719,58956,30490,2876,zip,RI,RI,North Smithfield,"Providence-Warwick, RI-MA",Providence County,,...,,,,,,,,,,2250.000000
6720,91179,30490,76005,zip,TX,TX,Arlington,"Dallas-Fort Worth-Arlington, TX",Tarrant County,,...,2148.224601,2169.143026,2179.393248,2226.624684,2369.532530,2374.713926,2414.638428,2389.749852,2383.185013,2313.944444


In [79]:
#Cleaning Historical Monthly Average Rents by Zip Code from Zillow

# Load the Zillow Rent Data
rent_data_df = pd.read_csv(rent_data_path)
# Selecting the desired columns including region, city, countyName, and the latest rent data
# Assuming the last column is the latest rent data (you might want to verify this)
latest_rent_column = rent_data_df.columns[-1]
rent_data_df = rent_data_df[['RegionName', 'City', 'CountyName', '2023-08-31', latest_rent_column]]
# Renaming columns for clarity
rent_data_df.rename(columns={'RegionName': 'zipcode', 'City': 'city', 'CountyName': 'county_name', '2023-08-31': 'Auguest2023_rent_amount',latest_rent_column: 'rent_amount'}, inplace=True)
# Ensuring the ZIP code is a string and formatted correctly
rent_data_df['zipcode'] = rent_data_df['zipcode'].astype(str).str.zfill(5)
# Saving the cleaned data
rent_data_df.to_csv('./data/cleaned_zillow_rent_data.csv', index=False)


In [74]:
#Cleaning Historical Data from NYC Open Data on 311 Complaints
service_requests_df = pd.read_csv(service_requests_path)
service_requests_df = service_requests_df[['unique_key', 'created_date', 'complaint_type', 'incident_zip']]
service_requests_df['created_date'] = pd.to_datetime(service_requests_df['created_date'])
service_requests_df['incident_zip'] = service_requests_df['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)
service_requests_df.dropna(subset=['incident_zip'], inplace=True)
service_requests_df.to_csv('./data/cleaned_311_Service_Requests.csv', index=False)

In [75]:
#Cleaning the 2015 Tree Census
trees_census_df = pd.read_csv(trees_census_path)
trees_census_df = trees_census_df[['tree_id', 'spc_common', 'health', 'zipcode']]
trees_census_df['zipcode'] = trees_census_df['zipcode'].astype(str).str.zfill(5)
trees_census_df['health'].fillna('Unknown', inplace=True)
trees_census_df.to_csv('./data/cleaned_2015StreetTreesCensus_TREES.csv', index=False)



Storing Data

In [46]:
zipcode_df = rent_data_df["zipcode"]
zipcode_df.head()

0    77494
1    77449
2    77084
3    79936
4    11385
Name: zipcode, dtype: object

In [76]:
service_requests_df.tail()

Unnamed: 0,unique_key,created_date,complaint_type,incident_zip
495,59683915,2023-12-09 00:05:23,Noise - Helicopter,10128
496,59682527,2023-12-09 00:05:17,Illegal Parking,11364
497,59685193,2023-12-09 00:05:16,Illegal Parking,11373
498,59676738,2023-12-09 00:05:07,Noise - Residential,10002
499,59684515,2023-12-09 00:05:01,Noise - Commercial,11222


In [80]:
rent_data_df.head()

Unnamed: 0,zipcode,city,county_name,Auguest2023_rent_amount,rent_amount
0,77494,Katy,Fort Bend County,2053.486247,2055.771355
1,77449,Katy,Harris County,1795.384582,1799.63114
2,77084,Houston,Harris County,1757.602011,1755.03149
3,79936,El Paso,El Paso County,1488.180414,1494.366097
4,11385,New York,Queens County,3064.476503,3079.585783


In [81]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import CreateTable


# Replace these variables with your actual database connection details
db_params = {
    'host': 'localhost',
    'database': 'template1',
    'user': 'ritajkx',
    'password': 'none',
    'port': '5432',  # Typically 5432 for PostgreSQL
}

# Construct the connection string
conn_string = "postgresql://{user}:{password}@{host}:{port}/{database}".format(**db_params)

# Create an SQLAlchemy engine
engine = create_engine(conn_string)

Base = declarative_base()

# Define a class for each table
class ZipCode(Base):
    __tablename__ = 'zip_codes'
    id = Column(Integer, primary_key=True)
    zip_code = Column(String)

class Complaint(Base):
    __tablename__ = 'complaints'
    id = Column(Integer, primary_key=True)
    complaint_type = Column(String)

class Tree(Base):
    __tablename__ = 'trees'
    id = Column(Integer, primary_key=True)
    species = Column(String)

class Rent(Base):
    __tablename__ = 'rents'
    id = Column(Integer, primary_key=True)
    average_rent = Column(Float)
    

# Generate schema.sql
with open('schema.sql', 'w') as f:
    for table in [ZipCode.__table__, Complaint.__table__, Tree.__table__, Rent.__table__]:
        f.write(str(CreateTable(table)))
        f.write(";\n")

# Create the tables in the database
Base.metadata.create_all(engine)

# Define the table name
table_name_1 = 'zip_codes'
table_name_2 = 'trees'
table_name_3 = 'rents'
table_name_4 = 'complaints'

# Use the to_sql method to write the DataFrame to the SQL database
rent_data_df.to_sql(table_name_3, con=engine, index=False, if_exists='replace') 
service_requests_df.to_sql(table_name_4, con=engine, index=False, if_exists='replace')
trees_census_df.to_sql(table_name_2, con=engine, index=False, if_exists='replace')  
zipcode_df.to_sql(table_name_1, con=engine, index=False, if_exists='replace')  

722

In [48]:
pd.read_sql_table('trees', conn_string)  

Unnamed: 0,tree_id,spc_common,health,zipcode
0,180683,red maple,Fair,11375
1,200540,pin oak,Fair,11357
2,204026,honeylocust,Good,11211
3,204337,honeylocust,Good,11211
4,189565,American linden,Good,11215
...,...,...,...,...
495,198323,sweetgum,Good,10312
496,158851,sweetgum,Good,11211
497,195970,sweetgum,Good,10028
498,197287,Sophora,Good,10456


In [49]:
pd.read_sql_table('zip_codes', conn_string) 

Unnamed: 0,zipcode
0,77494
1,77449
2,77084
3,79936
4,11385
...,...
6717,89158
6718,32461
6719,02876
6720,76005


In [82]:
pd.read_sql_table('rents', conn_string) 

Unnamed: 0,zipcode,city,county_name,Auguest2023_rent_amount,rent_amount
0,77494,Katy,Fort Bend County,2053.486247,2055.771355
1,77449,Katy,Harris County,1795.384582,1799.631140
2,77084,Houston,Harris County,1757.602011,1755.031490
3,79936,El Paso,El Paso County,1488.180414,1494.366097
4,11385,New York,Queens County,3064.476503,3079.585783
...,...,...,...,...,...
6717,89158,Las Vegas,Clark County,3310.302151,3448.166667
6718,32461,Panama City Beach,Walton County,2639.938102,2702.500000
6719,02876,North Smithfield,Providence County,,2250.000000
6720,76005,Arlington,Tarrant County,2383.185013,2313.944444


In [51]:
pd.read_sql_table('complaints', conn_string) 

Unnamed: 0,unique_key,created_date,complaint_type,incident_zip
0,59682706,2023-12-09 12:00:00,Derelict Vehicles,11412
1,59683999,2023-12-09 12:00:00,Derelict Vehicles,11357
2,59681385,2023-12-09 12:00:00,Derelict Vehicles,11222
3,59681790,2023-12-09 02:41:46,Graffiti,10032
4,59684401,2023-12-09 02:06:35,Graffiti,11211
...,...,...,...,...
495,59683915,2023-12-09 00:05:23,Noise - Helicopter,10128
496,59682527,2023-12-09 00:05:17,Illegal Parking,11364
497,59685193,2023-12-09 00:05:16,Illegal Parking,11373
498,59676738,2023-12-09 00:05:07,Noise - Residential,10002


In [None]:
#Between October 1st, 2022 and September 30th, 2023 (inclusive), find the number of 311 complaints per zip code. 

In [62]:
def get_complaints_count_per_zip(engine):
    # Define the SQL query
    query = """
    SELECT incident_zip, COUNT(*) AS complaint_count
    FROM complaints
    WHERE created_date::date >= '2022-10-01' AND created_date <= '2023-09-30'
    GROUP BY incident_zip
    ORDER BY complaint_count DESC;
    """
    
    # Execute the query and return a DataFrame
    return pd.read_sql_query(query, engine)

# Use the function to get the complaint counts per zip code
complaints_per_zip_df = get_complaints_count_per_zip(engine)

# Display the DataFrame
print(complaints_per_zip_df)

Empty DataFrame
Columns: [incident_zip, complaint_count]
Index: []


In [63]:
def get_complaints_count_per_zip(engine):
    query = """
    SELECT incident_zip, COUNT(*) AS complaint_count
    FROM complaints
    WHERE created_date::date = '2023-12-09'
    GROUP BY incident_zip
    ORDER BY complaint_count DESC;
    """
    return pd.read_sql_query(query, engine)

complaints_per_zip_df = get_complaints_count_per_zip(engine)

print(complaints_per_zip_df)

    incident_zip  complaint_count
0          10004               16
1          10025               13
2          10013               13
3          11414               13
4          10032               12
..           ...              ...
127        11229                1
128        10027                1
129        10024                1
130        11249                1
131        11354                1

[132 rows x 2 columns]


In [None]:
# Query 2: Where has the most greenery?
# Using just the trees table, which 10 zip codes have the most trees?

#The query result should have two columns, 10 rows. The rows should be sorted by the total number of trees, descending.


In [66]:
def get_top_zipcodes_with_most_trees(engine):
    query = """
    SELECT zipcode, COUNT(*) AS tree_count
    FROM trees
    GROUP BY zipcode
    ORDER BY tree_count DESC
    LIMIT 10;
    """
    return pd.read_sql_query(query, engine)

top_zipcodes_with_most_trees_df = get_top_zipcodes_with_most_trees(engine)

print(top_zipcodes_with_most_trees_df)

  zipcode  tree_count
0   10023          26
1   11205          19
2   11215          19
3   11375          18
4   10457          17
5   11105          17
6   10306          15
7   10024          14
8   11226          13
9   10458          12


Understanding Data

Visualizing Data