 # Data Preprocessing

In [1]:
import requests
import json
import pandas as pd
import geopandas as gpd
import os
from shapely.geometry import Point

In [37]:
# Setup for API token and endpoint list
app_token = '2bSOlTJkWZ0e43SGvaNbY1sHz'
API_endpoint_list = [
    "https://data.cityofnewyork.us/resource/5rq2-4hqu.json?$limit=500",  # Endpoint for 2015 Street Trees Census
    "https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=500"   # Endpoint for 311 Service Requests
]

# Create a directory named 'data' if it does not exist, to store the downloaded data
data_directory = 'data'
os.makedirs(data_directory, exist_ok=True)

# Paths for saving the downloaded data in CSV format
file_names = [os.path.join(data_directory, "2015StreetTreesCensus_TREES.csv"), 
              os.path.join(data_directory, "311_Service_Requests.csv")]

# Headers for the API request, including the application token for authentication
headers = {'X-App-Token': app_token}

# Looping through each API endpoint to fetch and save data
for i, endpoint in enumerate(API_endpoint_list):
    # Sending a request to the API endpoint
    response = requests.get(endpoint, headers=headers)
    # Check if the response is successful (status code 200)
    if response.status_code == 200:
        # Convert the JSON data to a pandas DataFrame
        data = response.json()
        df = pd.DataFrame(data)
        # Save the DataFrame as a CSV file
        df.to_csv(file_names[i], index=False)
        print(f"Data from {endpoint} written to {file_names[i]} in CSV format")
    else:
        # Print an error message if the request was unsuccessful
        print(f"Error: {response.status_code} from {endpoint}")


Data from https://data.cityofnewyork.us/resource/5rq2-4hqu.json?$limit=500 written to data/2015StreetTreesCensus_TREES.csv in CSV format
Data from https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=500 written to data/311_Service_Requests.csv in CSV format


In [38]:
# Path to the 311 Service Requests data file
service_requests_path = './data/311_Service_Requests.csv'

# Path to the 2015 Street Trees Census data file
trees_census_path = './data/2015StreetTreesCensus_TREES.csv'

# Path to the Zillow Rent Data file
rent_data_path = './data/zillow_rent_data.csv'

# Path to the NYC Zip Codes shapefile, used in geographic information systems
shapefile_path = './data/nyc_zipcodes.shp'


In [4]:
# Cleaning Shapefiles of NYC’s Zip Codes

# Importing geopandas, assumed to be imported earlier in the code
import geopandas as gpd

# Loading the NYC Zip Codes shapefile into a GeoDataFrame using geopandas
# Shapefiles contain geographical information, here focusing on NYC Zip Codes
gdf = gpd.read_file('./data/nyc_zipcodes.shp')

# Keeping only the 'ZIPCODE' and 'geometry' columns
# 'ZIPCODE' for the postal code and 'geometry' for the geographical shape
gdf = gdf[['ZIPCODE', 'geometry']]

# Formatting 'ZIPCODE' column as a string and padding it with zeros for consistency
gdf['ZIPCODE'] = gdf['ZIPCODE'].astype(str).str.zfill(5)

# Setting the Coordinate Reference System (CRS) to WGS84 (epsg:4326)
# This step ensures that the geographic data uses a standard spatial reference
gdf = gdf.to_crs(epsg=4326)

# Saving the cleaned shapefile data for further use or analysis
gdf.to_file('./data/cleaned_nyc_zipcodes.shp')




In [40]:
#Cleaning Historical Monthly Average Rents by Zip Code from Zillow

# Load the Zillow Rent Data
rent_data_df = pd.read_csv(rent_data_path)
# Selecting the desired columns including region, city, countyName, and the latest rent data
# Assuming the last column is the latest rent data (you might want to verify this)
latest_rent_column = rent_data_df.columns[-1]
rent_data_df = rent_data_df[['RegionName', 'City', 'CountyName', '2023-08-31', latest_rent_column]]
# Renaming columns for clarity
rent_data_df.rename(columns={'RegionName': 'zipcode', 'City': 'city', 'CountyName': 'county_name', '2023-08-31': 'Auguest2023_rent_amount',latest_rent_column: 'rent_amount'}, inplace=True)
# Ensuring the ZIP code is a string and formatted correctly
rent_data_df['zipcode'] = rent_data_df['zipcode'].astype(str).str.zfill(5)


In [44]:
#Cleaning Historical Data from NYC Open Data on 311 Complaints
# Load the 311 Service Requests data into a DataFrame
service_requests_df = pd.read_csv(service_requests_path)

# Select specific columns: 'unique_key', 'created_date', 'complaint_type', and 'incident_zip'
service_requests_df = service_requests_df[['unique_key', 'created_date', 'complaint_type', 'incident_zip']]

# Convert 'created_date' to a datetime format for proper time-series analysis
service_requests_df['created_date'] = pd.to_datetime(service_requests_df['created_date'])

# Format 'incident_zip' as a string and pad with zeros to ensure consistent ZIP code formatting
service_requests_df['incident_zip'] = service_requests_df['incident_zip'].astype(str).str.split('.').str[0].str.zfill(5)

# Remove rows where 'incident_zip' is missing to maintain data integrity
service_requests_df.dropna(subset=['incident_zip'], inplace=True)


In [45]:
# Cleaning the 2015 Tree Census
# Load the 2015 Street Trees Census data into a DataFrame
trees_census_df = pd.read_csv(trees_census_path)

# Select specific columns: 'tree_id', 'spc_common' (species common name), 'health', 'zipcode', 'status', 'latitude', and 'longitude'
trees_census_df = trees_census_df[['tree_id', 'spc_common', 'health', 'zipcode', 'status', 'latitude','longitude']]

# Format 'zipcode' as a string and pad with zeros for consistent ZIP code formatting
trees_census_df['zipcode'] = trees_census_df['zipcode'].astype(str).str.zfill(5)

# Fill missing values in the 'health' column with 'Unknown' to maintain data integrity
trees_census_df['health'].fillna('Unknown', inplace=True)

In [46]:
# Create a GeoDataFrame with a 'geometry' column containing Point objects
geometry = [Point(lon, lat) for lon, lat in zip(trees_census_df['longitude'], trees_census_df['latitude'])]
trees_census_gdf = gpd.GeoDataFrame(trees_census_df, geometry=geometry, crs="EPSG:4326")

# Display the GeoDataFrame
trees_census_gdf.head()


Unnamed: 0,tree_id,spc_common,health,zipcode,status,latitude,longitude,geometry
0,180683,red maple,Fair,11375,Alive,40.723092,-73.844215,POINT (-73.84422 40.72309)
1,200540,pin oak,Fair,11357,Alive,40.794111,-73.818679,POINT (-73.81868 40.79411)
2,204026,honeylocust,Good,11211,Alive,40.717581,-73.936608,POINT (-73.93661 40.71758)
3,204337,honeylocust,Good,11211,Alive,40.713537,-73.934456,POINT (-73.93446 40.71354)
4,189565,American linden,Good,11215,Alive,40.666778,-73.975979,POINT (-73.97598 40.66678)


# Storing Data

In [11]:
from geoalchemy2 import Geometry, WKTElement
# Change dataframe column name to match with the schema
trees_census_gdf.rename(columns={'tree_id': 'id'}, inplace=True)
trees_census_gdf.rename(columns={'spc_common': 'specie'}, inplace=True)
trees_census_gdf.rename(columns={'zipcode': 'zip_code_id'}, inplace=True)
trees_census_gdf['geometry'] = trees_census_gdf['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))
trees_census_gdf.head()

  trees_census_gdf['geometry'] = trees_census_gdf['geometry'].apply(lambda geom: WKTElement(geom.wkt, srid=4326))


Unnamed: 0,id,specie,health,zip_code_id,status,latitude,longitude,geometry
0,180683,red maple,Fair,11375,Alive,40.723092,-73.844215,POINT (-73.84421522 40.72309177)
1,200540,pin oak,Fair,11357,Alive,40.794111,-73.818679,POINT (-73.81867946 40.79411067)
2,204026,honeylocust,Good,11211,Alive,40.717581,-73.936608,POINT (-73.9366077 40.71758074)
3,204337,honeylocust,Good,11211,Alive,40.713537,-73.934456,POINT (-73.93445616 40.71353749)
4,189565,American linden,Good,11215,Alive,40.666778,-73.975979,POINT (-73.97597938 40.66677776)


In [12]:
# Change dataframe column name to match with the schema
service_requests_df.rename(columns={'unique_key': 'id'}, inplace=True)
service_requests_df.rename(columns={'created_date': 'date_column'}, inplace=True)
service_requests_df.rename(columns={'incident_zip': 'zip_code_id'}, inplace=True)
service_requests_df = service_requests_df[service_requests_df['zip_code_id'] != '00nan']
service_requests_df.tail()

Unnamed: 0,id,date_column,complaint_type,zip_code_id
495,59683915,2023-12-09 00:05:23,Noise - Helicopter,10128
496,59682527,2023-12-09 00:05:17,Illegal Parking,11364
497,59685193,2023-12-09 00:05:16,Illegal Parking,11373
498,59676738,2023-12-09 00:05:07,Noise - Residential,10002
499,59684515,2023-12-09 00:05:01,Noise - Commercial,11222


In [13]:
# Change dataframe column name to match with the schema
rent_data_df.rename(columns={'Auguest2023_rent_amount': 'rent_amount_aug'}, inplace=True)
rent_data_df.rename(columns={'county_name': 'county'}, inplace=True)
rent_data_df.rename(columns={'zipcode': 'zip_code_id'}, inplace=True)
rent_data_df['id'] = range(1, len(rent_data_df) + 1)
rent_data_df = rent_data_df[['id','city', 'county', 'rent_amount_aug','rent_amount','zip_code_id']]
rent_data_df.head()

Unnamed: 0,id,city,county,rent_amount_aug,rent_amount,zip_code_id
0,1,Katy,Fort Bend County,2053.486247,2055.771355,77494
1,2,Katy,Harris County,1795.384582,1799.63114,77449
2,3,Houston,Harris County,1757.602011,1755.03149,77084
3,4,El Paso,El Paso County,1488.180414,1494.366097,79936
4,5,New York,Queens County,3064.476503,3079.585783,11385


In [14]:
zipcode_df = pd.DataFrame(rent_data_df["zip_code_id"])


# Array of values to be added
new_values = trees_census_gdf['zip_code_id']
new_values_2 = rent_data_df['zip_code_id']
new_values_3 = service_requests_df['zip_code_id']


In [15]:
# Convert the original column to a set to identify unique values
existing_values = set(zipcode_df['zip_code_id'])

# Filter new values to only include those not present in the original column
unique_new_values = [value for value in new_values if value not in existing_values]

# Add the unique values to the original column
zipcode_df = pd.concat([zipcode_df, pd.DataFrame({'zip_code_id': unique_new_values})], ignore_index=True)

In [16]:
# Convert the original column to a set to identify unique values
existing_values = set(zipcode_df['zip_code_id'])

# Filter new values to only include those not present in the original column
unique_new_values = [value for value in new_values_2 if value not in existing_values]

# Add the unique values to the original column
zipcode_df = pd.concat([zipcode_df, pd.DataFrame({'zip_code_id': unique_new_values})], ignore_index=True)

In [17]:
# Convert the original column to a set to identify unique values
existing_values = set(zipcode_df['zip_code_id'])

# Filter new values to only include those not present in the original column
unique_new_values = [value for value in new_values_3 if value not in existing_values]

# Add the unique values to the original column
zipcode_df = pd.concat([zipcode_df, pd.DataFrame({'zip_code_id': unique_new_values})], ignore_index=True)

In [18]:
zipcode_df.rename(columns={'zip_code_id': 'RegionID'}, inplace=True)

In [19]:
zipcode_df = zipcode_df[zipcode_df['RegionID'] != '00nan']
zipcode_df = pd.DataFrame(zipcode_df['RegionID'].unique())
zipcode_df.columns = ["id"]
# zipcode_df.columns = ["ZIPCODE"]
zipcode_df

Unnamed: 0,id
0,77494
1,77449
2,77084
3,79936
4,11385
...,...
6741,11366
6742,10310
6743,11423
6744,11419


In [20]:
import psycopg2

def drop_table(connection_params, table_name):
    try:
        # Establish a connection to the PostgreSQL database
        conn = psycopg2.connect(**connection_params)
        conn.autocommit = True  # Set autocommit to True for DDL statements

        # Create a cursor
        cursor = conn.cursor()

        # Drop the table
        query = f"DROP TABLE IF EXISTS {table_name} CASCADE;"
        cursor.execute(query)

        print(f"Table {table_name} and its columns dropped successfully.")

    except psycopg2.Error as e:
        print(f"Error: {e}")
    finally:
        # Close the cursor and connection
        if cursor:
            cursor.close()
        if conn:
            conn.close()

# Specify the table name to drop
table_to_drop = 'complaints'
table_to_drop_2 = 'trees'
table_to_drop_3 = 'zip_codes'
table_to_drop_4 = 'rents'

# Call the function to drop the table
drop_table(db_params, table_to_drop)
drop_table(db_params, table_to_drop_2)
drop_table(db_params, table_to_drop_3)
drop_table(db_params, table_to_drop_4)


NameError: name 'db_params' is not defined

In [21]:
#!createdb final_project_4501
#!psql --dbname final_project_4501 -c 'CREATE EXTENSION postgis;'

In [23]:
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, Float, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import CreateTable
from sqlalchemy.orm import relationship
from geoalchemy2 import Geometry
from sqlalchemy.ext.declarative import declarative_base



# Replace these variables with your actual database connection details
db_params = {
    'host': 'localhost',
    'database': 'molly',
    'user': 'molly', #ritajkx
    'password': 'none',
    'port': '5432',  # Typically 5432 for PostgreSQL
}

# Construct the connection string
conn_string = "postgresql://{user}:{password}@{host}:{port}/{database}".format(**db_params)

# Create an SQLAlchemy engine
engine = create_engine(conn_string)

Base = declarative_base()


class ZipCode(Base):
    __tablename__ = 'zip_codes'
    id = Column(Integer, primary_key=True)
#     geometry = Column(Geometry(geometry_type='POLYGON', srid=4326))
    # Relationships
    complaints = relationship("Complaint", backref="zip_code")
    trees = relationship("Tree", backref="zip_code")
    rents = relationship("Rent", backref="zip_code")

class Complaint(Base):
    __tablename__ = 'complaints'
    id = Column(Integer, primary_key=True)
    date_column = Column(Date)
    complaint_type = Column(String)
    zip_code_id = Column(Integer, ForeignKey('zip_codes.id'))

class Tree(Base):
    __tablename__ = 'trees'
    id = Column(Integer, primary_key=True)
    specie = Column(String)
    health = Column(String)
    status = Column(String)
    zip_code_id = Column(Integer, ForeignKey('zip_codes.id'))
    longitude = Column(Float)
    latitude = Column(Float)
    geometry = Column(Geometry(geometry_type='POINT', srid=4326))

class Rent(Base):
    __tablename__ = 'rents'
    id = Column(Integer, primary_key=True)
    city = Column(String)
    county = Column(String)
    rent_amount_aug = Column(Float)
    rent_amount = Column(Float)
    zip_code_id = Column(Integer, ForeignKey('zip_codes.id'))

    

# Generate schema.sql
with open('schema.sql', 'w') as f:
    for table in [ZipCode.__table__, Complaint.__table__, Tree.__table__, Rent.__table__]:
        f.write(str(CreateTable(table)))
        f.write(";\n")

# Create the tables in the database
Base.metadata.create_all(engine)

  Base = declarative_base()


In [24]:
# Define the table name
table_name_1 = 'zip_codes'
table_name_2 = 'trees'
table_name_3 = 'rents'
table_name_4 = 'complaints'

# Use the to_sql method to write the DataFrame to the SQL database
zipcode_df.to_sql(table_name_1, con=engine, index=False, if_exists='append')
# zipcode_gdf.to_sql(table_name_1, con=engine, index=False, if_exists='append', dtype={'geometry': Geometry('POLYGON', srid=4326)})

746

In [25]:
rent_data_df.to_sql(table_name_3, con=engine, index=False, if_exists='append') 

722

In [26]:
service_requests_df.to_sql(table_name_4, con=engine, index=False, if_exists='append')  

495

In [27]:
trees_census_gdf.to_sql(table_name_2, con=engine, index=False, if_exists='append', dtype={'geometry': Geometry('POINT', srid=4326)})  

500

In [28]:
gpd.read_postgis('trees', conn_string, geom_col='geometry')  

Unnamed: 0,id,specie,health,status,zip_code_id,longitude,latitude,geometry
0,180683,red maple,Fair,Alive,11375,-73.844215,40.723092,POINT (-73.84422 40.72309)
1,200540,pin oak,Fair,Alive,11357,-73.818679,40.794111,POINT (-73.81868 40.79411)
2,204026,honeylocust,Good,Alive,11211,-73.936608,40.717581,POINT (-73.93661 40.71758)
3,204337,honeylocust,Good,Alive,11211,-73.934456,40.713537,POINT (-73.93446 40.71354)
4,189565,American linden,Good,Alive,11215,-73.975979,40.666778,POINT (-73.97598 40.66678)
...,...,...,...,...,...,...,...,...
495,198323,sweetgum,Good,Alive,10312,-74.189236,40.540939,POINT (-74.18924 40.54094)
496,158851,sweetgum,Good,Alive,11211,-73.945449,40.717243,POINT (-73.94545 40.71724)
497,195970,sweetgum,Good,Alive,10028,-73.960957,40.777745,POINT (-73.96096 40.77775)
498,197287,Sophora,Good,Alive,10456,-73.906901,40.833278,POINT (-73.90690 40.83328)


In [29]:
pd.read_sql_table('zip_codes', conn_string) 

Unnamed: 0,id
0,77494
1,77449
2,77084
3,79936
4,11385
...,...
6741,11366
6742,10310
6743,11423
6744,11419


In [30]:
pd.read_sql_table('rents', conn_string) 

Unnamed: 0,id,city,county,rent_amount_aug,rent_amount,zip_code_id
0,1,Katy,Fort Bend County,2053.486247,2055.771355,77494
1,2,Katy,Harris County,1795.384582,1799.631140,77449
2,3,Houston,Harris County,1757.602011,1755.031490,77084
3,4,El Paso,El Paso County,1488.180414,1494.366097,79936
4,5,New York,Queens County,3064.476503,3079.585783,11385
...,...,...,...,...,...,...
6717,6718,Las Vegas,Clark County,3310.302151,3448.166667,89158
6718,6719,Panama City Beach,Walton County,2639.938102,2702.500000,32461
6719,6720,North Smithfield,Providence County,,2250.000000,2876
6720,6721,Arlington,Tarrant County,2383.185013,2313.944444,76005


In [31]:
pd.read_sql_table('complaints', conn_string) 

Unnamed: 0,id,date_column,complaint_type,zip_code_id
0,59682706,2023-12-09,Derelict Vehicles,11412
1,59681385,2023-12-09,Derelict Vehicles,11222
2,59683999,2023-12-09,Derelict Vehicles,11357
3,59681790,2023-12-09,Graffiti,10032
4,59684401,2023-12-09,Graffiti,11211
...,...,...,...,...
490,59683915,2023-12-09,Noise - Helicopter,10128
491,59682527,2023-12-09,Illegal Parking,11364
492,59685193,2023-12-09,Illegal Parking,11373
493,59676738,2023-12-09,Noise - Residential,10002


# Understanding Data

Query 1*: Which area might be more calm to live in?

Between October 1st, 2022 and September 30th, 2023 (inclusive), find the number of 311 complaints per zip code. 

In [32]:
def get_complaints_count_per_zip(engine):
    query = """
    SELECT zip_code_id, COUNT(*) AS complaint_count
    FROM complaints
    WHERE date_column::date = '2023-12-09'
    GROUP BY zip_code_id
    ORDER BY complaint_count DESC;
    """
    return pd.read_sql_query(query, engine)

complaints_per_zip_df = get_complaints_count_per_zip(engine)

print(complaints_per_zip_df)

     zip_code_id  complaint_count
0          10004               16
1          11414               13
2          10013               13
3          10025               13
4          10032               12
..           ...              ...
126        11419                1
127        11422                1
128        11423                1
129        11434                1
130        10001                1

[131 rows x 2 columns]


Query 2: Where has the most greenery?

Using just the trees table, which 10 zip codes have the most trees?

The query result should have two columns, 10 rows. The rows should be sorted by the total number of trees, descending.


In [33]:
def get_top_zipcodes_with_most_trees(engine):
    query = """
    SELECT zip_code_id, COUNT(*) AS tree_count
    FROM trees
    GROUP BY zip_code_id
    ORDER BY tree_count DESC
    LIMIT 10;
    """
    return pd.read_sql_query(query, engine)

top_zipcodes_with_most_trees_df = get_top_zipcodes_with_most_trees(engine)

print(top_zipcodes_with_most_trees_df)

   zip_code_id  tree_count
0        10023          26
1        11215          19
2        11205          19
3        11375          18
4        11105          17
5        10457          17
6        10306          15
7        10024          14
8        11226          13
9        11217          12


Query 3: Can I afford a place in the areas with the most trees?

Of the 10 zip codes with the most trees, for the month of August 2023,what is the average rent by zip code?

In [34]:
def get_average_rent_with_most_trees(engine):
    query = """
        WITH TopZipCodes AS (
            SELECT zip_code_id, COUNT(*) AS tree_count
            FROM trees
            GROUP BY zip_code_id
            ORDER BY tree_count DESC
            LIMIT 10
        )
        SELECT TopZipCodes.zip_code_id, 
               TO_CHAR(rents.rent_amount_aug, '9999.99') AS average_rent
        FROM TopZipCodes 
        JOIN rents  ON TopZipCodes.zip_code_id = rents.zip_code_id
        ORDER BY TopZipCodes.tree_count DESC
        LIMIT 10;
    """
    return pd.read_sql_query(query, engine)

get_average_rent_with_most_trees_df = get_average_rent_with_most_trees(engine)

print(get_average_rent_with_most_trees_df)

   zip_code_id average_rent
0        10023      4370.07
1        11215      3575.65
2        11205      3497.47
3        11375      2743.40
4        11105      2852.73
5        10457      2183.97
6        10306      2331.54
7        10024      3797.94
8        11226      2785.32
9        11217      4066.88


Query 4: Could there be a correlation between an area’s rent, the number of its trees, and the number of 311 complaints?


In [35]:
def get_five_best_and_five_worst(engine):
    query = """
    WITH RentStatistics AS (
        SELECT
            rents.zip_code_id,
            TO_CHAR(AVG(rents.rent_amount), '9999.99') AS avg_rent,
            COUNT(trees.id) AS tree_count,
            COUNT(complaints.id) AS complaint_count
        FROM rents 
        LEFT JOIN trees ON rents.zip_code_id = trees.zip_code_id
        LEFT JOIN complaints  ON rents.zip_code_id = complaints.zip_code_id
        WHERE EXTRACT(MONTH FROM complaints.date_column) = 12 AND EXTRACT(YEAR FROM complaints.date_column) = 2023
        GROUP BY rents.zip_code_id
    )
    SELECT
        zip_code_id,
        avg_rent,
        tree_count,
        complaint_count
    FROM (
        SELECT
            zip_code_id,
            avg_rent,
            tree_count,
            complaint_count,
            ROW_NUMBER() OVER (ORDER BY avg_rent ASC) AS low_rank,
            ROW_NUMBER() OVER (ORDER BY avg_rent DESC) AS high_rank
        FROM RentStatistics
    ) ranked
    WHERE low_rank <= 5 OR high_rank <= 5
    ORDER BY avg_rent ASC;
    """
    return pd.read_sql_query(query, engine)

get_five_best_and_five_worst_df = get_five_best_and_five_worst(engine)

print(get_five_best_and_five_worst_df)

   zip_code_id  avg_rent  tree_count  complaint_count
0        10458   1968.71         132              132
1        10462   1999.83           0                3
2        11204   2049.17          24               24
3        10453   2086.03          15               15
4        11214   2117.20          28               28
5        10014   4851.52          14               14
6        10011   4936.19          27               27
7        10001   5055.31           0                1
8        10013   5880.80          13               13
9        10282   7347.46           0                1



Query 5: Where has the most greenery (take 2)?
Rewrite Query 2 to use both the trees table and the zipcodes table. Join both tables where the coordinate point of the tree is inside the polygon boundary of the zipcode as defined in the zipcode table.
The query should have a JOIN statement. The query results should match exactly the results of Query 2.

Query 6: What is the immediate area like?
Using the following coordinate pair on campus, which trees are within 1⁄2 mile radius of this point?
Latitude: 40.80737875669467, Longitude: -73.96253174434912
The result should have 5 columns (ID, species, health, status, and coordinate location of each tree).

In [36]:
def immediate_area(engine):
    query = """
    SELECT
        trees.id AS ID,
        trees.specie AS species,
        trees.health,
        trees.status,
        ST_AsText(trees.geometry) AS coordinate_location
    FROM
        trees
    JOIN
        zip_codes  ON trees.zip_code_id = zip_codes.id
    WHERE
        ST_DWithin(
            trees.geometry,
            ST_SetSRID(ST_MakePoint(-73.96253174434912, 40.80737875669467), 4326),
            804.672  -- 1/2 mile in meters (1 mile = 1609.344 meters)
        );

    """
    return pd.read_sql_query(query, engine)

immediate_area_df = immediate_area(engine)

print(immediate_area_df)

         id           species   health status              coordinate_location
0    180683         red maple     Fair  Alive  POINT(-73.84421522 40.72309177)
1    200540           pin oak     Fair  Alive  POINT(-73.81867946 40.79411067)
2    204026       honeylocust     Good  Alive   POINT(-73.9366077 40.71758074)
3    204337       honeylocust     Good  Alive  POINT(-73.93445616 40.71353749)
4    203719  London planetree     Good  Alive  POINT(-73.91117077 40.78242823)
..      ...               ...      ...    ...                              ...
495  153586           Sophora     Good  Alive  POINT(-73.96683229 40.64360575)
496  171066         white oak     Good  Alive   POINT(-73.95835615 40.6493897)
497  171519         white oak     Good  Alive  POINT(-74.08651136 40.58283323)
498  176343              None  Unknown  Stump  POINT(-73.95830423 40.63787921)
499  198323          sweetgum     Good  Alive  POINT(-74.18923615 40.54093888)

[500 rows x 5 columns]


# Visualizing Data