In [6]:
import geopandas as gpd
from typing import List, Union, Set
import os
import requests
import pandas as pd
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")

In [None]:


output_folder = "data"
chunk_size = 102400
os.makedirs(output_folder, exist_ok=True)



def download(url, filename):
    # Send an HTTP GET request to the specified URL
    r = requests.get(url, stream=True)

    # Check if the request was successful (HTTP status code 200)
    if r.status_code == 200:
        # Open the file in write-binary mode
        with open(filename, 'ab') as f:
            # Define the chunk size (e.g., 1024 bytes)

            count = 0
            # Download the file in chunks
            for chunk in r.iter_content(chunk_size=chunk_size):
                # print(f"{filename} has download {count} data")
                f.write(chunk)
                # count+=chunk_size

        print("\nDownload complete.")
    else:
        print("Error: Failed to download the file.")

urls = ["https://data.cityofnewyork.us/api/views/erm2-nwe9/rows.csv?accessType=DOWNLOAD",
        "https://data.cityofnewyork.us/resource/5rq2-4hqu.csv",
        "https://data.cityofnewyork.us/resource/wz6d-d3jb.csv"]

filenames = ["data/311_Service_Requests.csv","data/2015 Street Tree Census.json","data/Bedbug Reporting.csv"]



for i in range(len(urls)):
    download(urls[i],filenames[i])

In [9]:

def read_shapefile(shapefile_path: str) -> gpd.GeoDataFrame:
    """
    Reads the shapefile into a GeoDataFrame.

    Args:
    - shapefile_path (str): Path to the shapefile.

    Returns:
    - gpd.GeoDataFrame: The GeoDataFrame read from the shapefile.
    """
    return gpd.read_file(shapefile_path)

def filter_columns(gdf: gpd.GeoDataFrame, columns: List[str]) -> gpd.GeoDataFrame:
    """
    Filters the GeoDataFrame to include only specified columns.

    Args:
    - gdf (gpd.GeoDataFrame): The original GeoDataFrame.
    - columns (List[str]): A list of column names to retain.

    Returns:
    - gpd.GeoDataFrame: The GeoDataFrame with only the specified columns.
    """
    return gdf[columns]

def remove_duplicates(gdf: gpd.GeoDataFrame, subset: str) -> gpd.GeoDataFrame:
    """
    Removes duplicate rows based on a specified subset of columns.

    Args:
    - gdf (gpd.GeoDataFrame): The GeoDataFrame to process.
    - subset (str): Column name to check for duplicates.

    Returns:
    - gpd.GeoDataFrame: The GeoDataFrame with duplicates removed.
    """
    return gdf.drop_duplicates(subset=[subset])

def filter_invalid_zipcodes(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Keeps only rows with valid 5-digit zipcodes.

    Args:
    - gdf (gpd.GeoDataFrame): The GeoDataFrame to process.

    Returns:
    - gpd.GeoDataFrame: The GeoDataFrame with only valid 5-digit zipcodes.
    """
    gdf['zipcode'] = gdf['zipcode'].astype(str)
    return gdf[gdf['zipcode'].str.isdigit() & (gdf['zipcode'].str.len() == 5)]

def process_zipcode_shapefile(shapefile_path: str) -> gpd.GeoDataFrame:
    """
    Cleans and prepares a zipcode shapefile for analysis.

    Args:
    - shapefile_path (str): Path to the zipcode shapefile.

    Returns:
    - gpd.GeoDataFrame: GeoDataFrame with processed zipcode data.
    """
    zipcode_gdf = read_shapefile(shapefile_path)
    essential_columns = ['ZIPCODE', 'geometry']
    zipcode_gdf = filter_columns(zipcode_gdf, essential_columns)
    zipcode_gdf = remove_duplicates(zipcode_gdf, 'ZIPCODE')
    zipcode_gdf.dropna(subset=essential_columns, inplace=True)
    zipcode_gdf.rename(columns={'ZIPCODE': 'zipcode'}, inplace=True)
    zipcode_gdf = filter_invalid_zipcodes(zipcode_gdf)
    common_crs = "EPSG:3857"
    zipcode_gdf.to_crs(common_crs, inplace=True)
    zipcode_gdf.columns = map(str.lower, zipcode_gdf.columns)

    return zipcode_gdf

def lat_validation(latitude):
    if not isinstance(latitude, (int, float)):
        raise TypeError("The latitude should be a float or int type")
    return -90 <= latitude <= 90


def long_validation(longitude: float) -> bool:
    if not isinstance(longitude, (int, float)):
        raise TypeError("The longitude should be a float or int type")
    return -180 <= longitude <= 180



In [10]:
geodf_zip_data = process_zipcode_shapefile("data/nyc_zipcodes/nyc_zipcodes.shp")
nyc_zips = geodf_zip_data['zipcode'].tolist()
nyc_zips = [float(element) for element in nyc_zips]

In [11]:

columns_needed = ['Unique Key', 'Created Date', 'Complaint Type', 'Incident Zip', 'Latitude', 'Longitude', 'Location']
def filter_t311(df: pd.DataFrame, column_needed: List[str], nyc_zip: Union[Set[str], List[str]]) -> gpd.GeoDataFrame:
    # Filter the DataFrame to only include necessary columns and drop rows with NaN values
    filtered = df[column_needed].dropna()

    # Further filter the DataFrame to only include rows where 'Incident Zip' is in nyc_zip
    filtered = filtered[filtered['Incident Zip'].isin(nyc_zip)]

    # Converting 'Created Date' to datetime
    filtered['Created Date'] = pd.to_datetime(filtered['Created Date'])

    # Define your date range
    start_date = pd.to_datetime('2015-01-01')
    end_date = pd.to_datetime('2023-09-30')

    # Filter the DataFrame for dates within the range
    filtered = filtered[(filtered['Created Date'] >= start_date) & (filtered['Created Date'] <= end_date)]

    # Apply latitude and longitude validation
    filtered = filtered[filtered['Latitude'].apply(lat_validation) & filtered['Longitude'].apply(long_validation)]

    # Convert to GeoDataFrame
    filtered = gpd.GeoDataFrame(filtered, geometry=gpd.points_from_xy(filtered['Longitude'], filtered['Latitude']))
    filtered.set_crs("EPSG:4326", inplace=True)
    filtered.to_crs("EPSG:3857", inplace=True)

    return filtered

In [12]:
## cehck
def filter_stc(df: pd.DataFrame, column_needed: List[str], nyc_zip: Set[str]) -> gpd.GeoDataFrame:
    # Filter the DataFrame to only include necessary columns and drop rows with NaN values
    filtered = df[columns_needed].dropna()

    # Further filter the DataFrame to only include rows where 'zipcode' is in nyc_zip
    filtered = filtered[filtered['zipcode'].isin(nyc_zip)]

    # Converting 'created_at' to datetime
    filtered['created_at'] = pd.to_datetime(filtered['created_at'])

    # Define your date range
    start_date = pd.to_datetime('01/01/2015')
    end_date = pd.to_datetime('09/30/2023')  # Corrected date

    # Filter the DataFrame for dates within the range
    filtered = filtered[(filtered['created_at'] >= start_date) & (filtered['created_at'] <= end_date)]

    # Apply latitude and longitude validation
    filtered = filtered[filtered['Latitude'].apply(lat_validation) & filtered['longitude'].apply(long_validation)]

    # Convert to GeoDataFrame
    filtered = gpd.GeoDataFrame(filtered, geometry=gpd.points_from_xy(filtered['longitude'], filtered['Latitude']))
    filtered.set_crs("EPSG:4326", inplace=True)
    filtered.to_crs("EPSG:3857", inplace=True)

    return filtered

In [13]:

def filter_zillow(df: pd.DataFrame, nyc_zip: list) -> pd.DataFrame:
    # Selecting the required columns. Assuming the first column is 'RegionName' and the 9th to last are dates
    useful_cols = df.columns[9:].to_list() + ['RegionName']+['RegionID']
    filtered = df[useful_cols]

    # Drop rows where 'RegionName' is NaN
    filtered = filtered.dropna(subset=['RegionName'])
    filtered = filtered.drop_duplicates()

    # Filter rows where 'RegionName' is in the list of NYC zip codes
    filtered = filtered[filtered['RegionName'].isin(nyc_zip)]
    

    # Melting the DataFrame
    melted_df = filtered.melt(id_vars=['RegionID','RegionName'], value_vars=df.columns[9:-2], var_name='date', value_name='rent')
    melted_df = melted_df.reset_index()
    return melted_df


In [14]:
a = pd.read_csv("data/zillow_rent_data.csv")
a

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2015-01-31,...,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31,2023-09-30
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,1606.206406,...,1994.653463,2027.438438,2042.237444,2049.325559,2016.531345,2023.438976,2031.558202,2046.144009,2053.486247,2055.771355
1,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,1257.814660,...,1749.697900,1738.217986,1747.305840,1758.407295,1758.891075,1762.980879,1771.751591,1779.338402,1795.384582,1799.631140
2,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,,...,1701.217520,1706.900064,1706.067787,1723.722320,1735.484670,1752.132904,1756.990323,1754.429516,1757.602011,1755.031490
3,93144,6,79936,zip,TX,TX,El Paso,"El Paso, TX",El Paso County,,...,1419.480272,1458.063897,1471.726681,1466.734658,1456.175660,1462.478506,1466.267391,1490.237063,1488.180414,1494.366097
4,62093,7,11385,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,,...,2935.808220,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6717,418163,30158,89158,zip,NV,NV,Las Vegas,"Las Vegas-Henderson-Paradise, NV",Clark County,,...,3281.330738,3509.210744,3407.499896,3438.041504,3436.371804,3524.703410,3426.708975,3412.249969,3310.302151,3448.166667
6718,72017,30490,32461,zip,FL,FL,Panama City Beach,"Crestview-Fort Walton Beach-Destin, FL",Walton County,,...,,,,,,,2583.675563,2590.977335,2639.938102,2702.500000
6719,58956,30490,2876,zip,RI,RI,North Smithfield,"Providence-Warwick, RI-MA",Providence County,,...,,,,,,,,,,2250.000000
6720,91179,30490,76005,zip,TX,TX,Arlington,"Dallas-Fort Worth-Arlington, TX",Tarrant County,,...,2148.224601,2169.143026,2179.393248,2226.624684,2369.532530,2374.713926,2414.638428,2389.749852,2383.185013,2313.944444


In [15]:
filter_zillow(a,nyc_zips)

Unnamed: 0,index,RegionID,RegionName,date,rent
0,0,62093,11385,2015-01-31,
1,1,62019,11208,2015-01-31,
2,2,62046,11236,2015-01-31,
3,3,61807,10467,2015-01-31,
4,4,62085,11373,2015-01-31,
...,...,...,...,...,...
15033,15033,61773,10282,2023-07-31,7611.834625
15034,15034,62010,11109,2023-07-31,4445.207586
15035,15035,61620,10006,2023-07-31,4035.676503
15036,15036,61723,10162,2023-07-31,


In [64]:
pd.read_csv("data/Bedbug_Reporting_20231203.csv")

Unnamed: 0,Building ID,Registration ID,Borough,House Number,Street Name,Postcode,# of Dwelling Units,Infested Dwelling Unit Count,Eradicated Unit Count,Re-infested Dwelling Unit Count,...,Filing Period Start Date,Filling Period End Date,Latitude,Longitude,Community Board,Council District,2010 Census Tract,BIN,BBL,NTA
0,344156,306009,BROOKLYN,161,NEWEL STREET,11222.0,6.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.727680,-73.949030,1.0,33.0,573.0,3065755.0,3.026230e+09,Greenpoint
1,170777,316161,BROOKLYN,1854,74 STREET,11204.0,3.0,0.0,0.0,0.0,...,11/01/2020,10/31/2021,40.613333,-73.994291,11.0,47.0,274.0,3159693.0,3.062160e+09,Bensonhurst
2,358122,357464,BROOKLYN,389,PUTNAM AVENUE,11216.0,3.0,0.0,0.0,0.0,...,11/01/2021,10/31/2022,40.684545,-73.945667,3.0,36.0,267.0,3051701.0,3.018240e+09,Bedford-Stuyvesant (West)
3,705843,403865,QUEENS,861,WOODWARD AVENUE,11385.0,4.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.702903,-73.902286,5.0,30.0,585.0,4083378.0,4.034830e+09,Ridgewood
4,877716,379778,BROOKLYN,912,MOTHER GASTON BOULEVARD,11212.0,3.0,0.0,0.0,0.0,...,11/01/2021,10/31/2022,40.656147,-73.903403,16.0,42.0,922.0,3388522.0,3.036390e+09,Brownsville
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369223,416021,430155,QUEENS,32-14,30 AVENUE,11102.0,8.0,0.0,0.0,0.0,...,11/01/2019,10/31/2020,40.766422,-73.920391,1.0,22.0,63.0,4008591.0,4.006168e+09,Astoria (Central)
369224,990475,921919,QUEENS,213-07A,HILLSIDE AVENUE,11427.0,2.0,0.0,0.0,0.0,...,11/01/2021,10/31/2022,40.725671,-73.752440,13.0,23.0,552.0,4451885.0,4.106810e+09,Queens Village
369225,418600,405984,QUEENS,199-08,32 AVENUE,11358.0,5.0,0.0,0.0,0.0,...,11/01/2020,10/31/2021,40.769538,-73.789856,11.0,19.0,1099.0,4134129.0,4.060260e+09,Auburndale
369226,470425,920278,QUEENS,30-54,73 STREET,11370.0,2.0,0.0,2.0,0.0,...,11/01/2020,10/31/2021,40.759274,-73.894576,3.0,22.0,30905.0,4024752.0,4.011210e+09,Jackson Heights


In [65]:
def filter_bedbug(df: pd.DataFrame, column_needed: List[str], nyc_zip: Set[str]) -> gpd.GeoDataFrame:
    # Ensure 'Postcode' and 'Filing Date' are in the needed columns
    if 'Postcode' not in column_needed or 'Filing Date' not in column_needed:
        raise ValueError("Required columns 'Postcode' and 'Filing Date' are missing.")

    # Selecting the required columns and drop rows with NaN values
    filtered = df[column_needed].dropna()

    # Further filter the DataFrame to only include rows where 'Postcode' is in nyc_zip
    filtered = filtered[filtered['Postcode'].isin(nyc_zip)]

    # Converting 'Filing Date' to datetime
    filtered['Filing Date'] = pd.to_datetime(filtered['Filing Date'])

    # Define your date range
    start_date = pd.to_datetime('01/01/2015')
    end_date = pd.to_datetime('09/30/2023')

    # Filter the DataFrame for dates within the range
    filtered = filtered[(filtered['Filing Date'] >= start_date) & (filtered['Filing Date'] <= end_date)]
    filtered = filtered.reset_index()
    # Convert to GeoDataFrame (assuming Latitude and Longitude columns are present)
    if 'Latitude' in filtered.columns and 'Longitude' in filtered.columns:
        gdf = gpd.GeoDataFrame(filtered, geometry=gpd.points_from_xy(filtered['Longitude'], filtered['Latitude']))

        # Set a CRS for the GeoDataFrame
        # Replace 'EPSG:3857' with the correct EPSG code to match your PostGIS table
        gdf.set_crs(epsg=3857, inplace=True)
        return gdf
    else:
        raise ValueError("Latitude and Longitude columns are required for GeoDataFrame conversion.")

# Example usage:

In [18]:


# Specify the chunk size
chunk_size = 100000

# Initialize lists to hold processed chunks
t311_chunks = []
stc_chunks = []
zillow_chunks = []
bedbug_chunks = []

# Process and store chunks for '311_Service_Requests'
for chunk in pd.read_csv('data/311_Service_Requests_from_2010_to_Present_20231129.csv', chunksize=chunk_size):
    columns_needed = ['Unique Key', 'Created Date', 'Complaint Type', 'Incident Zip', 'Latitude', 'Longitude', 'Location']
    processed_chunk = filter_t311(chunk, columns_needed, nyc_zip=nyc_zips)
    t311_chunks.append(processed_chunk)
geodf_311_data = pd.concat(t311_chunks)
# Process and store chunks for 'StreetTreesCensus_TREES'
for chunk in pd.read_csv('data/2015StreetTreesCensus_TREES.csv', chunksize=chunk_size):
    columns_needed = ['created_at', 'Latitude', 'longitude', 'tree_id', 'zipcode', 'health', 'spc_common']
    processed_chunk = filter_stc(chunk, columns_needed, nyc_zip=nyc_zips)
    stc_chunks.append(processed_chunk)
geodf_tree_data = pd.concat(stc_chunks)
# Process and store chunks for 'zillow_rent_data'
for chunk in pd.read_csv('data/zillow_rent_data.csv', chunksize=chunk_size):
    processed_chunk = filter_zillow(chunk, nyc_zip=nyc_zips)
    zillow_chunks.append(processed_chunk)
df_zillow_data = pd.concat(zillow_chunks)
# Process and store chunks for 'Bedbug_Reporting'
for chunk in pd.read_csv('data/Bedbug_Reporting_20231203.csv', chunksize=chunk_size):
    columns_needed = ['Building ID', 'Postcode', 'Filing Date', 'Eradicated Unit Count', 'Re-infested  Dwelling Unit Count','Latitude','Longitude']
    processed_chunk = filter_bedbug(chunk, columns_needed, nyc_zip=nyc_zips)
    bedbug_chunks.append(processed_chunk)
df_bedbug_data = pd.concat(bedbug_chunks)

# Load shapefile and save to the database
geodf_zipcode_data = gpd.read_file('data/nyc_zipcodes/nyc_zipcodes.shp')




In [66]:
bedbug_chunks = []
for chunk in pd.read_csv('data/Bedbug_Reporting_20231203.csv', chunksize=chunk_size):
    columns_needed = ['Building ID', 'Postcode', 'Filing Date', 'Eradicated Unit Count', 'Re-infested  Dwelling Unit Count','Latitude','Longitude']
    processed_chunk = filter_bedbug(chunk, columns_needed, nyc_zip=nyc_zips)
    bedbug_chunks.append(processed_chunk)
df_bedbug_data = pd.concat(bedbug_chunks)

In [67]:
df_bedbug_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 364683 entries, 0 to 68765
Data columns (total 9 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   index                             364683 non-null  int64         
 1   Building ID                       364683 non-null  int64         
 2   Postcode                          364683 non-null  float64       
 3   Filing Date                       364683 non-null  datetime64[ns]
 4   Eradicated Unit Count             364683 non-null  float64       
 5   Re-infested  Dwelling Unit Count  364683 non-null  float64       
 6   Latitude                          364683 non-null  float64       
 7   Longitude                         364683 non-null  float64       
 8   geometry                          364683 non-null  geometry      
dtypes: datetime64[ns](1), float64(5), geometry(1), int64(2)
memory usage: 27.8 MB


In [50]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   ZIPCODE     263 non-null    object  
 1   BLDGZIP     263 non-null    object  
 2   PO_NAME     263 non-null    object  
 3   POPULATION  263 non-null    float64 
 4   AREA        263 non-null    float64 
 5   STATE       263 non-null    object  
 6   COUNTY      263 non-null    object  
 7   ST_FIPS     263 non-null    object  
 8   CTY_FIPS    263 non-null    object  
 9   URL         263 non-null    object  
 10  SHAPE_AREA  263 non-null    float64 
 11  SHAPE_LEN   263 non-null    float64 
 12  geometry    263 non-null    geometry
dtypes: float64(4), geometry(1), object(8)
memory usage: 26.8+ KB


In [21]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

Unnamed: 0,ZIPCODE,BLDGZIP,PO_NAME,POPULATION,AREA,STATE,COUNTY,ST_FIPS,CTY_FIPS,URL,SHAPE_AREA,SHAPE_LEN,geometry
0,11436,0,Jamaica,18681.0,22699300.0,NY,Queens,36,81,http://www.usps.com/,0.0,0.0,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,0,Brooklyn,62426.0,29631000.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,0,Brooklyn,83866.0,41972100.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,0,Brooklyn,56527.0,23698630.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,0,Brooklyn,72280.0,36868800.0,NY,Kings,36,47,http://www.usps.com/,0.0,0.0,"POLYGON ((991997.113 176307.496, 992042.798 17..."


In [22]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 19286754 entries, 81945 to 34832754
Data columns (total 8 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Unique Key      int64         
 1   Created Date    datetime64[ns]
 2   Complaint Type  object        
 3   Incident Zip    object        
 4   Latitude        float64       
 5   Longitude       float64       
 6   Location        object        
 7   geometry        geometry      
dtypes: datetime64[ns](1), float64(2), geometry(1), int64(1), object(3)
memory usage: 1.3+ GB


In [23]:
geodf_311_data.head()

Unnamed: 0,Unique Key,Created Date,Complaint Type,Incident Zip,Latitude,Longitude,Location,geometry
81945,53024143,2022-01-11 15:14:02,Homeless Person Assistance,11211.0,40.714062,-73.952911,"(40.7140618829546, -73.9529114410158)",POINT (-8232400.444 4970256.903)
82084,53024195,2022-01-11 08:38:39,Homeless Person Assistance,10001.0,40.745384,-73.994709,"(40.7453835620819, -73.99470933975887)",POINT (-8237053.365 4974858.028)
82171,53024218,2022-01-11 12:54:38,Homeless Person Assistance,11416.0,40.687066,-73.847246,"(40.68706593065851, -73.84724582505751)",POINT (-8220637.802 4966292.960)
82243,53024232,2022-01-11 14:41:54,Homeless Person Assistance,10007.0,40.713813,-74.00557,"(40.7138133710393, -74.0055695158991)",POINT (-8238262.314 4970220.406)
82405,53024300,2022-01-11 11:57:55,Homeless Person Assistance,10003.0,40.73733,-73.992653,"(40.737330349666436, -73.9926531498436)",POINT (-8236824.471 4973674.812)


In [24]:
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 652167 entries, 0 to 683787
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   created_at  652167 non-null  datetime64[ns]
 1   Latitude    652167 non-null  float64       
 2   longitude   652167 non-null  float64       
 3   tree_id     652167 non-null  int64         
 4   zipcode     652167 non-null  int64         
 5   health      652167 non-null  object        
 6   spc_common  652167 non-null  object        
 7   geometry    652167 non-null  geometry      
dtypes: datetime64[ns](1), float64(2), geometry(1), int64(2), object(2)
memory usage: 44.8+ MB


In [25]:
geodf_tree_data.head()

Unnamed: 0,created_at,Latitude,longitude,tree_id,zipcode,health,spc_common,geometry
0,2015-08-27,40.723092,-73.844215,180683,11375,Fair,red maple,POINT (-8220300.436 4971583.163)
1,2015-09-03,40.794111,-73.818679,200540,11357,Fair,pin oak,POINT (-8217457.809 4982020.303)
2,2015-09-05,40.717581,-73.936608,204026,11211,Good,honeylocust,POINT (-8230585.520 4970773.712)
3,2015-09-05,40.713537,-73.934456,204337,11211,Good,honeylocust,POINT (-8230346.012 4970179.889)
4,2015-08-30,40.666778,-73.975979,189565,11215,Good,American linden,POINT (-8234968.356 4963315.009)


In [26]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15038 entries, 0 to 15037
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       15038 non-null  int64  
 1   RegionID    15038 non-null  int64  
 2   RegionName  15038 non-null  int64  
 3   date        15038 non-null  object 
 4   rent        8756 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 587.5+ KB


In [27]:
df_zillow_data.head()

Unnamed: 0,index,RegionID,RegionName,date,rent
0,0,62093,11385,2015-01-31,
1,1,62019,11208,2015-01-31,
2,2,62046,11236,2015-01-31,
3,3,61807,10467,2015-01-31,
4,4,62085,11373,2015-01-31,


## Part 2: Storing Data

In [28]:
# this code is not be able to use 
!createdb e4501project
!psql --dbname e4501project -c 'CREATE EXTENSION if NOT EXISTS postgis;'

^C
^C


### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [29]:
# Define your database connection parameters
db_connection_string = "postgresql://postgres:1234@localhost:5432/e4501project"
engine = create_engine(db_connection_string)

In [68]:
ZIPCODE_SCHEMA = """
CREATE TABLE IF NOT EXISTS nyc_shape (
  "zipcode" float8 PRIMARY KEY,
  "geometry" geometry(POLYGON, 3857)
);
"""

NYC_311_SCHEMA = """
CREATE TABLE IF NOT EXISTS t311 (
    "Unique Key" int8 PRIMARY KEY,
    "Created Date" timestamp(6),
    "Complaint Type" text COLLATE "pg_catalog"."default",
    "Incident Zip" float8,
    "Latitude" float8,
    "Longitude" float8,
    "Location" text COLLATE "pg_catalog"."default",
    "geometry" geometry(POINT, 3857)
);
"""

NYC_TREE_SCHEMA = """
CREATE TABLE IF NOT EXISTS stc (
    "created_at" timestamp(6),
    "Latitude" float8,
    "longitude" float8,
    "tree_id" int8 PRIMARY KEY,
    "zipcode" int8,
    "health" text COLLATE "pg_catalog"."default",
    "spc_common" text COLLATE "pg_catalog"."default",
    "geometry" geometry(POINT, 3857)
);
"""

ZILLOW_SCHEMA = """
CREATE TABLE IF NOT EXISTS zillow (
  "index" int8 PRIMARY KEY,
  "RegionID" int8,
  "RegionName" int8,
  "date" DATE,
  "rent" float8
)
;
"""

BEDBUG_SCHEMA = '''
CREATE TABLE IF NOT EXISTS Bedbug (
  "index" int   PRIMARY KEY,
  "Building ID" int8,
  "Postcode" float8,
  "Filing Date" timestamp(6),
  "Eradicated Unit Count" float8,
  "Re-infested  Dwelling Unit Count" float8,
  "Latitude" float8,
  "Longitude" float8,
  "geometry" geometry(POINT,3857)
);
'''

In [69]:
# create that required schema.sql file
with open('schema.sql', "w") as f:
    f.write(ZIPCODE_SCHEMA)
    f.write(NYC_311_SCHEMA)
    f.write(NYC_TREE_SCHEMA)
    f.write(ZILLOW_SCHEMA)
    f.write(BEDBUG_SCHEMA)

In [70]:
# Read the SQL schema file
schema_file_path= "schema.sql"
with open(schema_file_path, 'r') as file:
    schema_sql = file.read()

# Execute the SQL schema
with engine.connect() as connection:
    # It's often a good idea to execute commands within a transaction
    with connection.begin() as transaction:
        try:
            connection.execute(schema_sql)
            # Commit the transaction if all commands execute successfully
            transaction.commit()
        except Exception as e:
            # Rollback the transaction in case of an error
            transaction.rollback()
            # Optionally, re-raise the exception or handle it as needed
            raise e

In [59]:
df_zillow_data

Unnamed: 0,index,RegionID,RegionName,date,rent
0,0,62093,11385,2015-01-31,
1,1,62019,11208,2015-01-31,
2,2,62046,11236,2015-01-31,
3,3,61807,10467,2015-01-31,
4,4,62085,11373,2015-01-31,
...,...,...,...,...,...
15033,15033,61773,10282,2023-07-31,7611.834625
15034,15034,62010,11109,2023-07-31,4445.207586
15035,15035,61620,10006,2023-07-31,4035.676503
15036,15036,61723,10162,2023-07-31,


In [60]:
geodf_311_data.to_postgis('t311', engine, if_exists='append', index=False)
geodf_tree_data.to_postgis('stc', engine, if_exists='append', index=False)
df_zillow_data.to_sql('zillow', engine, if_exists='append', index=False)
df_bedbug_data.to_postgis('bedbug', engine, if_exists='append', index=False)
geodf_zipcode_data.to_postgis('nyc_shape', engine, if_exists='replace', index=False)

KeyboardInterrupt: 

In [71]:
df_bedbug_data.to_postgis('bedbug', engine, if_exists='append', index=False)
