# Part0 Libraries & Setup

In [12]:
import requests
import pandas as pd
import geopandas as gpd
import json
from datetime import datetime, timedelta
import os
import glob
import warnings
import psycopg2
import sqlalchemy as db
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from shapely.geometry import Point
import contextily as ctx
from geoalchemy2 import Geometry
from pathlib import Path
import ipywidgets as widgets
from ipywidgets import interact
import unittest
import numpy as np
from unittest.mock import patch
# warnings
warnings.filterwarnings('ignore')


# Part1: Data Processing

## 1.1 Downloading 311 and Tree data from API
- Download 311 and Tree data using API and Python Code
- The 311 dataset is very large, so we download it separately into one file per year and merge the files at the end

In [None]:
#1. Function (1) Manually Download Data from NYC Open Data
def download_data(url, app_token, filename, date_field, start_date, end_date, date_format="%Y-%m-%dT%H:%M:%S", limit=10000):
    """
    Download a date-filtered CSV export from an NYC Open Data endpoint, page by page.

    The first page is written to `filename` (header included); every later page is
    appended with its header row stripped. Paging stops when a page comes back
    with fewer lines than a full page, or when the server answers with a
    non-200 status (a message is printed and whatever was written so far is kept).

    Args:
        url (str): The API endpoint for the dataset (CSV resource URL).
        app_token (str): Application token for authenticated access.
        filename (str): The name of the file where the data will be saved.
        date_field (str): The name of the date field in the dataset.
        start_date (datetime): The start date for filtering data (inclusive).
        end_date (datetime): The end date for filtering data (inclusive).
        date_format (str): Format used to render both dates in the query,
            defaults to '%Y-%m-%dT%H:%M:%S'.
        limit (int): Number of records to retrieve per request, defaults to 10000.

    Returns:
        None: This function writes the downloaded data to a file and does not return anything.
    """
    offset = 0
    start_date_str = start_date.strftime(date_format)  # Format the start date
    end_date_str = end_date.strftime(date_format)  # Format the end date

    # Let requests build and URL-encode the query string instead of
    # concatenating it by hand.
    params = {
        "$$app_token": app_token,
        "$where": f"{date_field} between '{start_date_str}' and '{end_date_str}'",
        # Without an explicit order the row order may change between requests,
        # so $limit/$offset pages could overlap or skip rows.
        "$order": ":id",
        "$limit": limit,
        "$offset": offset,
    }

    first_batch = True  # Flag to identify the first batch of data
    while True:
        params["$offset"] = offset
        response = requests.get(url, params=params)  # Perform the API request

        if response.status_code != 200:
            print(f"Failed to download data at offset {offset}: Status code {response.status_code}")
            break

        data = response.text
        # Line count of the payload: one header line plus one line per record
        # (records whose fields contain line breaks count more than once).
        records_retrieved = data.count('\n')

        if first_batch and records_retrieved > 0:
            # First page: keep the header. Explicit encoding/newline so the
            # bytes written do not depend on the platform defaults.
            with open(filename, 'w', encoding='utf-8', newline='') as file:
                file.write(data)
            first_batch = False
        elif records_retrieved > 1:
            # Later pages: drop the repeated header row before appending.
            with open(filename, 'a', encoding='utf-8', newline='') as file:
                file.write(data.split('\n', 1)[1])

        # A full page has `limit` records plus the header line; anything
        # shorter means the last page has been reached.
        if records_retrieved < limit + 1:
            break
        offset += limit  # Increment the offset for the next batch


In [None]:
# App token: application token used to authenticate against NYC Open Data.
# Prefer supplying it through the NYC_OPEN_DATA_APP_TOKEN environment variable;
# the literal below is only kept as a fallback so existing runs keep working.
# NOTE(review): a token committed to the notebook is visible to anyone with
# access to the repository - consider rotating it and dropping the fallback.
app_token = os.environ.get('NYC_OPEN_DATA_APP_TOKEN', 'Z8lDMDpdnonlT1RjM5YGII6Ii')
# Data URLs: CSV API endpoints of the two datasets downloaded below
url_311 = 'https://data.cityofnewyork.us/resource/erm2-nwe9.csv'
url_trees = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.csv'

In [None]:
# Download tree data: fetch the tree dataset into data/tree_data.csv,
# filtered on `created_at` between 01/01/2015 and 12/31/2015.
# date format "%m/%d/%Y": the two bounds are rendered as month/day/year.
# NOTE(review): if `created_at` is stored as text on the server (the column
# table in section 1.2.1 lists it as Plain Text), `between` compares strings
# rather than dates - confirm the filter selects the intended rows.
download_data(
    url=url_trees,
    app_token=app_token,
    filename="data/tree_data.csv",
    date_field="created_at",
    start_date=datetime(2015, 1, 1),
    end_date=datetime(2015, 12, 31),
    date_format="%m/%d/%Y",
    limit=10000
)

In [None]:
# Download 311 data from 2015-01-01 to 2023-09-30.
# The per-year CSV files are stored in their own folder, data/311_data.
subfolder_name = "311_data"
subfolder_path = os.path.join("data", subfolder_name)
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create gap of testing os.path.exists first.
os.makedirs(subfolder_path, exist_ok=True)

In [None]:
# Run the download function once per year.
# Year 2015: all 311 requests created during calendar year 2015.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2015.csv",
    date_field="created_date",
    start_date=datetime(2015, 1, 1),  # 2015-01-01 00:00:00
    end_date=datetime(2015, 12, 31, 23, 59, 59),  # 2015-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Unit Test (1): smoke check that the 2015 download above produced its CSV.
# It only checks that the path exists; the file content is not inspected.
# Path to the file you expect to exist
# (note: this rebinds the notebook-level name `filename`)
filename = "data/311_data/311_data_2015.csv"
# Assert that the file exists
assert os.path.exists(filename), "File does not exist"

In [None]:
# Year 2016: all 311 requests created during calendar year 2016.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2016.csv",
    date_field="created_date",
    start_date=datetime(2016, 1, 1),  # 2016-01-01 00:00:00
    end_date=datetime(2016, 12, 31, 23, 59, 59),  # 2016-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2017: all 311 requests created during calendar year 2017.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2017.csv",
    date_field="created_date",
    start_date=datetime(2017, 1, 1),  # 2017-01-01 00:00:00
    end_date=datetime(2017, 12, 31, 23, 59, 59),  # 2017-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2018: all 311 requests created during calendar year 2018.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2018.csv",
    date_field="created_date",
    start_date=datetime(2018, 1, 1),  # 2018-01-01 00:00:00
    end_date=datetime(2018, 12, 31, 23, 59, 59),  # 2018-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2019: all 311 requests created during calendar year 2019.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2019.csv",
    date_field="created_date",
    start_date=datetime(2019, 1, 1),  # 2019-01-01 00:00:00
    end_date=datetime(2019, 12, 31, 23, 59, 59),  # 2019-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2020: all 311 requests created during calendar year 2020.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2020.csv",
    date_field="created_date",
    start_date=datetime(2020, 1, 1),  # 2020-01-01 00:00:00
    end_date=datetime(2020, 12, 31, 23, 59, 59),  # 2020-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2021: all 311 requests created during calendar year 2021.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2021.csv",
    date_field="created_date",
    start_date=datetime(2021, 1, 1),  # 2021-01-01 00:00:00
    end_date=datetime(2021, 12, 31, 23, 59, 59),  # 2021-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2022: all 311 requests created during calendar year 2022.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2022.csv",
    date_field="created_date",
    start_date=datetime(2022, 1, 1),  # 2022-01-01 00:00:00
    end_date=datetime(2022, 12, 31, 23, 59, 59),  # 2022-12-31 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

In [None]:
# Year 2023: partial year, 311 requests created from January 1 through September 30.
download_data(
    url=url_311,
    app_token=app_token,
    filename="data/311_data/311_data_2023.csv",
    date_field="created_date",
    start_date=datetime(2023, 1, 1),  # 2023-01-01 00:00:00
    end_date=datetime(2023, 9, 30, 23, 59, 59),  # 2023-09-30 23:59:59
    date_format="%Y-%m-%dT%H:%M:%S",
    limit=50000,
)

## 1.2 Cleaning & Filtering

### 1.2.1 Data Description & Columns to keep

Prior to developing data cleaning functions, we conducted a thorough examination of each dataset's types and descriptions, as well as the query and visualization requirements of our project. The following list details the columns we have chosen to retain. Each column is accompanied by its description and original data type.

##### *Zillow Rent Data Description

| Column Name                  | Description                                                                          | Type        |
|------------------------------|--------------------------------------------------------------------------------------|-------------|
| RegionID                     | Used for pandas, an identifier for the region                                         | Integer     |
| RegionName                   | Same as postcode, matches 'Incident Zip' in other datasets, link with latitude and longitude | Integer     |
| City                         | Different cities, for later filtering to New York                                     | Object      |
| Average Housing Price Columns| Keep all columns related to average housing prices for each region                   | Float64     |


##### *311 Data
https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9

311 Service Requests from 2010 to Present
This dataset comprises all 311 Service Requests from 2010 to the present day, updated daily. It provides a comprehensive overview of non-emergency requests and complaints to New York City's 311 service.


| Column Name    | Description                                                              | Type        |
|----------------|--------------------------------------------------------------------------|-------------|
| unique_key     | Unique identifier of a Service Request (SR) in the open data set         | Plain Text  |
| created_date   | Date SR was created                                                      | Date & Time |
| complaint_type | First level of a hierarchy identifying the topic of the incident or condition | Plain Text  |
| incident_zip   | Incident location zip code, provided by geo validation                   | Plain Text  |
| latitude       | Geo based Latitude of the incident location                              | Number      |
| longitude      | Geo based Longitude of the incident location                             | Number      |




##### *Tree Data Description
https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/uvpi-gqnh


2015 Street Tree Census, conducted by volunteers and staff organized by NYC Parks & Recreation and partner organizations. Tree data collected includes tree species, diameter and perception of health. Accompanying blockface data is available indicating status of data collection and data release citywide.


| Column Name   | Description                                                                   | Type       |
|---------------|-------------------------------------------------------------------------------|------------|
| tree_id       | Unique identification number for each tree point                              | Integer     |
| status        | Indicates whether the tree is alive, standing dead, or a stump               | Plain Text |
| created_at  | Date and time when the tree data was created                                  | Plain Text |
| zipcode       | Five-digit zipcode in which tree is located                                  | Integer |
| latitude      | Latitude of point, in decimal degrees                                        | Number     |
| longitude     | Longitude of point, in decimal degrees                                       | Number     |
| health        | Indicates the user's perception of tree health                               | Plain Text |
| spc_common    | Common name for species, e.g., "red maple"                                   | Plain Text |


#### *Zipcode Data Description
| Column Name | Description                                | Type    |
|-------------|--------------------------------------------|---------|
| ZIPCODE     | The postal code corresponding to the area  | Plain Text |
| geometry    | Geometrical data representing the area     | Geometry |

#### 1.2.2.1 Functions aiding data cleaning

In [4]:
# Function (2) create a remove_column function for deleting the unnecessary columns

def remove_column(df, keep_columns, include_date_columns=False, date_pattern=r'\d{4}-\d{2}-\d{2}'):
    """
    Return `df` restricted to the requested columns, optionally plus date-named columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame to select from. It is not modified.
    - keep_columns (list of str): Column names to retain. The list is not modified.
    - include_date_columns (bool, optional): When True, also keep every column whose
      *name* matches `date_pattern`. Defaults to False.
    - date_pattern (str, optional): Regular expression tested against the column names
      (not the column values). The default matches names containing 'YYYY-MM-DD'.

    Returns:
    - pd.DataFrame: A DataFrame holding `keep_columns` first, followed by the matching
      date columns in their original order.

    Raises:
    - KeyError: If a name in `keep_columns` is not a column of `df`.

    Example:
    >>> df = pd.DataFrame(...)
    >>> new_df = remove_column(df, ['column1', 'column2'], include_date_columns=True)
    """
    # Work on a copy: extending the caller's list in place would make the
    # date columns leak into every later use of that list.
    all_columns_to_keep = list(keep_columns)

    if include_date_columns:
        # na=False treats non-string column labels as "not a date column"
        # instead of producing an unusable mask.
        is_date_name = df.columns.str.contains(date_pattern, na=False)
        already_kept = set(all_columns_to_keep)
        # Skip date columns that were requested explicitly so they are not
        # selected twice.
        all_columns_to_keep += [
            name for name in df.columns[is_date_name].tolist() if name not in already_kept
        ]

    # Return the DataFrame with only the specified columns retained
    return df.loc[:, all_columns_to_keep]

In [5]:
# Unit test (2): remove_column keeps the requested columns and drops the rest.
# Create a sample DataFrame
df = pd.DataFrame({
    'column1': [1, 2, 3],
    'column2': [4, 5, 6],
    'date_column': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
    'irrelevant_column': ['x', 'y', 'z']
})

# Run the function with a test case
keep_columns = ['column1', 'column2']
new_df = remove_column(df, keep_columns, include_date_columns=True)

# Test assertions
assert 'column1' in new_df.columns, "column1 is not in the DataFrame"
assert 'column2' in new_df.columns, "column2 is not in the DataFrame"
# remove_column matches the date pattern against column *names*; the name
# 'date_column' does not look like YYYY-MM-DD, so the column is dropped even
# though its values are dates.
assert 'date_column' not in new_df.columns, "date_column should not be retained in the DataFrame based on the current function implementation"
assert 'irrelevant_column' not in new_df.columns, "irrelevant_column should have been removed from the DataFrame"
assert new_df.shape[1] == 2, "The number of columns in the DataFrame should be two"


In [6]:
# Function (3),(4) create functions to check valid longitude and latitude
def is_valid_latitude(lat):
    """
    Checks if the provided latitude value is valid.

    Parameters:
    - lat (str or number): The latitude value to be checked.

    Returns:
    - bool: True if the latitude is valid, False otherwise.

    A valid latitude is a number between -90 and 90 (both inclusive).
    Values that cannot be converted to a float (non-numeric text, None,
    containers) and NaN all return False.
    """
    try:
        # Check if latitude is within the valid range
        return -90 <= float(lat) <= 90
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None and other
        # objects float() cannot convert at all.
        return False

def is_valid_longitude(lon):
    """
    Checks if the provided longitude value is valid.

    Parameters:
    - lon (str or number): The longitude value to be checked.

    Returns:
    - bool: True if the longitude is valid, False otherwise.

    A valid longitude is a number between -180 and 180 (both inclusive).
    Values that cannot be converted to a float (non-numeric text, None,
    containers) and NaN all return False.
    """
    try:
        # Check if longitude is within the valid range
        return -180 <= float(lon) <= 180
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None and other
        # objects float() cannot convert at all.
        return False

In [7]:
# Unit Test (3) cases for is_valid_latitude
# In-range value and both inclusive boundaries (-90 and 90)
assert is_valid_latitude("45"), "45 should be a valid latitude"
assert is_valid_latitude("-90"), "-90 should be a valid latitude"
assert is_valid_latitude("90"), "90 should be a valid latitude"
# Out-of-range values on either side
assert not is_valid_latitude("100"), "100 should not be a valid latitude"
assert not is_valid_latitude("-91"), "-91 should not be a valid latitude"
# Non-numeric text
assert not is_valid_latitude("not_a_number"), "'not_a_number' should not be a valid latitude"

In [8]:
# Unit Test (4) cases for is_valid_longitude
# In-range value and both inclusive boundaries (-180 and 180)
assert is_valid_longitude("90"), "90 should be a valid longitude"
assert is_valid_longitude("-180"), "-180 should be a valid longitude"
assert is_valid_longitude("180"), "180 should be a valid longitude"
# Out-of-range values on either side
assert not is_valid_longitude("190"), "190 should not be a valid longitude"
assert not is_valid_longitude("-181"), "-181 should not be a valid longitude"
# Non-numeric text
assert not is_valid_longitude("not_a_number"), "'not_a_number' should not be a valid longitude"

#### 1.2.2.2 Functions performing data cleaning and filtering for each datafile

In [27]:
# 1. zipcode
# Function (5) Create a clean and filter function for zipcode data
def process_zipcode_shapefile(shapefile_path):
    """
    Load the zipcode shapefile and return it cleaned for analysis.

    Args:
    - shapefile_path (str): Path to the zipcode shapefile.

    Returns:
    - gpd.GeoDataFrame: Columns 'zipcode' (integer) and 'geometry', one row per
      zipcode, reprojected to EPSG:3857.

    Steps: keep only ZIPCODE and geometry, drop duplicate zipcodes, drop rows
    missing either value, rename ZIPCODE to 'zipcode' and cast it to int, keep
    only zipcodes whose integer form has exactly five digits, reproject to the
    web mercator CRS, and lowercase all column names.
    """
    def _is_five_digit(value):
        # Evaluated on the integer form, so leading zeros are already gone
        text = str(value)
        return text.isdigit() and len(text) == 5

    # Read the shapefile and keep only the two columns needed downstream
    raw = gpd.read_file(shapefile_path)
    zips = raw[['ZIPCODE', 'geometry']]

    # One row per zipcode, with both the code and its geometry present
    zips = zips.drop_duplicates(subset=['ZIPCODE'])
    zips = zips.dropna(subset=['ZIPCODE', 'geometry'])

    # Consistent naming across datasets, integer zipcodes
    zips = zips.rename(columns={'ZIPCODE': 'zipcode'})
    zips['zipcode'] = zips['zipcode'].astype(int)

    # Keep only well-formed five-digit zipcodes
    zips = zips[zips['zipcode'].apply(_is_five_digit)]

    # Web mercator projection for mapping compatibility
    zips = zips.to_crs("EPSG:3857")

    # Ensure all column names are lowercase
    zips.columns = [name.lower() for name in zips.columns]

    return zips

In [28]:
# 2.1 311 Data
# Function (6)
def clean_filter_311(filename, nyc_zipcodes):
    """
    Cleans and filters 311 complaint data to retain only relevant information and valid entries.

    Args:
    - filename (str): Path to the CSV file containing 311 data.
    - nyc_zipcodes (list of int): List of valid NYC zip codes for filtering.

    Returns:
    - gpd.GeoDataFrame: Columns complaint_id (int), date ('YYYY-MM-DD' text),
      complaint_type, zipcode (int), latitude, longitude and a point geometry
      in EPSG:3857.

    Rows are dropped when the key is duplicated or not all digits, when zip,
    coordinates or date are missing, when the coordinates are out of range,
    when the zip is not numeric or not in `nyc_zipcodes`, or when any column
    is null after the conversions.
    """
    # Step 0: Load data
    df = pd.read_csv(filename)

    # Step 1: Select necessary columns
    columns_to_keep_311 = ['unique_key', 'created_date', 'complaint_type', 'incident_zip', 'latitude', 'longitude']
    df = df[columns_to_keep_311]

    # Step 2: Remove invalid and duplicate data
    # Remove duplicates and confirm unique_key
    df = df.drop_duplicates(subset='unique_key')
    df = df[df['unique_key'].notna() & df['unique_key'].apply(lambda x: str(x).isdigit())]

    # Remove rows with NaN in critical columns
    df = df.dropna(subset=['incident_zip', 'latitude', 'longitude', 'created_date'])

    # Check if latitude and longitude are valid.
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the previous one.
    df = df[df['latitude'].apply(is_valid_latitude) & df['longitude'].apply(is_valid_longitude)].copy()

    # Step 3: Normalize Column Names, Column Types

    # Standardize 'incident_zip' to integer and filter by NYC zipcodes.
    # Coerce first: a plain astype(int) aborts the whole file on a single
    # non-numeric zip value, whereas such rows should simply be discarded.
    df['incident_zip'] = pd.to_numeric(df['incident_zip'], errors='coerce')
    df = df.dropna(subset=['incident_zip'])
    df['incident_zip'] = df['incident_zip'].astype(int)
    df = df[df['incident_zip'].isin(nyc_zipcodes)].copy()

    # Rename and reformat columns for consistency
    df = df.rename(columns={'unique_key': 'complaint_id', 'created_date': 'date', 'incident_zip': 'zipcode'})
    df['complaint_id'] = df['complaint_id'].astype(int)

    # Parse 'date' and keep only the calendar day as 'YYYY-MM-DD' text;
    # unparseable values become null and are removed by the dropna below.
    df['date'] = pd.to_datetime(df['date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # Drop rows with null values post transformations
    df = df.dropna()

    # Step 4: Normalize the SRID of any geometry

    # Convert DataFrame to GeoDataFrame: points are built from lon/lat
    # (EPSG:4326) and then reprojected to web mercator (EPSG:3857).
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude']))
    gdf.set_crs("EPSG:4326", inplace=True)
    gdf.to_crs("EPSG:3857", inplace=True)

    return gdf


In [29]:
# 2.2 Function (7) to clean and merge 311 data
def clean_merge_311_data(folder_path, file_prefix, years, nyc_zipcodes):
    """
    Clean each yearly 311 file and stack the results into one table.

    Args:
    - folder_path (str): Path to the folder containing 311 data files.
    - file_prefix (str): Prefix of the 311 data filenames.
    - years (list): Years to process; each one is interpolated into the
      filename as "<folder_path>/<file_prefix><year>.csv".
    - nyc_zipcodes (list of int): List of valid NYC zip codes for filtering.

    Returns:
    - The row-wise concatenation (fresh 0..n-1 index) of what 'clean_filter_311'
      returns for each year, in the order given by `years`.
    """
    # One cleaned frame per year, in the order the years were given
    yearly_frames = [
        clean_filter_311(f"{folder_path}/{file_prefix}{year}.csv", nyc_zipcodes)
        for year in years
    ]

    # Stack them and renumber the rows
    return pd.concat(yearly_frames, ignore_index=True)

In [30]:
# 3. Function (8) to clean and filter tree data
def clean_filter_tree(filename, nyc_zipcodes):
    """
    Cleans and filters tree data, retaining only the relevant information and valid entries.

    Args:
    - filename (str): Path to the CSV file containing tree data.
    - nyc_zipcodes (list of int): List of valid NYC zip codes for filtering.

    Returns:
    - gpd.GeoDataFrame: Columns tree_id, status, zipcode (int), latitude, longitude,
      health, species, date ('YYYY-MM-DD' text) and a point geometry in EPSG:3857.

    Rows are dropped when the tree id is duplicated or not all digits, when
    zipcode, coordinates or creation date are missing, when the zipcode is not
    numeric or not in `nyc_zipcodes`, or when the coordinates are out of range.
    Nulls in the other columns (status, health, species) are left in place.
    """
    # Load data
    df = pd.read_csv(filename)

    # Select the necessary columns for analysis
    columns_to_keep_tree = ['tree_id', 'status', 'zipcode', 'latitude', 'longitude', 'health', 'spc_common', 'created_at']
    df = df[columns_to_keep_tree]

    # Remove duplicate entries and ensure tree IDs are valid (tree id as uid)
    df = df.drop_duplicates(subset='tree_id')
    df = df[df['tree_id'].notna() & df['tree_id'].apply(lambda x: str(x).isdigit())]

    # Drop rows with missing values in critical columns before type conversions.
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the previous one.
    df = df.dropna(subset=['zipcode', 'latitude', 'longitude', 'created_at']).copy()

    # Standardize 'zipcode' to integer and filter by NYC zipcodes.
    # Coerce first: a plain astype(int) aborts the whole file on a single
    # non-numeric zip value, whereas such rows should simply be discarded.
    df['zipcode'] = pd.to_numeric(df['zipcode'], errors='coerce')
    df = df.dropna(subset=['zipcode'])
    df['zipcode'] = df['zipcode'].astype(int)
    df = df[df['zipcode'].isin(nyc_zipcodes)]

    # Validate latitude and longitude values
    df = df[df['latitude'].apply(is_valid_latitude) & df['longitude'].apply(is_valid_longitude)].copy()

    # Rename and reformat columns for consistency
    df = df.rename(columns={'created_at': 'date', 'spc_common': 'species'})

    # Parse 'date' and keep only the calendar day as 'YYYY-MM-DD' text;
    # unparseable values become null (they are not removed here).
    df['date'] = pd.to_datetime(df['date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # Convert to GeoDataFrame and normalize SRID: points are built from
    # lon/lat (EPSG:4326) and then reprojected to web mercator (EPSG:3857).
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude']))
    gdf.set_crs("EPSG:4326", inplace=True)
    gdf.to_crs("EPSG:3857", inplace=True)

    return gdf


In [32]:
# 4. Function (9) to clean and filter zillow rent data
def clean_filter_rent(filename, nyc_zipcodes):
    """
    Load Zillow rent data, keep NYC zipcodes, and reshape it to one row per region and date.

    Args:
    - filename (str): Path to the CSV file containing Zillow rent data.
    - nyc_zipcodes (list of int): List of valid NYC zip codes for filtering.

    Returns:
    - pd.DataFrame: Columns rent_id, region_id, zipcode, date, rent. 'date' holds
      the name of the source column (every column starting with '20'); rows
      without a rent value are removed, so rent_id values may have gaps.
    """
    # Load, discard 'City', and standardize the identifier column names
    rent = pd.read_csv(filename).drop(columns=['City'])
    rent = rent.rename(columns={'RegionID': 'region_id', 'RegionName': 'zipcode'})

    # One row per region, and only regions that carry a zipcode
    rent = rent.drop_duplicates(subset='region_id').dropna(subset=['zipcode'])

    # Integer zipcodes, restricted to NYC
    rent = rent.assign(zipcode=rent['zipcode'].astype(int))
    rent = rent[rent['zipcode'].isin(nyc_zipcodes)]

    # Wide -> long: every date column becomes a (date, rent) row
    key_columns = ['region_id', 'zipcode']
    date_columns = [name for name in rent.columns if name.startswith('20') and name not in key_columns]
    long_rent = rent.melt(id_vars=key_columns, value_vars=date_columns, var_name='date', value_name='rent')

    # Number the rows 1..n before removing incomplete ones
    long_rent.insert(0, 'rent_id', range(1, len(long_rent) + 1))

    return long_rent.dropna()


In [33]:
# 311 data input arguments: the yearly files live at
# <folder_path>/<file_prefix><year>.csv, for 2015 through 2023.
folder_path = "data/311_data"
file_prefix = "311_data_"
years = [str(year) for year in range(2015, 2024)]

# Function (10) load all data
def load_all_data():
    """
    Load and clean the four project datasets: Zipcode, 311, Tree, and Zillow Rent.

    The zipcode shapefile is processed first; its 'zipcode' column becomes the
    list of valid NYC zipcodes that the other three cleaners filter against.
    The 311 files are located through the notebook-level names `folder_path`,
    `file_prefix` and `years`; the other input paths are fixed.

    Returns:
    Tuple: (zipcode data, 311 data, tree data, Zillow rent data), in that order,
    each as returned by its cleaning function.
    """
    # Zipcode polygons, and the list of valid zipcodes derived from them
    zipcode_gdf = process_zipcode_shapefile("data/nyc_zipcodes/ZIP_CODE_040114.shp")
    valid_zipcodes = zipcode_gdf['zipcode'].tolist()

    # Each remaining dataset is cleaned against the same zipcode list
    complaints = clean_merge_311_data(folder_path, file_prefix, years, valid_zipcodes)
    trees = clean_filter_tree('data/tree_data.csv', valid_zipcodes)
    rents = clean_filter_rent('data/zillow_rent_data.csv', valid_zipcodes)

    return zipcode_gdf, complaints, trees, rents