### Big data course project
<strong>T5: External data: schools & colleges</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
# Shell escape: print the hostname and system details of the machine running this notebook.
!hostnamectl

In [None]:
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
from pathlib import Path
import pandas as pd
import os
import requests

In [None]:
# Root directory of the partitioned input data; the service name is appended
# to it when the dataset is read.
part_data_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data")
# Service processed by this run; it selects both the input subfolder and the
# output tables subfolder. "green" is one of the keys of `start_dates`.
service_type = "green"

In [None]:
# Directory where the downloaded external datasets (schools / colleges CSVs)
# are written.
add_data_path = Path("/d/hpc/home/jv8043/BD/project/T5/add_data")
# Create it up front, as is done for the tables directory, so the later
# `to_csv` calls do not fail on a fresh checkout where it does not exist yet.
add_data_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Output directory for this task's result tables, one subfolder per service.
tables_path = Path("/d/hpc/home/jv8043/BD/project/T5/T5_tables").joinpath(service_type)
# Create the folder (and any missing parents); a no-op when it already exists.
tables_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Start a local Dask cluster with 4 workers, one thread each, and a
# memory_limit of '16GB' (per Dask's documentation this limit applies per
# worker — confirm the node has room for 4 x 16GB).
cluster = LocalCluster(n_workers=4, threads_per_worker=1, memory_limit='16GB')
# Connect a client to that cluster.
client = Client(cluster)

In [None]:
# Bare expression: shows the client's summary as the cell output.
client

In [None]:
# Start timestamp for each service type, keyed by service name, plus a single
# end timestamp shared by all services. (Presumably the bounds of the date
# range covered by the data — confirm against the cells that consume them.)
start_dates = {
    service: pd.Timestamp(first_day)
    for service, first_day in (
        ("yellow", "2012-01-01"),
        ("green", "2014-01-01"),
        ("fhv", "2015-01-01"),
        ("fhvh", "2019-02-01"),
    )
}
end_date = pd.Timestamp("2025-02-01")

In [None]:
# Open the partitioned parquet data of the selected service as a Dask
# DataFrame, using the pyarrow engine.
# NOTE(review): `assume_missing` is documented for `dd.read_csv`; confirm it
# is accepted by / has any effect in `dd.read_parquet` for the Dask version
# in use.
df = dd.read_parquet(part_data_path / service_type, engine="pyarrow", assume_missing=True)

In [None]:
def _fetch_response(url, loc_key, name_key):
    """
    Fetch a JSON list of records from ``url`` and return their locations and names.

    Every record is expected to hold a mapping under ``loc_key`` whose
    ``"coordinates"`` entry unpacks to ``(lon, lat)``, and a name under
    ``name_key``. Records missing any of these keys are reported and skipped.

    Parameters
    ----------
    url : str
        Endpoint that returns a JSON array of record objects.
    loc_key : str
        Key of the location mapping inside each record.
    name_key : str
        Key of the name inside each record.

    Returns
    -------
    pandas.DataFrame
        One row per usable record with columns ``lat``, ``lon`` and ``name``.
        If the request fails or the body is not valid JSON, the error is
        printed and an empty frame with the same columns is returned.
    """
    columns = ["lat", "lon", "name"]

    # NOTE(review): these look like Socrata endpoints, which reportedly return
    # only a default-sized first page unless `$limit` is passed — confirm the
    # response actually contains every record.
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an invalid JSON body on `requests` versions whose
        # decode error is not a RequestException.
        print(f"An error occurred: {e}")
        # Without this return, the loop below would hit an unbound `data`
        # and raise NameError.
        return pd.DataFrame(columns=columns)

    # Collect rows in a list and build the frame once; appending to a
    # DataFrame row by row re-allocates on every insert.
    rows = []
    for item in data:
        try:
            lon, lat = item[loc_key]["coordinates"]
            name = item[name_key]
        except KeyError as e:
            print(f"Key error in schools: {e}")
            continue
        rows.append({"lat": lat, "lon": lon, "name": name})

    return pd.DataFrame(rows, columns=columns)


def fetch_data(school_type):
    """
    Download the location table for one kind of educational institution.

    Parameters
    ----------
    school_type : str
        Either ``"school"`` or ``"college"``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``_fetch_response`` for the matching endpoint
        (columns ``lat``, ``lon``, ``name``).

    Raises
    ------
    ValueError
        If ``school_type`` is not one of the supported values. Previously this
        case fell through to ``return df`` and failed with UnboundLocalError.
    """
    # school_type -> (endpoint URL, location key, name key)
    sources = {
        "school": (
            "https://data.cityofnewyork.us/resource/qa5p-6qzr.json",
            "location_1",
            "school_name",
        ),
        "college": (
            "https://data.cityofnewyork.us/resource/8pnn-kkif.json",
            "the_geom",
            "name",
        ),
    }

    if school_type not in sources:
        raise ValueError(
            f"Unknown school_type {school_type!r}; expected one of {sorted(sources)}"
        )

    url, loc_key, name_key = sources[school_type]
    return _fetch_response(url, loc_key, name_key)

In [None]:
# Download the schools table and store it as CSV without the index column.
# NOTE(review): this rebinds `df`, which held the Dask DataFrame read from the
# partitioned parquet data earlier in the notebook — confirm no later cell
# still expects that dataset under this name.
df = fetch_data("school")
df.to_csv(add_data_path / "schools.csv", index=False)

In [None]:
# Download the colleges table and store it as CSV without the index column.
# NOTE(review): `df` is rebound again here, so afterwards it refers to the
# colleges table rather than any earlier dataset.
df = fetch_data("college")
df.to_csv(add_data_path / "colleges.csv", index=False)