### Big data course project
<strong>T5: External data: events</strong>

Jovana Videnovic & Haris Kupinic

In [1]:
import requests
import pandas as pd
from pathlib import Path

In [2]:
# Directory that the raw and cleaned event Parquet files below are written to.
add_data_path = Path("/d/hpc/home/jv8043/BD/project/T5/add_data") 

In [4]:
def load_events(start_time, end_time, limit=1000):
    """
    Load NYC events from the JSON API between start_time and end_time.

    Records are fetched page by page using ``$limit`` / ``$offset`` until the
    API returns an empty page.

    Parameters:
        start_time (str): Start datetime in ISO format (e.g. '2017-01-01T00:00:00')
        end_time (str): End datetime in ISO format (e.g. '2017-12-31T23:59:59')
        limit (int): Number of records to fetch per page (default is 1000)

    Returns:
        pd.DataFrame: DataFrame containing all matching events

    Raises:
        ValueError: If ``limit`` is not positive, or if the API answers with
            something other than a JSON list of records.
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection problems or timeouts.
    """
    BASE_URL = "https://data.cityofnewyork.us/resource/bkfu-528j.json"
    # (connect timeout, read timeout) in seconds; the read timeout is generous
    # because a single page may contain a very large number of records.
    REQUEST_TIMEOUT = (10, 300)

    if limit <= 0:
        # A non-positive page size would never advance the offset.
        raise ValueError(f"limit must be a positive integer, got {limit!r}")

    offset = 0
    all_data = []

    # Accept both 'YYYY-MM-DD HH:MM:SS' and 'YYYY-MM-DDTHH:MM:SS'.
    start_time = start_time.replace(" ", "T")
    end_time = end_time.replace(" ", "T")
    where_clause = f"start_date_time between '{start_time}' and '{end_time}'"

    while True:
        params = {
            "$limit": limit,
            "$offset": offset,
            "$where": where_clause,
            # A fixed ordering keeps pages stable, so $offset paging neither
            # skips nor repeats rows.
            "$order": ":id",
        }
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        data = response.json()

        # Error payloads are JSON objects; extending the result list with a
        # dict would add its keys and the loop would never terminate.
        if not isinstance(data, list):
            raise ValueError(f"Unexpected response from events API: {data!r}")

        if not data:
            break

        all_data.extend(data)
        offset += limit
        print(f"Loaded {len(all_data)} records so far...")

    df = pd.DataFrame(all_data)
    return df

In [5]:
# Boundaries of the query window, as ISO-format timestamps.
first_time = "2012-01-01T00:00:00"
last_time = "2025-02-02T00:00:00"

# Download every event that starts inside the window, using very large pages.
events_df = load_events(
    start_time=first_time,
    end_time=last_time,
    limit=500000,
)

In [7]:
# Persist the unmodified download so it can be reloaded without hitting the API.
raw_events_file = add_data_path / "nyc_events_full.parquet"
events_df.to_parquet(raw_events_file, index=False)

In [8]:
# Read the saved raw events back from disk and report their dimensions.
full_events_file = add_data_path / "nyc_events_full.parquet"
df_ = pd.read_parquet(full_events_file)
print(f"Data loaded with {len(df_)} records and {len(df_.columns)} columns.")

In [10]:
# Preview the first three rows of the reloaded data.
display(df_.head(3))

In [11]:
# Count the missing values per column and show only the columns that have any.
missing_mask = df_.isna()
nan_counts = missing_mask.sum()
print("NaN values in each column:")
print(nan_counts[nan_counts > 0])

In [12]:
# Remove the street-side column when the dataset contains it.
street_side_col = "event_street_side"
if street_side_col in events_df.columns:
    events_df.drop(columns=[street_side_col], inplace=True)
    print("Dropped 'event_street_side' column.")

# Discard every row that has a missing value in any remaining column.
# NOTE(review): this runs before the column selection further down, so a row is
# also dropped for a NaN in a column that is not kept later - confirm intended.
events_df.dropna(axis="index", how="any", inplace=True)

In [13]:
# Raw location strings, captured before any of the filtering below.
event_location = events_df["event_location"].tolist()

# Values that stand in for "no information" in the text columns.
placeholder_values = ["", "N/A", "Unknown"]

# Keep only events that have a real location and a real name.
for text_col in ("event_location", "event_name"):
    has_value = (
        events_df[text_col].notnull()
        & ~events_df[text_col].isin(placeholder_values)
    )
    events_df = events_df[has_value]

# Boolean indexing may hand back a frame pandas treats as a slice of the
# original; take an explicit copy so the column assignments below are
# unambiguous and do not raise SettingWithCopyWarning.
events_df = events_df.copy()

# Parse the timestamps; values that cannot be parsed become NaT.
for time_col in ("start_date_time", "end_date_time"):
    events_df[time_col] = pd.to_datetime(events_df[time_col], errors="coerce")

# Drop events whose start or end time could not be parsed. Copy again so later
# column assignments on events_df are equally safe.
has_valid_times = (
    events_df["start_date_time"].notnull()
    & events_df["end_date_time"].notnull()
)
events_df = events_df[has_valid_times].copy()

In [15]:
# Store both timestamp columns with microsecond resolution.
# NOTE(review): the reason for the 'us' unit is not shown here - presumably
# compatibility of the saved Parquet file with downstream tools; confirm.
for ts_col in ("start_date_time", "end_date_time"):
    events_df[ts_col] = events_df[ts_col].astype("datetime64[us]")

In [14]:
# Print the number of unique values in the event_type column.
event_type_col = events_df["event_type"]
unique_event_types = event_type_col.unique()
print(f"Unique event types: {len(unique_event_types)}")

In [20]:
def normalize_location(loc):
    """
    Reduce a raw event location string to '<place>, New York'.

    Surrounding whitespace is removed first. Then:
      * if the text contains ' between', everything from the first
        ' between ' onwards is discarded;
      * otherwise, if it contains ':', only the part before the first colon
        is kept;
      * otherwise the text is used as is.

    Parameters:
        loc (str): Raw location text.

    Returns:
        str: The shortened location followed by ', New York'.
    """
    loc = loc.strip()

    if " between" in loc:
        # partition() leaves the text untouched when ' between ' (with the
        # trailing space) does not occur, exactly like split(...)[0].
        loc = loc.partition(" between ")[0].strip()
    elif ":" in loc:
        loc = loc.partition(":")[0].strip()

    return f"{loc}, New York"

In [18]:
# Columns kept for the cleaned dataset, in output order.
columns_to_keep = [
    "event_name",
    "start_date_time",
    "end_date_time",
    "event_location",
    "event_type",
]
events_df = events_df[columns_to_keep]

In [21]:
# Normalize every event location (cast to str first) in a single pass.
event_location_col = [
    normalize_location(str(raw_location))
    for raw_location in events_df["event_location"].tolist()
]

# Distinct normalized locations.
unique_locs = list(set(event_location_col))

In [23]:
# Report the final row count, then write the cleaned events to disk.
print(f"Filtered events DataFrame has {len(events_df)} records after cleaning.")
cleaned_events_file = add_data_path / "nyc_events_cleaned.parquet"
events_df.to_parquet(cleaned_events_file, index=False)