In [2]:
# Import necessary libraries
import os
import requests
import pandas as pd
import zipfile 
from io import BytesIO

In [3]:
from pathlib import Path
# Folder that holds the downloaded Citi Bike .zip archives, relative to the
# notebook's working directory. (The previous literal, r"Path("Data")", was
# not valid Python: it parsed as a string, a bare name and another string.)
data_folder = Path("Data")

In [4]:
# Step 1: extract every .zip archive found in the data folder into its
# "unzipped" subfolder (created on first run, reused afterwards).
extract_folder = os.path.join(data_folder, "unzipped")
os.makedirs(extract_folder, exist_ok=True)

for archive_name in os.listdir(data_folder):
    # Extension match is case-insensitive; anything that is not a zip is skipped.
    if not archive_name.lower().endswith(".zip"):
        continue
    archive_path = os.path.join(data_folder, archive_name)
    with zipfile.ZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=extract_folder)
    print(f"Extracted {archive_name} → {extract_folder}")

Extracted 2013-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted 2014-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted 2015-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted 2016-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted 2017-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted 2018-citibike-tripdata.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted JC-201510-citibike-tripdata.csv.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted JC-201511-citibike-tripdata.csv.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted JC-201512-citibike-tripdata.csv.zip → C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped
Extracted JC-201601

In [5]:
# 2. Find all CSVs in the "unzipped" folder ---
# os.listdir returns names in arbitrary order, so the list is sorted to keep
# the file order (and the row order of anything concatenated from it) the
# same on every run and on every machine.
# NOTE(review): only the top level of the folder is scanned; CSVs that an
# archive extracted into a subfolder would be missed. Confirm this is intended.
filepaths = sorted(
    os.path.join(extract_folder, fname)
    for fname in os.listdir(extract_folder)
    if fname.lower().endswith(".csv")
)

print(f"{len(filepaths)} CSV files found in '{extract_folder}'.")

115 CSV files found in 'C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\unzipped'.


In [6]:
# Step 3: parse every CSV and stack the results into one DataFrame.
# The frames are produced lazily by a generator and handed to a single
# pd.concat call; ignore_index=True renumbers the rows from 0.
csv_frames = (pd.read_csv(csv_path) for csv_path in filepaths)
df = pd.concat(csv_frames, ignore_index=True)

In [7]:
# Step 4: quick look at the combined DataFrame: its dimensions, then the first rows.
combined_shape = df.shape
print(f"Combined DataFrame shape: {combined_shape}")
first_rows = df.head()
print(first_rows)

Combined DataFrame shape: (5606691, 43)
   Trip Duration           Start Time            Stop Time  Start Station ID  \
0          376.0  2015-10-01 00:16:26  2015-10-01 00:22:42            3212.0   
1          739.0  2015-10-01 00:27:12  2015-10-01 00:39:32            3207.0   
2         2714.0  2015-10-01 00:32:46  2015-10-01 01:18:01            3193.0   
3          275.0  2015-10-01 00:34:31  2015-10-01 00:39:06            3199.0   
4          561.0  2015-10-01 00:40:12  2015-10-01 00:49:33            3183.0   

  Start Station Name  Start Station Latitude  Start Station Longitude  \
0    Christ Hospital               40.734786               -74.050444   
1        Oakland Ave               40.737604               -74.052478   
2       Lincoln Park               40.724605               -74.078406   
3       Newport Pkwy               40.728745               -74.032108   
4     Exchange Place               40.716247               -74.033459   

   End Station ID    End Station Name  E

Why this is “most effective”:

On‑the‑fly reading
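Passing a generator expression—(pd.read_csv(fp) for fp in filepaths)—to pd.concat avoids keeping a separately named list of DataFrames around in the notebook. Note that pd.concat still collects every DataFrame produced by the generator before it concatenates them, so peak memory is not lower than with a list; if the data does not fit in memory, load fewer columns (usecols) or process the files in chunks instead.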

Avoids extra copies
Calling pd.concat once on all the frames builds the result in a single step, rather than appending to a growing DataFrame inside a loop, which would copy the accumulated data again on every iteration.

Keeps code simple
It’s a one‑liner that scales from tens to hundreds of files without changing any other logic.

In [9]:
# NOAA token: read from the NOAA_TOKEN environment variable so the credential
# is never stored in the notebook. Falls back to an empty string when unset.
# NOTE(review): the token that used to be hardcoded here has been exposed and
# should be rotated.
token = os.environ.get("NOAA_TOKEN", "")

In [10]:
# Endpoint URL, query-string parameters and request header for the NOAA call
url = "https://www.ncei.noaa.gov/access/services/data/v1"
params = dict(
    dataset="daily-summaries",
    stations="USW00014732",
    startDate="2022-01-01",
    endDate="2022-12-31",
    dataTypes="TMAX,TMIN,PRCP,SNOW,SNWD",
    format="json",
)
headers = dict(token=token)

In [11]:
# Fetch data
# An explicit timeout (seconds) stops the cell from hanging indefinitely if
# the server never answers; HTTP error statuses are raised as exceptions.
# NOTE: this rebinds `df`, which earlier in the notebook held the combined
# trip data.
resp = requests.get(url, params=params, headers=headers, timeout=60)
resp.raise_for_status()
df = pd.DataFrame(resp.json())

In [12]:
from pathlib import Path
# Ensure target directory exists.
# (The previous literal, r"Path("Data")", was not valid Python.)
out_dir = Path("Data")
os.makedirs(out_dir, exist_ok=True)

In [13]:
# Save the DataFrame as a CSV file inside the output folder (row index omitted)
weather_csv_name = "laguardia_2022_weather.csv"
out_path = os.path.join(out_dir, weather_csv_name)
df.to_csv(path_or_buf=out_path, index=False)

print(f"✅ Saved LaGuardia 2022 weather to:\n{out_path}")

✅ Saved LaGuardia 2022 weather to:
C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\laguardia_2022_weather.csv


In [14]:
from pathlib import Path
# Configuration
# The NOAA token is read from the NOAA_TOKEN environment variable so the
# credential is never stored in the notebook; empty string when unset.
# NOTE(review): the token that used to be hardcoded here has been exposed and
# should be rotated.
NOAA_TOKEN = os.environ.get("NOAA_TOKEN", "")
STATION_ID = "USW00014732"          # LaGuardia GHCND code
START_DATE = "2022-01-01"
END_DATE   = "2022-12-31"

# Output folder, relative to the notebook's working directory.
# (The previous literal, r"Path("Data")", was not valid Python.)
OUT_DIR       = Path("Data")
WEATHER_FP    = os.path.join(OUT_DIR, "laguardia_2022_weather.csv")
MERGED_FP     = os.path.join(OUT_DIR, "citibike_weather_2022.csv")
# Note the "JC-" prefix on every monthly file
MONTHLY_URL_TMPL = "https://s3.amazonaws.com/tripdata/JC-{year}{month:02d}-citibike-tripdata.csv.zip"

os.makedirs(OUT_DIR, exist_ok=True)

In [15]:
# 1. Fetch LaGuardia daily summaries
# Query parameters come from the configuration constants; the response JSON
# is loaded straight into a DataFrame.
noaa_url = "https://www.ncei.noaa.gov/access/services/data/v1"
params = {
    "dataset":   "daily-summaries",
    "stations":  STATION_ID,
    "startDate": START_DATE,
    "endDate":   END_DATE,
    "dataTypes": "TMAX,TMIN,PRCP,SNOW,SNWD",
    "format":    "json"
}
headers = {"token": NOAA_TOKEN}

# An explicit timeout (seconds) stops the cell from hanging indefinitely if
# the server never answers; HTTP error statuses are raised as exceptions.
resp = requests.get(noaa_url, params=params, headers=headers, timeout=60)
resp.raise_for_status()
weather = pd.DataFrame(resp.json())

In [16]:
# Get the weather table ready for merging: derive a plain `date` column from
# DATE, drop the raw DATE column, then save the result to disk.
weather = (
    weather
    .assign(date=pd.to_datetime(weather["DATE"]).dt.date)
    .drop(columns=["DATE"])
)
weather.to_csv(path_or_buf=WEATHER_FP, index=False)
print(f"✅ Weather saved to: {WEATHER_FP}")

✅ Weather saved to: C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\laguardia_2022_weather.csv


In [17]:
from zipfile import ZipFile 
# 2. Download & concatenate all 12 months of Citi Bike data
# Each monthly archive is downloaded into memory, its CSV is parsed with the
# two timestamp columns as datetimes, and the monthly frames are stacked.
# Months whose download fails are skipped, recorded, and reported at the end
# so the gap is not lost among the progress messages.
# NOTE(review): the recorded run of this cell got a 404 for 2022-07; the file
# may be published under a different name. Check the bucket listing.
all_months = []
skipped_months = []
year = 2022

for month in range(1, 13):
    url = MONTHLY_URL_TMPL.format(year=year, month=month)
    print(f"Downloading {url} …")
    # Timeout (seconds) so a stalled connection cannot hang the notebook.
    r = requests.get(url, timeout=120)
    if not r.ok:
        print(f" ⚠️  Failed ({r.status_code}), skipping month {month:02d}")
        skipped_months.append(month)
        continue

    buf = BytesIO(r.content)
    with ZipFile(buf) as zf:
        # Find the first real CSV; entries under __MACOSX/ are resource-fork
        # metadata that can also end in ".csv" and must not be parsed.
        csv_name = next(
            (
                f for f in zf.namelist()
                if f.lower().endswith(".csv") and not f.startswith("__MACOSX/")
            ),
            None,
        )
        if not csv_name:
            raise RuntimeError(f"No CSV found inside {url}")
        print(f" → Loading {csv_name}")
        # Open the member in a context manager so its handle is closed.
        with zf.open(csv_name) as csv_file:
            dfm = pd.read_csv(csv_file, parse_dates=["started_at", "ended_at"])
        all_months.append(dfm)

if not all_months:
    raise RuntimeError("No Citi Bike data downloaded — check the URL template or your network")

cb = pd.concat(all_months, ignore_index=True)
print(f"✅ Loaded {len(all_months)} months → {len(cb):,} total trips")
if skipped_months:
    missing = ", ".join(f"{m:02d}" for m in skipped_months)
    print(f" ⚠️  Months missing from the combined data: {missing}")

Downloading https://s3.amazonaws.com/tripdata/JC-202201-citibike-tripdata.csv.zip …
 → Loading JC-202201-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202202-citibike-tripdata.csv.zip …
 → Loading JC-202202-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202203-citibike-tripdata.csv.zip …
 → Loading JC-202203-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202204-citibike-tripdata.csv.zip …
 → Loading JC-202204-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202205-citibike-tripdata.csv.zip …
 → Loading JC-202205-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202206-citibike-tripdata.csv.zip …
 → Loading JC-202206-citibike-tripdata.csv
Downloading https://s3.amazonaws.com/tripdata/JC-202207-citibike-tripdata.csv.zip …
 ⚠️  Failed (404), skipping month 07
Downloading https://s3.amazonaws.com/tripdata/JC-202208-citibike-tripdata.csv.zip …
 → Loading JC-202208-citibike-tri

In [18]:
# Step 3: attach the weather columns to each trip by the trip's start date.
# A left join keeps every trip row even when no weather row matches.
cb["date"] = cb["started_at"].dt.date
merged = pd.merge(cb, weather, how="left", on="date")
print(f"✅ Merged DataFrame: {len(merged):,} rows × {merged.shape[1]} cols")

✅ Merged DataFrame: 786,983 rows × 20 cols


In [19]:
# Step 4: write the merged trips-plus-weather table to disk (row index omitted)
merged.to_csv(path_or_buf=MERGED_FP, index=False)
print(f"✅ Merged CSV saved to: {MERGED_FP}")

✅ Merged CSV saved to: C:\Users\henry\OneDrive\New York’s CitiBike trips in 2022\citibike_weather_2022.csv
