In [36]:
from dotenv import load_dotenv
import os
import requests
import time
import pandas as pd

In [30]:
load_dotenv("NOAA_WEB_API_TOKEN.env")
NOAA_WEB_API_TOKEN = os.getenv("NOAA_WEB_API_TOKEN")

In [None]:
# load_dotenv(dotenv_path="NOAA_WEB_API_TOKEN.env")
NOAA_WEB_API_TOKEN = os.getenv("NOAA_WEB_API_TOKEN")
assert NOAA_WEB_API_TOKEN is not None, "Token not found! Check your environment variable."

NOAA_API_BASE_URL = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"

# Station IDs for the specified locations
stations  = {
  "washingtondc": "GHCND:USW00013743",
  "vancouver": "GHCND:CA001108395",
  "newyorkcity": "GHCND:USW00014732",
  "liestal": "GHCND:SZ000001940",
  "kyoto": "GHCND:JA000047759"
}

headers = {
    "token": NOAA_WEB_API_TOKEN
}

## Station Coverage Check

In [32]:
for city, station_id in stations.items():
    response = requests.get(
        "https://www.ncdc.noaa.gov/cdo-web/api/v2/stations/" + station_id,
        headers=headers
    )
    data = response.json()
    print(f"{city}: {data['mindate']} → {data['maxdate']}")

washingtondc: 1936-09-01 → 2026-02-24
vancouver: 1957-01-01 → 2025-08-24
newyorkcity: 1939-10-07 → 2026-02-24
liestal: 1901-01-01 → 2026-01-31
kyoto: 1945-10-31 → 2026-02-01


## Acquiring 2025 temperatures for Washington, DC
I used the Global Historical Climatology Network (GHCN), avaiable throughthe NOAA web API, to acquire these temperatures. 

In [None]:
params = {
    "datasetid": "GHCND",
    "stationid": "GHCND:USW00013743",
    "startdate": "2025-01-01",
    "enddate": "2025-12-31",
    "datatypeid": ["TMIN", "TMAX", "TAVG"],
    "units": "standard",
    "limit": 1000
}

response = requests.get(
    "https://www.ncei.noaa.gov/cdo-web/api/v2/data",
    headers=headers,
    params=params
)

print(response.status_code)
data = response.json()
#print(data)

if "results" in data:
    df = pd.DataFrame(data["results"])
else:
    print("No results returned:", data)

200


### Cleaning JSON 

In [14]:
df['date'] = pd.to_datetime(df['date'])

# Pivot to wide format
df_wide = df.pivot(index='date', columns='datatype', values='value').reset_index()

df_wide.columns.name = None
df_wide = df_wide.rename(columns={
    'TMIN': 'tmin',
    'TMAX': 'tmax',
    'TAVG': 'tavg'
})

# If TAVG is missing, computed as the mean of min/max
if df_wide['tavg'].isna().any():
    df_wide['tavg'] = df_wide[['tmin','tmax']].mean(axis=1)
print(df_wide.shape)
print(df_wide.isna().sum())

(335, 4)
date    0
tavg    0
tmax    0
tmin    1
dtype: int64


API had some missing values that were left out, so I am combining the results of the API with a dataset that is the full length of the year. 

In [15]:
# Full 2025 date range
full_2025 = pd.DataFrame({'date': pd.date_range(start='2025-01-01', end='2025-12-31')})


dc_2025_full = full_2025.merge(df_wide, on='date', how='left')

# Fill missing TMIN/TMAX with interpolation
dc_2025_full['tmin'] = dc_2025_full['tmin'].interpolate()
dc_2025_full['tmax'] = dc_2025_full['tmax'].interpolate()
dc_2025_full['tavg'] = dc_2025_full['tavg'].interpolate()

print(dc_2025_full.isna().sum()) 
print(dc_2025_full.shape)  

date    0
tavg    0
tmax    0
tmin    0
dtype: int64
(365, 4)


In [16]:
dc_2025_full.to_csv("Data/dc_Temps_2025.csv", index=False)

### Combining Temperature data sets

In [17]:
temps_dc = pd.read_csv('Q_temps_dc.csv')
full_temps_dc = pd.concat([temps_dc, dc_2025_full], ignore_index=True)

full_temps_dc['date'] = pd.to_datetime(full_temps_dc['date'])

full_temps_dc['doy'] = full_temps_dc['date'].dt.dayofyear

full_temps_dc['year'] = full_temps_dc['date'].dt.year

if 'temp' in full_temps_dc.columns:
    full_temps_dc = full_temps_dc.drop(columns=['temp'])

full_temps_dc = full_temps_dc[['year','date', 'doy', 'tmin','tmax','tavg']]

print(full_temps_dc.tail())
print(full_temps_dc.isna().sum())

full_temps_dc.to_csv('Data/full_temps_dc.csv', index=False)

       year       date  doy  tmin  tmax  tavg
30638  2025 2025-12-27  361  37.0  46.0  46.0
30639  2025 2025-12-28  362  37.0  46.0  46.0
30640  2025 2025-12-29  363  37.0  46.0  46.0
30641  2025 2025-12-30  364  37.0  46.0  46.0
30642  2025 2025-12-31  365  37.0  46.0  46.0
year        0
date        0
doy         0
tmin        0
tmax        0
tavg    30278
dtype: int64


### Adding 2025 bloom date to bloom data set

In [18]:
cherry_dc = pd.read_csv('Q_blooms_dc.csv')
print(cherry_dc.tail())

    year  bloom_date  bloom_doy
78  2020  2020-03-20         80
79  2021  2021-03-28         87
80  2022  2022-03-21         80
81  2023  2023-03-23         82
82  2024  2024-03-17         77


In [19]:
bloom_date_2025 = [2025, '2025-03-28', 87]
cherry_dc.loc[len(df)] = bloom_date_2025

cherry_dc['bloom_date'] = pd.to_datetime(cherry_dc['bloom_date'])

print(cherry_dc.tail())

cherry_dc.to_csv("Data/blooms_dc.csv")

      year bloom_date  bloom_doy
79    2021 2021-03-28         87
80    2022 2022-03-21         80
81    2023 2023-03-23         82
82    2024 2024-03-17         77
1000  2025 2025-03-28         87


## Kyoto

In [55]:
all_results = []
failed = []

for year in range(1945, 2026):
    params = {
        "datasetid": "GHCND",
        "stationid": "GHCND:JA000047759",
        "startdate": f"{year}-01-01",
        "enddate": f"{year}-12-31",
        "datatypeid": ["TMIN", "TMAX", "TAVG"],
        "units": "standard",
        "limit": 1000
    }

    for attempt in range(3):  # retry up to 3 times
        response = requests.get(
            "https://www.ncei.noaa.gov/cdo-web/api/v2/data",
            headers=headers,
            params=params
        )

        if response.status_code == 200 and response.text.strip():
            data = response.json()
            if "results" in data:
                all_results.extend(data["results"])
                break
        
        time.sleep(2 * (attempt + 1)) 
    else:
        failed.append(year)

    time.sleep(0.5) 

kyoto_temps_df = pd.DataFrame(all_results)
print(f"Total rows: {kyoto_temps_df.shape[0]}")
print(f"Years failed: {failed if failed else 'None'}")

Total rows: 67496
Years failed: [1946, 1947, 1948, 1949, 1950, 2005]


In [80]:
kyoto_temps_df['date'] = pd.to_datetime(kyoto_temps_df['date'])

# Pivot to wide format
kyoto_temps_df_wide = kyoto_temps_df.pivot(index='date', columns='datatype', values='value').reset_index()

kyoto_temps_df_wide.columns.name = None
kyoto_temps_df_wide = kyoto_temps_df_wide.rename(columns={
    'TMIN': 'tmin',
    'TMAX': 'tmax',
    'TAVG': 'tavg'
})

# If TAVG is missing, computed as the mean of min/max
if kyoto_temps_df_wide['tavg'].isna().any():
    kyoto_temps_df_wide['tavg'] = kyoto_temps_df_wide[['tmin','tmax']].mean(axis=1)
print(kyoto_temps_df_wide.shape)
print(kyoto_temps_df_wide.isna().sum())

(25987, 4)
date       0
tavg     227
tmax     513
tmin    5539
dtype: int64


In [108]:
kyoto_temps = kyoto_temps_df_wide.copy()

# Fill missing TMIN/TMAX with interpolation
kyoto_temps['tmin'] = kyoto_temps['tmin'].interpolate()
kyoto_temps['tmax'] = kyoto_temps['tmax'].interpolate()
kyoto_temps['tavg'] = kyoto_temps['tavg'].interpolate()

# Converting to celsius to fit model
kyoto_temps["tmin"] = (kyoto_temps["tmin"] - 32) * 5/9
kyoto_temps["tmax"] = (kyoto_temps["tmax"] - 32) * 5/9
kyoto_temps["tavg"] = (kyoto_temps["tavg"] - 32) * 5/9

print(kyoto_temps.isna().sum()) 
print(kyoto_temps.shape)  

date     0
tavg    14
tmax    14
tmin    14
dtype: int64
(25987, 4)


In [109]:
print(kyoto_temps.shape)
print(kyoto_temps.dropna().shape)
print(kyoto_temps['date'].min(), kyoto_temps_df['date'].max())

(25987, 4)
(25973, 4)
1945-10-31 00:00:00 2025-12-09 00:00:00


In [110]:
kyoto_temps.to_csv('Data/kyoto_temps.csv', index=False)

## Liestal

In [60]:
all_results = []
failed = []

for year in range(1901, 2026):
    params = {
        "datasetid": "GHCND",
        "stationid": "GHCND:SZ000001940",
        "startdate": f"{year}-01-01",
        "enddate": f"{year}-12-31",
        "datatypeid": ["TMIN", "TMAX", "TAVG"],
        "units": "standard",
        "limit": 1000
    }

    response = requests.get(
        "https://www.ncei.noaa.gov/cdo-web/api/v2/data",
        headers=headers,
        params=params
    )

    if response.status_code != 200 or not response.text.strip():
        failed.append(year)
        time.sleep(1)
        continue

    data = response.json()

    if "results" in data:
        all_results.extend(data["results"])
    else:
        failed.append(year)

    time.sleep(2)

liestal_temps_df = pd.DataFrame(all_results)
print(f"Total rows: {liestal_temps_df.shape[0]}")
print(f"Years failed: {failed if failed else 'None'}")

Total rows: 95885
Years failed: [1966, 2007, 2014]


In [84]:
liestal_temps_df['date'] = pd.to_datetime(liestal_temps_df['date'])

# Pivot to wide format
liestal_temps_df_wide = liestal_temps_df.pivot(index='date', columns='datatype', values='value').reset_index()

liestal_temps_df_wide.columns.name = None
liestal_temps_df_wide = liestal_temps_df_wide.rename(columns={
    'TMIN': 'tmin',
    'TMAX': 'tmax',
    'TAVG': 'tavg'
})

# If TAVG is missing, computed as the mean of min/max
if liestal_temps_df_wide['tavg'].isna().any():
    liestal_temps_df_wide['tavg'] = liestal_temps_df_wide[['tmin','tmax']].mean(axis=1)
print(liestal_temps_df_wide.shape)
print(liestal_temps_df_wide.isna().sum())

(43858, 4)
date     0
tavg    60
tmax    63
tmin    79
dtype: int64


In [105]:
liestal_temps = liestal_temps_df_wide.copy()

# Fill missing TMIN/TMAX with interpolation
liestal_temps['tmin'] = liestal_temps['tmin'].interpolate()
liestal_temps['tmax'] = liestal_temps['tmax'].interpolate()
liestal_temps['tavg'] = liestal_temps['tavg'].interpolate()

# Converting to celsius to fit model
liestal_temps["tmin"] = (liestal_temps["tmin"] - 32) * 5/9
liestal_temps["tmax"] = (liestal_temps["tmax"] - 32) * 5/9
liestal_temps["tavg"] = (liestal_temps["tavg"] - 32) * 5/9

print(liestal_temps.isna().sum()) 
print(liestal_temps.shape)

date    0
tavg    0
tmax    0
tmin    0
dtype: int64
(43858, 4)


In [106]:
print(liestal_temps.shape)
print(liestal_temps.dropna().shape)
print(liestal_temps['date'].min(), liestal_temps_df['date'].max())

(43858, 4)
(43858, 4)
1901-01-01 00:00:00 2025-12-03 00:00:00


In [107]:
liestal_temps.to_csv('Data/liestal_temps.csv', index=False)

## NYC 

In [65]:
all_results = []
failed = []

for year in range(1939, 2026):
    params = {
        "datasetid": "GHCND",
        "stationid": "GHCND:USW00014732",
        "startdate": f"{year}-01-01",
        "enddate": f"{year}-12-31",
        "datatypeid": ["TMIN", "TMAX", "TAVG"],
        "units": "standard",
        "limit": 1000
    }

    response = requests.get(
        "https://www.ncei.noaa.gov/cdo-web/api/v2/data",
        headers=headers,
        params=params
    )

    if response.status_code != 200 or not response.text.strip():
        failed.append(year)
        time.sleep(1)
        continue

    data = response.json()

    if "results" in data:
        all_results.extend(data["results"])
    else:
        failed.append(year)

    time.sleep(2)

nyc_temps_df = pd.DataFrame(all_results)
print(f"Total rows: {nyc_temps_df.shape[0]}")
print(f"Years failed: {failed if failed else 'None'}")

Total rows: 66136
Years failed: [1943, 1980, 2018]


In [88]:
nyc_temps_df['date'] = pd.to_datetime(nyc_temps_df['date'])

# Pivot to wide format
nyc_temps_df_wide = nyc_temps_df.pivot(index='date', columns='datatype', values='value').reset_index()

nyc_temps_df_wide.columns.name = None
nyc_temps_df_wide = nyc_temps_df_wide.rename(columns={
    'TMIN': 'tmin',
    'TMAX': 'tmax',
    'TAVG': 'tavg'
})

# If TAVG is missing, computed as the mean of min/max
if nyc_temps_df_wide['tavg'].isna().any():
    nyc_temps_df_wide['tavg'] = nyc_temps_df_wide[['tmin','tmax']].mean(axis=1)
print(nyc_temps_df_wide.shape)
print(nyc_temps_df_wide.isna().sum())

(29869, 4)
date     0
tavg    14
tmax    14
tmin    17
dtype: int64


In [101]:
nyc_temps = nyc_temps_df_wide.copy()

# Fill missing TMIN/TMAX with interpolation
nyc_temps['tmin'] = nyc_temps['tmin'].interpolate()
nyc_temps['tmax'] = nyc_temps['tmax'].interpolate()
nyc_temps['tavg'] = nyc_temps['tavg'].interpolate()

# Converting to celsius to fit model
nyc_temps["tmin"] = (nyc_temps["tmin"] - 32) * 5/9
nyc_temps["tmax"] = (nyc_temps["tmax"] - 32) * 5/9
nyc_temps["tavg"] = (nyc_temps["tavg"] - 32) * 5/9

print(nyc_temps.isna().sum()) 
print(nyc_temps.shape)

date    0
tavg    0
tmax    0
tmin    0
dtype: int64
(29869, 4)


In [102]:
print(nyc_temps.shape)
print(nyc_temps.dropna().shape)
print(nyc_temps['date'].min(), nyc_temps_df['date'].max())

(29869, 4)
(29869, 4)
1939-10-07 00:00:00 2025-12-01 00:00:00


In [103]:
nyc_temps.to_csv('Data/nyc_temps.csv', index=False)

## Vancouver

In [73]:
all_results = []
failed = []

for year in range(1957, 2026):
    params = {
        "datasetid": "GHCND",
        "stationid": "GHCND:CA001108446",
        "startdate": f"{year}-01-01",
        "enddate": f"{year}-12-31",
        "datatypeid": ["TMIN", "TMAX", "TAVG"],
        "units": "standard",
        "limit": 1000
    }

    response = requests.get(
        "https://www.ncei.noaa.gov/cdo-web/api/v2/data",
        headers=headers,
        params=params
    )

    if response.status_code != 200 or not response.text.strip():
        failed.append(year)
        time.sleep(1)
        continue

    data = response.json()

    if "results" in data:
        all_results.extend(data["results"])
    else:
        failed.append(year)

    time.sleep(2)

vancouver_temps_df = pd.DataFrame(all_results)
print(f"Total rows: {vancouver_temps_df.shape[0]}")
print(f"Years failed: {failed if failed else 'None'}")

Total rows: 43393
Years failed: [1957, 1963, 1964, 1965, 1997, 2016]


In [92]:
vancouver_temps_df['date'] = pd.to_datetime(vancouver_temps_df['date'])

# Pivot to wide format
vancouver_temps_df_wide = vancouver_temps_df.pivot(index='date', columns='datatype', values='value').reset_index()

vancouver_temps_df_wide.columns.name = None
vancouver_temps_df_wide = vancouver_temps_df_wide.rename(columns={
    'TMIN': 'tmin',
    'TMAX': 'tmax',
    'TAVG': 'tavg'
})

# If TAVG is missing, computed as the mean of min/max
if vancouver_temps_df_wide['tavg'].isna().any():
    vancouver_temps_df_wide['tavg'] = vancouver_temps_df_wide[['tmin','tmax']].mean(axis=1)
print(vancouver_temps_df_wide.shape)
print(vancouver_temps_df_wide.isna().sum())

(20510, 4)
date      0
tavg      7
tmax    294
tmin    393
dtype: int64


In [97]:
vancouver_temps = vancouver_temps_df_wide.copy()

# Fill missing TMIN/TMAX with interpolation
vancouver_temps['tmin'] = vancouver_temps['tmin'].interpolate()
vancouver_temps['tmax'] = vancouver_temps['tmax'].interpolate()
vancouver_temps['tavg'] = vancouver_temps['tavg'].interpolate()

# Converting to celsius to fit model
vancouver_temps["tmin"] = (vancouver_temps["tmin"] - 32) * 5/9
vancouver_temps["tmax"] = (vancouver_temps["tmax"] - 32) * 5/9
vancouver_temps["tavg"] = (vancouver_temps["tavg"] - 32) * 5/9

print(vancouver_temps.isna().sum()) 
print(vancouver_temps.shape)

date    0
tavg    0
tmax    0
tmin    0
dtype: int64
(20510, 4)


In [98]:
print(vancouver_temps.shape)
print(vancouver_temps.dropna().shape)
print(vancouver_temps['date'].min(), vancouver_temps_df['date'].max())

(20510, 4)
(20510, 4)
1958-01-01 00:00:00 2025-11-30 00:00:00


In [99]:
vancouver_temps.to_csv('Data/vancouver_temps.csv', index=False)

## All citites Temperature Forecasts for 2026

In [96]:
forecast = pd.read_csv("accuweather_forecast_2026.csv")
forecast["date"] = pd.to_datetime(forecast["date"])

for city in forecast["location"].unique():
    
    city_df = forecast[forecast["location"] == city].copy()
    
    city_df["tavg"] = city_df["temp"]
    city_df["doy"] = city_df["date"].dt.dayofyear
    city_df["month"] = city_df["date"].dt.month
    
    city_df["season_year"] = city_df["year"]
    city_df.loc[city_df["month"] >= 10, "season_year"] += 1
    
    city_df.to_csv(f"Data/forecast_2026_{city}.csv", index=False)