<a href="https://colab.research.google.com/github/haraldriisager/ML-Project/blob/weather-api/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gdown
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas
!pip install tqdm

Collecting openmeteo-requests
  Downloading openmeteo_requests-1.3.0-py3-none-any.whl.metadata (9.7 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo-requests)
  Downloading openmeteo_sdk-1.18.0-py3-none-any.whl.metadata (934 bytes)
Downloading openmeteo_requests-1.3.0-py3-none-any.whl (6.0 kB)
Downloading openmeteo_sdk-1.18.0-py3-none-any.whl (7.6 kB)
Installing collected packages: openmeteo-sdk, openmeteo-requests
Successfully installed openmeteo-requests-1.3.0 openmeteo-sdk-1.18.0
Collecting requests-cache
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting retry-requests
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting cattrs>=22.2 (from requests-cache)
  Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━

In [2]:
import gdown
import pandas as pd
import openmeteo_requests
from openmeteo_sdk.Variable import Variable
import requests_cache
from requests_cache import CachedSession
from retry_requests import retry
from datetime import datetime, timedelta
from tqdm import tqdm
import pickle

In [3]:
# Import Flight Delay data set

file_id = '14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'data.csv', quiet=False)
df = pd.read_csv('data.csv')

print(df.columns)
print(df.shape[0])

Downloading...
From (original): https://drive.google.com/uc?id=14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz
From (redirected): https://drive.google.com/uc?id=14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz&confirm=t&uuid=2343fe02-a659-4f46-b522-6b5a07c7191b
To: /content/data.csv
100%|██████████| 614M/614M [00:11<00:00, 55.8MB/s]


Index(['FL_DATE', 'AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
       'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY', 'DEST', 'DEST_CITY',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER',
       'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
       'DELAY_DUE_LATE_AIRCRAFT'],
      dtype='object')
3000000


In [4]:
# Import Airport locations data set

file_id = '1eK1b3XX3jl-9XtQrH-_924YjV3rlqUFB'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'airport_locations.csv', quiet=False)
airport_locations = pd.read_csv('airport_locations.csv', delimiter=';')

airport_locations = airport_locations[['Airport Code', 'Latitude', 'Longitude']].dropna()
airport_locations = airport_locations.set_index('Airport Code').to_dict(orient='index')

Downloading...
From: https://drive.google.com/uc?id=1eK1b3XX3jl-9XtQrH-_924YjV3rlqUFB
To: /content/airport_locations.csv
100%|██████████| 875k/875k [00:00<00:00, 9.55MB/s]


In [5]:
# Append the origin and destination coordinates to the data set

def get_airport_coordinates(airport_code):
  if airport_code in airport_locations:
    return airport_locations[airport_code]['Latitude'], airport_locations[airport_code]['Longitude']
  return None, None

df["ORIGIN_LAT"], df["ORIGIN_LONG"] = zip(*df["ORIGIN"].apply(get_airport_coordinates))
df["DEST_LAT"], df["DEST_LONG"] = zip(*df["DEST"].apply(get_airport_coordinates))

In [6]:
# Remove rows that have null coordinates

missing_coordinates_rows = df[df[['ORIGIN_LAT', 'ORIGIN_LONG', 'DEST_LAT', 'DEST_LONG']].isna().any(axis=1)]

missing_count = missing_coordinates_rows.shape[0]

missing_origin = set(missing_coordinates_rows['ORIGIN'])
missing_destination = set(missing_coordinates_rows['DEST'])
missing_airports = missing_origin.union(missing_destination)

print("Airport codes with missing coordinates:", list(missing_airports))
print("Number of rows with missing coordinates:", missing_count)

df = df.dropna(subset=['ORIGIN_LAT', 'ORIGIN_LONG', 'DEST_LAT', 'DEST_LONG'])

print("Number of rows after removal:", df.shape[0])

df = df[:10000]

print("Number of rows after removal:", df.shape[0])

Airport codes with missing coordinates: ['PIT', 'PBI', 'FAR', 'PIA', 'MDW', 'MVY', 'SPI', 'SAN', 'ATL', 'CVG', 'SRQ', 'FWA', 'BIS', 'MSO', 'XNA', 'MEM', 'GRR', 'OGD', 'CID', 'MFR', 'ICT', 'TXK', 'FCA', 'VPS', 'BLV', 'PHL', 'STC', 'GRI', 'GJT', 'IAH', 'USA', 'BIL', 'DCA', 'TUL', 'BWI', 'SBN', 'LGA', 'GTF', 'LAS', 'DSM', 'CMH', 'FSD', 'SGU', 'STL', 'TUS', 'ATW', 'HOU', 'MKE', 'SMX', 'DAL', 'BZN', 'JFK', 'TOL', 'GEG', 'CSG', 'XWA', 'AZA', 'ECP', 'MCI', 'MLI', 'IDA', 'MSY', 'BOI', 'SCK', 'SFB', 'MLB', 'ORD', 'MOT', 'BNA', 'PSC', 'RFD', 'OMA', 'HPN', 'FNT', 'EUG', 'RDM', 'EWR', 'GFK', 'MSP', 'TVC', 'CLT', 'DEN', 'SGF', 'BKG', 'DFW', 'FLL', 'SNA', 'BLI', 'PIE', 'MFE', 'AMA', 'RAP', 'IND', 'AUS', 'PVU', 'OAK', 'JAC', 'PGD', 'BOS']
Number of rows with missing coordinates: 17348
Number of rows after removal: 2982652
Number of rows after removal: 10000


In [7]:
# Format departure and arrival times to ensure they are in 'HH:MM' format
df["CRS_DEP_TIME"] = df["CRS_DEP_TIME"].apply(lambda x: f"{str(int(x)).zfill(4)[:2]}:{str(int(x)).zfill(4)[2:]}")
df["CRS_ARR_TIME"] = df["CRS_ARR_TIME"].apply(lambda x: f"{str(int(x)).zfill(4)[:2]}:{str(int(x)).zfill(4)[2:]}")

# Verify that times are properly formatted as 'HH:MM'
print(df[["CRS_DEP_TIME", "CRS_ARR_TIME"]].head())

  CRS_DEP_TIME CRS_ARR_TIME
0        11:55        15:01
1        21:20        23:15
2        09:54        12:52
3        16:09        18:29
4        18:40        20:41


In [8]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)
url = "https://archive-api.open-meteo.com/v1/archive"

In [9]:
def save_progress(data, filename):
    with open(filename, "wb") as f:
        pickle.dump(data, f)

def load_progress(filename):
    try:
        with open(filename, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return []

In [10]:
def get_weather_data(date, time, latitude, longitude):

  flight_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H:%M")
  start_date = flight_datetime.strftime("%Y-%m-%d")
  end_date = (flight_datetime + timedelta(hours=1)).strftime("%Y-%m-%d")

  params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": date,
    "end_date": date,
    "hourly": [
        "temperature_2m",
        "precipitation",
        "rain",
        "snowfall",
        "weather_code",
        "windspeed_10m",
        "windspeed_100m",
        "winddirection_10m",
        "winddirection_100m",
        "windgusts_10m",
    ],
    "timezone": "auto"
  }

  responses = openmeteo.weather_api(url, params=params)

  if responses and isinstance(responses, list):
    response = responses[0]
    hourly_data = response.Hourly() if response else None
  else:
    hourly_data = None

  if hourly_data:
    hourly_variables = list(map(lambda i: hourly_data.Variables(i), range(0, hourly_data.VariablesLength())))

    weather_data = {}

    try:
        weather_data["temperature_2m"] = next(
            filter(lambda x: x.Variable() == Variable.temperature and x.Altitude() == 2, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["precipitation"] = next(
            filter(lambda x: x.Variable() == Variable.precipitation, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["rain"] = next(
            filter(lambda x: x.Variable() == Variable.rain, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["snowfall"] = next(
            filter(lambda x: x.Variable() == Variable.snowfall, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windspeed_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_speed and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windspeed_100m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_speed and x.Altitude() == 100, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["winddirection_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_direction and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["winddirection_100m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_direction and x.Altitude() == 100, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windgusts_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_gusts and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
    except StopIteration:
        print("One or more weather variables were not found in the response.")
        return None

    return weather_data

  return None

In [11]:
origin_weather_data = load_progress("origin_weather.pkl")
destination_weather_data = load_progress("destination_weather.pkl")

# Fetch weather data
for index, row in tqdm(df.iterrows(), total=len(df), desc="Fetching weather data"):
        if index < len(origin_weather_data):
            if index % 200 == 0:
                print("still works")
            continue  # Skip already processed rows

        # Fetch origin weather
        origin_weather = get_weather_data(row["FL_DATE"], row["CRS_DEP_TIME"], row["ORIGIN_LAT"], row["ORIGIN_LONG"])
        origin_weather_data.append(origin_weather)
        save_progress(origin_weather_data, "origin_weather.pkl")

        # Fetch destination weather
        destination_weather = get_weather_data(row["FL_DATE"], row["CRS_ARR_TIME"], row["DEST_LAT"], row["DEST_LONG"])
        destination_weather_data.append(destination_weather)
        save_progress(destination_weather_data, "destination_weather.pkl")

# Create DataFrames from weather data
origin_weather_df = pd.DataFrame(origin_weather_data).add_prefix('origin_')
destination_weather_df = pd.DataFrame(destination_weather_data).add_prefix('destination_')

# Combine with the main DataFrame
df = pd.concat([df, origin_weather_df, destination_weather_df], axis=1)

print(df.head())


Fetching weather data:  39%|███▉      | 3911/10000 [00:00<00:00, 19835.27it/s]

still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works
still works


Fetching weather data: 100%|██████████| 10000/10000 [2:24:58<00:00,  1.15it/s]

      FL_DATE                AIRLINE                AIRLINE_DOT AIRLINE_CODE  \
0  2019-01-09  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
1  2022-11-19   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
2  2022-07-22  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
3  2023-03-06   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
4  2020-02-23       Spirit Air Lines       Spirit Air Lines: NK           NK   

   DOT_CODE  FL_NUMBER ORIGIN          ORIGIN_CITY DEST  \
0   19977.0     1562.0    FLL  Fort Lauderdale, FL  EWR   
1   19790.0     1149.0    MSP      Minneapolis, MN  SEA   
2   19977.0      459.0    DEN           Denver, CO  MSP   
3   19790.0     2295.0    MSP      Minneapolis, MN  SFO   
4   20416.0      407.0    MCO          Orlando, FL  DFW   

               DEST_CITY  ...  \
0             Newark, NJ  ...   
1            Seattle, WA  ...   
2        Minneapolis, MN  ...   
3      San Francisco, CA  ...   




In [12]:
# Select only the features that we want

required_columns = [
    'FL_DATE',
    'AIRLINE',
    'FL_NUMBER',
    'ORIGIN',
    'DEST',
    'CRS_DEP_TIME',
    'CRS_ARR_TIME',
    'CRS_ELAPSED_TIME',
    'DISTANCE',
    'DEP_DELAY',
    'ARR_DELAY',
    'DELAY_DUE_WEATHER'
]

weather_columns = [col for col in df.columns if col.startswith('origin_') or col.startswith('destination_')]

final_columns = required_columns + weather_columns

df = df[final_columns]

print(df.head())

      FL_DATE                AIRLINE  FL_NUMBER ORIGIN DEST CRS_DEP_TIME  \
0  2019-01-09  United Air Lines Inc.     1562.0    FLL  EWR        11:55   
1  2022-11-19   Delta Air Lines Inc.     1149.0    MSP  SEA        21:20   
2  2022-07-22  United Air Lines Inc.      459.0    DEN  MSP        09:54   
3  2023-03-06   Delta Air Lines Inc.     2295.0    MSP  SFO        16:09   
4  2020-02-23       Spirit Air Lines      407.0    MCO  DFW        18:40   

  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE  DEP_DELAY  ...  \
0        15:01             186.0    1065.0       -4.0  ...   
1        23:15             235.0    1399.0       -6.0  ...   
2        12:52             118.0     680.0        6.0  ...   
3        18:29             260.0    1589.0       -1.0  ...   
4        20:41             181.0     985.0       -2.0  ...   

                                origin_windgusts_10m  \
0  [11.159999, 10.799999, 10.799999, 10.08, 11.15...   
1  [34.56, 33.12, 36.719997, 38.519997, 39.96, 41...   
2 

In [13]:
# Write the cleaned data to a new csv

df.to_csv('flight_data_with_weather.csv', index=False)

print("Data saved to 'flight_data_with_weather.csv'")

Data saved to 'flight_data_with_weather.csv'
