<a href="https://colab.research.google.com/github/haraldriisager/ML-Project/blob/weather-api/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's third-party dependencies with pip.
# No versions are pinned, so each run installs whatever pip resolves at that time.
!pip install gdown
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas
!pip install tqdm

In [None]:
import gdown
import pandas as pd
import openmeteo_requests
from openmeteo_sdk.Variable import Variable
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta
from tqdm import tqdm

In [None]:
# Download the flight delay data set from Google Drive and load it into a DataFrame

file_id = '14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz'
flights_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(flights_url, 'data.csv', quiet=False)
df = pd.read_csv('data.csv')

# Show the available columns and how many rows were loaded
print(df.columns)
print(len(df))

In [None]:
# Download the airport locations data set and build a lookup keyed by airport code

file_id = '1eK1b3XX3jl-9XtQrH-_924YjV3rlqUFB'
airports_url = f'https://drive.google.com/uc?id={file_id}'
gdown.download(airports_url, 'airport_locations.csv', quiet=False)

# Keep only rows with a code and both coordinates, then convert them to
# {airport code: {'Latitude': ..., 'Longitude': ...}}
airport_locations = (
    pd.read_csv('airport_locations.csv', delimiter=';')
    .loc[:, ['Airport Code', 'Latitude', 'Longitude']]
    .dropna()
    .set_index('Airport Code')
    .to_dict(orient='index')
)

In [None]:
# Append the origin and destination coordinates to the data set

def get_airport_coordinates(airport_code):
  """Look up an airport's coordinates in the module-level ``airport_locations`` dict.

  Args:
    airport_code: Key to look up in ``airport_locations``.

  Returns:
    A ``(latitude, longitude)`` tuple taken from the entry's 'Latitude' and
    'Longitude' fields, or ``(None, None)`` when the code has no entry.
  """
  location = airport_locations.get(airport_code)
  if location is None:
    return None, None
  return location['Latitude'], location['Longitude']

# Resolve every airport code to a (latitude, longitude) pair, then split the pairs
# into separate latitude and longitude columns.
origin_coordinates = df["ORIGIN"].apply(get_airport_coordinates)
destination_coordinates = df["DEST"].apply(get_airport_coordinates)

df["ORIGIN_LAT"], df["ORIGIN_LONG"] = zip(*origin_coordinates)
df["DEST_LAT"], df["DEST_LONG"] = zip(*destination_coordinates)

In [None]:
# Remove rows that have null coordinates

# Flag the origin and destination sides separately: a row is dropped when either side
# lacks coordinates, but only the airport on the side that is actually missing should
# be reported (the other airport of that row may be perfectly fine).
origin_missing = df[['ORIGIN_LAT', 'ORIGIN_LONG']].isna().any(axis=1)
destination_missing = df[['DEST_LAT', 'DEST_LONG']].isna().any(axis=1)

missing_coordinates_rows = df[origin_missing | destination_missing]

missing_count = missing_coordinates_rows.shape[0]

missing_origin = set(df.loc[origin_missing, 'ORIGIN'])
missing_destination = set(df.loc[destination_missing, 'DEST'])
missing_airports = missing_origin.union(missing_destination)

print("Airport codes with missing coordinates:", list(missing_airports))
print("Number of rows with missing coordinates:", missing_count)

df = df.dropna(subset=['ORIGIN_LAT', 'ORIGIN_LONG', 'DEST_LAT', 'DEST_LONG'])

print("Number of rows after removal:", df.shape[0])

In [None]:
# Format departure and arrival times to ensure they are in 'HH:MM' format
def _to_hhmm(raw_time):
    """Convert a numeric clock value such as 905 or 1430 to a zero-padded 'HH:MM' string."""
    digits = str(int(raw_time)).zfill(4)
    return f"{digits[:2]}:{digits[2:]}"

for time_column in ("CRS_DEP_TIME", "CRS_ARR_TIME"):
    df[time_column] = df[time_column].apply(_to_hhmm)

# Verify that times are properly formatted as 'HH:MM'
print(df[["CRS_DEP_TIME", "CRS_ARR_TIME"]].head())

In [None]:
# Build the Open-Meteo API client on top of an HTTP session that caches responses
# and retries failed requests
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)  # -1: cached responses never expire
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Archive endpoint that every weather request in this notebook is sent to
url = "https://archive-api.open-meteo.com/v1/archive"

In [None]:
def get_weather_data(date, time, latitude, longitude):
  """Fetch the hourly weather at one location for the hour of a flight.

  Queries the Open-Meteo archive endpoint (through the module-level ``openmeteo``
  client and ``url``) for the single day ``date`` and, for every requested
  variable, keeps the hourly value whose position in the series equals the hour
  of ``time``.

  Args:
    date: Flight date as a 'YYYY-MM-DD' string.
    time: Clock time as an 'HH:MM' string. NOTE(review): assumed to be local
      time at the given coordinates, because the request uses timezone 'auto'
      -- confirm against the flight data set.
    latitude: Latitude sent to the API.
    longitude: Longitude sent to the API.

  Returns:
    A dict mapping each variable name ('temperature_2m', 'precipitation', 'rain',
    'snowfall', 'weather_code', 'windspeed_10m', 'windspeed_100m',
    'winddirection_10m', 'winddirection_100m', 'windgusts_10m') to a single float
    for the flight's hour. Returns None when the response is empty, when a
    requested variable is missing from it, or when the returned series is too
    short to contain the flight's hour.

  Raises:
    ValueError: if ``date`` and ``time`` do not match '%Y-%m-%d %H:%M'.
    Errors raised by ``openmeteo.weather_api`` are not caught here.
  """
  flight_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H:%M")
  day = flight_datetime.strftime("%Y-%m-%d")

  params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": day,
    "end_date": day,
    "hourly": [
      "temperature_2m",
      "precipitation",
      "rain",
      "snowfall",
      "weather_code",
      "windspeed_10m",
      "windspeed_100m",
      "winddirection_10m",
      "winddirection_100m",
      "windgusts_10m",
    ],
    "timezone": "auto"
  }

  responses = openmeteo.weather_api(url, params=params)
  if not responses or not isinstance(responses, list):
    return None

  response = responses[0]
  hourly_data = response.Hourly() if response else None
  if not hourly_data:
    return None

  hourly_variables = [hourly_data.Variables(i) for i in range(hourly_data.VariablesLength())]

  # (output key, Variable enum member, altitude to match or None when altitude is not checked)
  wanted = [
    ("temperature_2m", Variable.temperature, 2),
    ("precipitation", Variable.precipitation, None),
    ("rain", Variable.rain, None),
    ("snowfall", Variable.snowfall, None),
    ("weather_code", Variable.weather_code, None),
    ("windspeed_10m", Variable.wind_speed, 10),
    ("windspeed_100m", Variable.wind_speed, 100),
    ("winddirection_10m", Variable.wind_direction, 10),
    ("winddirection_100m", Variable.wind_direction, 100),
    ("windgusts_10m", Variable.wind_gusts, 10),
  ]

  # Assumes the series holds one value per hour starting at 00:00 of the requested
  # day, so the flight's hour doubles as the index -- TODO confirm for DST days.
  hour_index = flight_datetime.hour

  weather_data = {}
  for key, variable, altitude in wanted:
    found = next(
      (
        candidate for candidate in hourly_variables
        if candidate.Variable() == variable
        and (altitude is None or candidate.Altitude() == altitude)
      ),
      None,
    )
    if found is None:
      print("One or more weather variables were not found in the response.")
      return None

    values = found.ValuesAsNumpy()
    if hour_index >= len(values):
      print("Weather response does not cover the requested hour.")
      return None

    # Store a plain scalar so each DataFrame cell / CSV field holds one number
    # instead of a whole day's array.
    weather_data[key] = float(values[hour_index])

  return weather_data

In [None]:
# Append the weather data to the data set

origin_weather_data = []
destination_weather_data = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Fetching weather data"):
    origin_weather = get_weather_data(row["FL_DATE"], row["CRS_DEP_TIME"], row["ORIGIN_LAT"], row["ORIGIN_LONG"])
    # get_weather_data returns None when a lookup fails; store an empty record so the
    # flight keeps its row and simply gets NaN weather values.
    origin_weather_data.append(origin_weather or {})

    destination_weather = get_weather_data(row["FL_DATE"], row["CRS_ARR_TIME"], row["DEST_LAT"], row["DEST_LONG"])
    destination_weather_data.append(destination_weather or {})

# pd.concat(axis=1) aligns on index labels, not on position. Give the weather frames
# df's own index instead of a fresh 0..n-1 range (rows may have been dropped from df
# earlier), otherwise weather values land on the wrong flights and extra NaN rows appear.
origin_weather_df = pd.DataFrame(origin_weather_data, index=df.index).add_prefix('origin_')
destination_weather_df = pd.DataFrame(destination_weather_data, index=df.index).add_prefix('destination_')

df = pd.concat([df, origin_weather_df, destination_weather_df], axis=1)

print(df.head())

In [None]:
# Keep the flight features we want plus every weather column that was appended

required_columns = [
    'FL_DATE', 'AIRLINE', 'FL_NUMBER',
    'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME',
    'DISTANCE',
    'DEP_DELAY', 'ARR_DELAY', 'DELAY_DUE_WEATHER',
]

# Weather columns are recognised by their lower-case 'origin_' / 'destination_' prefix
weather_columns = [col for col in df.columns if col.startswith(('origin_', 'destination_'))]

final_columns = [*required_columns, *weather_columns]

df = df.loc[:, final_columns]

print(df.head())

In [None]:
# Persist the cleaned, weather-enriched data set as a CSV file (without the index column)

output_csv = 'flight_data_with_weather.csv'
df.to_csv(output_csv, index=False)

print("Data saved to 'flight_data_with_weather.csv'")