<a href="https://colab.research.google.com/github/haraldriisager/ML-Project/blob/weather-api/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install gdown
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas



In [26]:
import gdown
import pandas as pd
import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta

In [36]:
# Import Flight Delay data set

file_id = '14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'data.csv', quiet=False)
df = pd.read_csv('data.csv')

print(df.columns)

Downloading...
From (original): https://drive.google.com/uc?id=14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz
From (redirected): https://drive.google.com/uc?id=14aF7ZORUZFGKoAG7IE9Cd6vjTLtU2ytz&confirm=t&uuid=9eb5255e-0c11-4c6c-8e44-1631f2e3eadf
To: /content/data.csv
100%|██████████| 614M/614M [00:10<00:00, 56.4MB/s]


Index(['FL_DATE', 'AIRLINE', 'AIRLINE_DOT', 'AIRLINE_CODE', 'DOT_CODE',
       'FL_NUMBER', 'ORIGIN', 'ORIGIN_CITY', 'DEST', 'DEST_CITY',
       'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
       'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'DELAY_DUE_CARRIER',
       'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY',
       'DELAY_DUE_LATE_AIRCRAFT'],
      dtype='object')


In [37]:
# Import Airport locations data set

file_id = '1eK1b3XX3jl-9XtQrH-_924YjV3rlqUFB'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'airport_locations.csv', quiet=False)
airport_locations = pd.read_csv('airport_locations.csv', delimiter=';')
airport_locations = airport_locations[['Airport Code', 'Latitude', 'Longitude']].dropna()
airport_locations = airport_locations.set_index('Airport Code').to_dict(orient='index')

Downloading...
From: https://drive.google.com/uc?id=1eK1b3XX3jl-9XtQrH-_924YjV3rlqUFB
To: /content/airport_locations.csv
100%|██████████| 875k/875k [00:00<00:00, 77.2MB/s]


In [38]:
# Append the origin and destination coordinates to the data set

def get_airport_coordinates(airport_code):
  if airport_code in airport_locations:
    return airport_locations[airport_code]['Latitude'], airport_locations[airport_code]['Longitude']
  return None, None

df["ORIGIN_LAT"], df["ORIGIN_LONG"] = zip(*df["ORIGIN"].apply(get_airport_coordinates))
df["DEST_LAT"], df["DEST_LONG"] = zip(*df["DEST"].apply(get_airport_coordinates))

In [39]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)
url = "https://archive-api.open-meteo.com/v1/archive"

In [52]:
def get_weather_data(date, time, latitude, longitude):

  flight_datetime = datetime.strptime(f"{date} {time}", "%Y-%m-%d %H%M")
  start_date = flight_datetime.strftime("%Y-%m-%d")
  end_date = (flight_datetime + timedelta(hours=1)).strftime("%Y-%m-%d")

  params = {
    "latitude": latitude,
    "longitude": longitude,
    "start_date": date,
    "end_date": date,
    "hourly": [
        "temperature_2m",
        "precipitation",
        "rain",
        "snowfall",
        "weather_code",
        "windspeed_10m",
        "windspeed_100m",
        "winddirection_10m",
        "winddirection_100m",
        "windgusts_10m",
    ],
    "timezone": "auto"
  }

  responses = openmeteo.weather_api(url, params=params)

  if responses and isinstance(responses, list):
    response = responses[0]
    hourly_data = response.Hourly() if response else None
  else:
    hourly_data = None

  if hourly_data:
    hourly_variables = list(map(lambda i: hourly_data.Variables(i), range(0, hourly_data.VariablesLength())))

    weather_data = {}

    try:
        weather_data["temperature_2m"] = next(
            filter(lambda x: x.Variable() == Variable.temperature and x.Altitude() == 2, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["precipitation"] = next(
            filter(lambda x: x.Variable() == Variable.precipitation, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["rain"] = next(
            filter(lambda x: x.Variable() == Variable.rain, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["snowfall"] = next(
            filter(lambda x: x.Variable() == Variable.snowfall, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windspeed_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_speed and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windspeed_100m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_speed and x.Altitude() == 100, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["winddirection_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_direction and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["winddirection_100m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_direction and x.Altitude() == 100, hourly_variables)
        ).ValuesAsNumpy()
        weather_data["windgusts_10m"] = next(
            filter(lambda x: x.Variable() == Variable.wind_gusts and x.Altitude() == 10, hourly_variables)
        ).ValuesAsNumpy()
    except StopIteration:
        print("One or more weather variables were not found in the response.")
        return None

    return weather_data

  return None

In [53]:
# Append the weather data to the data set

origin_weather_data = []
destination_weather_data = []

for _, row in df.iterrows():
  origin_weather = get_weather_data(row["FL_DATE"], row["CRS_DEP_TIME"], row["ORIGIN_LAT"], row["ORIGIN_LONG"])
  destination_weather = get_weather_data(row["FL_DATE"], row["CRS_ARR_TIME"], row["DEST_LAT"], row["DEST_LONG"])

  origin_weather_data.append(origin_weather)
  destination_weather_data.append(destination_weather)

origin_weather_df = pd.DataFrame(origin_weather_data).add_prefix('origin_')
destination_weather_df = pd.DataFrame(destination_weather_data).add_prefix('destination_')

df = pd.concat([df, origin_weather_df, destination_weather_df], axis=1)

print(df.head())

NameError: name 'Variable' is not defined

In [35]:
# Clean data so that only the desired features are present

required_columns = [
    'FlightDate',
    'Airline',
    'FlightNumber',
    'Origin',
    'Destination',
    'ScheduledDeparture',
    'ScheduledArrival',
    'ScheduledElapsedTime',
    'Distance',
    'DepartureDelay',
    'ArrivalDelay',
    'DelayDueWeather'
]

weather_columns = [col for col in df.columns if col.startswith('origin_') or col.startswith('destination_')]

final_columns = required_columns + weather_columns

df_cleaned = df[final_columns]

print(df_cleaned.head())

KeyError: "['FlightDate', 'ScheduledElapsedTime', 'DepartureDelay', 'ArrivalDelay', 'DelayDueWeather'] not in index"