In [2]:
import pandas as pd
import numpy as np

# Read the raw flight extract and immediately discard rows that are missing
# any of DEP_TIME, ARR_DELAY or CRS_DEP_TIME.
flight_df = pd.read_csv("../data/raw/flights_2025_01.csv").dropna(
    subset=['DEP_TIME', 'ARR_DELAY', 'CRS_DEP_TIME'],
)

# Read the raw weather extract; it is joined onto the flights further down.
weather_df = pd.read_csv("../data/raw/weather_2025_01.csv")

def get_hour_from_time(dep_time):
    """Return the hour part of an HHMM-encoded clock value.

    The value is converted with ``int()`` and integer-divided by 100, so
    ``1530`` and ``1530.0`` both give ``15``.  Note that ``2400`` gives ``24``;
    this function does not wrap it to 0.

    Args:
        dep_time: Anything ``int()`` accepts (int, float, numeric string).

    Returns:
        The hour as an int, or 0 when the value cannot be converted
        (e.g. ``None``, NaN, infinity, non-numeric text).
    """
    try:
        return int(dep_time) // 100
    # Only conversion failures fall back to 0; anything else (including
    # KeyboardInterrupt / SystemExit) propagates instead of being swallowed.
    except (TypeError, ValueError, OverflowError):
        return 0

# Hour of departure for every flight; an hour value of 24 is folded to 0.
dep_hours = flight_df['DEP_TIME'].apply(get_hour_from_time)
flight_df['DepHour'] = dep_hours.replace(24, 0)

# Join key for the weather table: the flight date shifted by the departure hour.
flight_df['FL_DATE'] = pd.to_datetime(flight_df['FL_DATE'])
hour_offset = pd.to_timedelta(flight_df['DepHour'], unit='h')
flight_df['Join_Time'] = flight_df['FL_DATE'] + hour_offset

def convert_crs_to_minutes(crs_time):
    """Convert an HHMM-encoded clock value to minutes since midnight.

    The value is converted with ``int()``; the hundreds and above are taken
    as hours and the last two digits as minutes, so ``1430`` gives
    ``14 * 60 + 30 == 870``.

    Args:
        crs_time: Anything ``int()`` accepts (int, float, numeric string).

    Returns:
        Minutes since midnight as an int, or 0 when the value cannot be
        converted (e.g. ``None``, NaN, infinity, non-numeric text).
    """
    try:
        hours, minutes = divmod(int(crs_time), 100)
    # Only conversion failures fall back to 0; anything else (including
    # KeyboardInterrupt / SystemExit) propagates instead of being swallowed.
    except (TypeError, ValueError, OverflowError):
        return 0
    return hours * 60 + minutes

# CRS_DEP_TIME expressed as minutes since midnight.
crs_minutes = flight_df['CRS_DEP_TIME'].apply(convert_crs_to_minutes)
flight_df['CRS_MINUTES'] = crs_minutes

# Parse the weather timestamps so they can be compared with Join_Time, then
# left-join the weather rows onto the flights by (airport, timestamp).
# Flights with no matching weather row keep NaN in the weather columns.
weather_df['time'] = pd.to_datetime(weather_df['time'])
merged_df = flight_df.merge(
    weather_df,
    how='left',
    left_on=['ORIGIN', 'Join_Time'],
    right_on=['Airport', 'time'],
)

# Keep only rows that have a temperature, a precipitation value and a
# DEP_DEL15 value after the weather join.
df_clean = merged_df.dropna(subset=['temp', 'DEP_DEL15', 'prcp'])

# Columns written to the final dataset, in output order.
cols_to_keep = [
    'ORIGIN',
    'DEST',
    'OP_UNIQUE_CARRIER',
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'CRS_MINUTES',
    'DISTANCE',
    'temp',
    'rhum',
    'prcp',
    'wspd',
    'coco',
    'DEP_DEL15',
]

# Select the columns and cast the calendar/time fields to int in one pass;
# astype returns a new frame, so df_final is independent of df_clean.
df_final = df_clean[cols_to_keep].astype(
    {col: int for col in ('DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_MINUTES')}
)

# Report the size of the final dataset and show up to the first 30000 rows
# (pandas abbreviates the printed frame according to its display options).
preview = df_final.head(30000)
print(f"Dataset shape: {df_final.shape}")
print(preview)

# Persist the dataset without the index column, then confirm.
output_path = "../data/processed/flights_final_dataset.csv"
df_final.to_csv(output_path, index=False)
print("File saved successfully")


  flight_df['FL_DATE'] = pd.to_datetime(flight_df['FL_DATE'])


Dataset shape: (190679, 13)
      ORIGIN DEST OP_UNIQUE_CARRIER  DAY_OF_MONTH  DAY_OF_WEEK  CRS_MINUTES  \
0        SFO  JFK                AA             1            3          630   
1        JFK  SFO                AA             1            3          360   
3        JFK  LAX                AA             1            3         1260   
5        LAX  JFK                AA             1            3          690   
10       JFK  LAX                AA             1            3          419   
...      ...  ...               ...           ...          ...          ...   
80616    DFW  ORD                F9             5            7          747   
80618    LAX  LAS                F9             5            7         1143   
80619    LAS  LAX                F9             5            7          702   
80620    LAX  SMF                F9             5            7          840   
80622    MCO  MKE                F9             5            7          450   

       DISTANCE  temp  