In [None]:
from google.colab import drive
import pandas as pd

# Attach Google Drive to the Colab filesystem so the CSV below is reachable
drive.mount('/content/drive')

# Location of the flights CSV inside the mounted Drive
path = '/content/drive/MyDrive/dataset/flights.csv'

# Read the whole file into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype rather than a mix.
df = pd.read_csv(
    path,
    low_memory=False,
)

# Preview the first five rows as the cell output
df.head(5)

In [None]:
# Create the binary target variable
# 1 = Delayed (>15 mins), 0 = On Time (<=15 mins)
#
# Gotcha: a comparison against NaN evaluates to False, so any row whose
# ARRIVAL_DELAY is missing ends up with Is_Delayed == 0 here even though its
# real outcome is unknown (presumably cancelled/diverted flights - confirm).
# The column is kept as plain int for downstream compatibility; rows with a
# missing ARRIVAL_DELAY must be removed before this label is used for modelling.
df['Is_Delayed'] = (df['ARRIVAL_DELAY'] > 15).astype(int)

# Report the class balance only over flights with a recorded arrival delay,
# so rows with an unknown outcome do not inflate the "On Time" share.
has_arrival_delay = df['ARRIVAL_DELAY'].notna()

print("Target Variable Distribution:")
print(df.loc[has_arrival_delay, 'Is_Delayed'].value_counts(normalize=True))
print(f"\nRows with missing ARRIVAL_DELAY (excluded above): {(~has_arrival_delay).sum()}")

In [None]:
# Count the null entries in every column, then show only the columns that
# actually contain gaps.
missing_data = df.isna().sum()
columns_with_gaps = missing_data.loc[missing_data.gt(0)]
print("Missing values per column:\n", columns_with_gaps)

# Count rows that are exact copies of an earlier row
duplicate_row_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_row_count}")

In [None]:
# Keep only columns that are at least 90% populated.
# DataFrame.dropna(thresh=N, axis=1) KEEPS a column when it has at least N
# non-null values, so with N = 90% of the row count this removes every column
# that has more than 10% missing values (not only those that are >90% empty).
# NOTE(review): if the intent was to drop only columns that are more than 90%
# empty, the threshold would have to be len(df) * 0.1 instead - confirm.
limit = len(df) * 0.9
df_cleaned = df.dropna(thresh=limit, axis=1)

# Make the effect of the column filter visible
dropped_cols = [col for col in df.columns if col not in df_cleaned.columns]
print(f"Columns dropped (more than 10% missing): {dropped_cols}")

# Drop rows with missing values in critical columns
critical_cols = ['DEPARTURE_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']
df_cleaned = df_cleaned.dropna(subset=critical_cols)

# Check how much data is left
print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_cleaned)}")
print("\nRemaining missing values:")
# Total count of null cells across all remaining columns
print(df_cleaned.isnull().sum().sum())