<a href="https://colab.research.google.com/github/haraldriisager/ML-Project/blob/evan/Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown
import gdown
import pandas as pd

file_id = '1Uf3q-CoSVK84kogmfTAtvIZ3k8crjN1d'
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'data.csv', quiet=False)
df = pd.read_csv('data.csv')

print(df.head())



Downloading...
From (original): https://drive.google.com/uc?id=1Uf3q-CoSVK84kogmfTAtvIZ3k8crjN1d
From (redirected): https://drive.google.com/uc?id=1Uf3q-CoSVK84kogmfTAtvIZ3k8crjN1d&confirm=t&uuid=14fff4ba-4a5d-4dd3-8b9a-f926308e7de6
To: /content/data.csv
100%|██████████| 256M/256M [00:04<00:00, 58.5MB/s]


   FlightID    Airline  FlightNumber Origin Destination ScheduledDeparture  \
0         1     United          4558    ORD         MIA   2024-09-01 08:11   
1         2      Delta          8021    LAX         MIA   2024-09-01 10:25   
2         3  Southwest          7520    DFW         SFO   2024-09-01 16:53   
3         4      Delta          2046    ORD         BOS   2024-09-01 14:44   
4         5      Delta          6049    LAX         SEA   2024-09-01 01:51   

    ActualDeparture  ScheduledArrival     ActualArrival  DelayMinutes  \
0  2024-09-01 08:30  2024-09-01 12:11  2024-09-01 12:19             8   
1  2024-09-01 10:41  2024-09-01 13:25  2024-09-01 13:27             2   
2  2024-09-01 17:05  2024-09-01 17:53  2024-09-01 18:07            14   
3  2024-09-01 15:04  2024-09-01 18:44  2024-09-01 18:34           -10   
4  2024-09-01 02:08  2024-09-01 05:51  2024-09-01 06:15            24   

           DelayReason  Cancelled  Diverted AircraftType TailNumber  Distance  
0           

In [None]:
def classify_delay(delay_minutes):
    """Map a delay expressed in minutes to a coarse delay label.

    Buckets (upper bounds are inclusive):
      * up to 15 (including negative values) -> 'No delay'
      * above 15, up to 30                   -> 'Slight delay'
      * above 30, up to 60                   -> 'Moderate delay'
      * everything else                      -> 'Severe delay'

    A value for which every comparison is False (for example NaN) falls
    through to 'Severe delay'.
    """
    # Ordered (inclusive upper bound, label) pairs; the first bucket that
    # contains the value wins.
    buckets = (
        (15, 'No delay'),
        (30, 'Slight delay'),
        (60, 'Moderate delay'),
    )
    for upper_bound, label in buckets:
        if delay_minutes <= upper_bound:
            return label
    return 'Severe delay'

# Label every row by passing its DelayMinutes value through classify_delay
delay_minutes = df['DelayMinutes']
df['delay_category'] = delay_minutes.apply(classify_delay)

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Select relevant features
# NOTE(review): DelayReason is presumably only known once a delay has happened;
# confirm it is available at prediction time, otherwise it leaks the target.
features = ['Airline', 'FlightNumber', 'Origin', 'Destination', 'ScheduledDeparture',
            'ScheduledArrival', 'Distance', 'DelayReason', 'AircraftType']
# Work on an independent copy holding only the selected columns plus the
# label, so the assignments below do not trigger SettingWithCopyWarning.
df = df[features + ['delay_category']].copy()

# Encode categorical columns as integer codes; missing values are encoded as
# their own "Unknown" class.
# Columns are replaced with `df[col] = ...` instead of `df.loc[:, col] = ...`:
# the .loc form writes into the existing column in place, so (since pandas 2.0)
# an object-dtype column keeps its object dtype instead of taking the dtype of
# the new values.
categorical_cols = ['Airline', 'Origin', 'Destination', 'DelayReason', 'AircraftType']
label_encoders = {}  # fitted encoder per column, so codes can be mapped back to labels
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col].fillna("Unknown"))
    label_encoders[col] = label_encoder

# Convert ScheduledDeparture and ScheduledArrival to datetime.
# Direct assignment makes the columns real datetime columns, which the .dt
# accessor used below requires.
df['ScheduledDeparture'] = pd.to_datetime(df['ScheduledDeparture'])
df['ScheduledArrival'] = pd.to_datetime(df['ScheduledArrival'])

# Feature engineering: hour of day of the scheduled departure and arrival
df['DepartureHour'] = df['ScheduledDeparture'].dt.hour
df['ArrivalHour'] = df['ScheduledArrival'].dt.hour

# Standardize numerical columns
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split done in a later cell, so test rows influence the scaling.
numeric_cols = ['FlightNumber', 'Distance', 'DepartureHour', 'ArrivalHour']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column (y) from the predictor columns (X)
y = df['delay_category']
X = df.drop(columns=['delay_category'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Drop the raw datetime columns (if still present) from BOTH splits, so the
# training and test frames keep the same feature columns from this point on.
datetime_cols = ['ScheduledDeparture', 'ScheduledArrival']
X_train = X_train.drop(columns=datetime_cols, errors='ignore')
X_test = X_test.drop(columns=datetime_cols, errors='ignore')

# Print the remaining column dtypes so the reader can check they are numeric
print(X_train.dtypes)

# n_jobs=-1 builds the trees in parallel on all available cores; random_state
# keeps the fitted forest reproducible.
rf_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

Airline            int64
FlightNumber     float64
Origin             int64
Destination        int64
Distance         float64
DelayReason        int64
AircraftType       int64
DepartureHour    float64
ArrivalHour      float64
dtype: object


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Remove the raw datetime columns from both splits if they are still present
datetime_columns = ['ScheduledArrival', 'ScheduledDeparture']
X_train = X_train.drop(columns=datetime_columns, errors='ignore')
X_test = X_test.drop(columns=datetime_columns, errors='ignore')

# Predict the delay category for the held-out rows
y_pred = rf_model.predict(X_test)

# Per-class precision/recall/F1, followed by the confusion matrix
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

              precision    recall  f1-score   support

    No delay       0.67      0.83      0.74    222103
Slight delay       0.50      0.30      0.37    127423

    accuracy                           0.64    349526
   macro avg       0.59      0.56      0.56    349526
weighted avg       0.61      0.64      0.61    349526

[[184369  37734]
 [ 89722  37701]]
