### Load flights

In [14]:
import pandas as pd

# Read the raw flight records from the CSV file in the working directory
flights = pd.read_csv(filepath_or_buffer='flights.csv')

# Show the first five rows as a quick check of the loaded columns
flights.head(n=5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Carrier,OriginAirportID,OriginAirportName,OriginCity,OriginState,DestAirportID,DestAirportName,DestCity,DestState,CRSDepTime,DepDelay,DepDel15,CRSArrTime,ArrDelay,ArrDel15,Cancelled
0,2013,9,16,1,DL,15304,Tampa International,Tampa,FL,12478,John F. Kennedy International,New York,NY,1539,4,0.0,1824,13,0,0
1,2013,9,23,1,WN,14122,Pittsburgh International,Pittsburgh,PA,13232,Chicago Midway International,Chicago,IL,710,3,0.0,740,22,1,0
2,2013,9,7,6,AS,14747,Seattle/Tacoma International,Seattle,WA,11278,Ronald Reagan Washington National,Washington,DC,810,-3,0.0,1614,-7,0,0
3,2013,7,22,1,OO,13930,Chicago O'Hare International,Chicago,IL,11042,Cleveland-Hopkins International,Cleveland,OH,804,35,1.0,1027,33,1,0
4,2013,5,16,4,DL,13931,Norfolk International,Norfolk,VA,10397,Hartsfield-Jackson Atlanta International,Atlanta,GA,545,-1,0.0,728,-9,0,0


### Clean flights, encode categorical variables

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace every missing value with 0 so that no NaN remains in the frame
flights = flights.fillna(0)

# Text columns that are converted to integer codes
categorical_columns = [
    'Carrier',
    'OriginAirportName',
    'DestAirportName',
    'OriginCity',
    'DestCity',
    'OriginState',
    'DestState',
]

# Fit a separate LabelEncoder per column and keep every fitted encoder.
# Reusing a single encoder overwrites its fitted classes on each call, so only
# the last column's mapping would survive and the other columns could not be
# decoded or applied consistently to new data.
# NOTE: re-running this cell without reloading `flights` fits the encoders on
# the already-encoded integers, which loses the original label mapping.
encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    flights[column] = label_encoder.fit_transform(flights[column])
    encoders[column] = label_encoder

flights.head()

### Select features and the model target

In [40]:
# Target column: a copy of `ArrDel15` stored under the name `Delayed`
flights['Delayed'] = flights['ArrDel15']

# Model inputs: the month plus the origin and destination airport IDs
features = ['Month', 'OriginAirportID', 'DestAirportID']
X = flights.loc[:, features]
y = flights.loc[:, 'Delayed']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Display the fitted estimator
model

### Export model to file

In [None]:
import joblib

# Serialize the fitted model to a file in the working directory
joblib.dump(value=model, filename='flight_delay_model.pkl')

### Example of how to load the model

In [None]:
# Restore the model from the file written by joblib.dump.
# joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load(filename='flight_delay_model.pkl')

In [42]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the label of every held-out row
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.7756490402294623
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.97      0.87     42766
           1       0.35      0.06      0.10     11622

    accuracy                           0.78     54388
   macro avg       0.57      0.51      0.49     54388
weighted avg       0.70      0.78      0.71     54388



### Example of how to predict the probability of a flight being delayed

In [29]:
# Example prediction for a given day and airport pair
example = pd.DataFrame({
    'Year': [2013],
    'Month': [9],
    'DayofMonth': [16],
    'DayOfWeek': [1],
    'OriginAirportID': [15304],
    'DestAirportID': [12478],
    'CRSDepTime': [1539]
})

# The model must receive exactly the columns it was fitted on, in the same
# order. Passing the full `example` frame (which has extra columns) makes
# scikit-learn raise a ValueError, so select the fitted columns by name.
model_inputs = example[list(model.feature_names_in_)]

# Predict the probability of delay; column 1 of predict_proba corresponds to
# the second entry of model.classes_ (class 1 for this 0/1 target)
probability = model.predict_proba(model_inputs)[:, 1]
print(f'Probability of delay: {probability[0]}')

Probability of delay: 0.0
