In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

In [51]:
# Load the data
data = pd.read_csv('resources/flights.csv')
data.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,3,EV,20366,EV,N48901,4397,13930,1393007,ORD,...,1197705,GRB,1003.0,0.0,1000-1059,1117.0,0.0,0.0,0.0,174.0
1,1,3,EV,20366,EV,N16976,4401,15370,1537002,TUL,...,1393007,ORD,1027.0,0.0,1000-1059,1216.0,0.0,0.0,0.0,585.0
2,1,3,EV,20366,EV,N12167,4404,11618,1161802,EWR,...,1541205,TYS,1848.0,0.0,1800-1859,2120.0,0.0,0.0,0.0,631.0
3,1,3,EV,20366,EV,N14902,4405,10781,1078105,BTR,...,1226603,IAH,1846.0,0.0,1800-1859,2004.0,0.0,0.0,0.0,253.0
4,1,3,EV,20366,EV,N606UX,4407,14524,1452401,RIC,...,1226603,IAH,1038.0,0.0,1000-1059,1330.0,0.0,0.0,0.0,1157.0


In [52]:
# Clean and prepare the data
data.dropna(inplace=True)
data.drop(['TAIL_NUM', 'ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_SEQ_ID'], axis=1, inplace=True)



In [53]:
# Encode categorical variables
label_encoder = LabelEncoder()
categorical_columns = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME_BLK']
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])



In [54]:
# Split the data into training and testing datasets
X = data.drop('ARR_DEL15', axis=1)
y = data['ARR_DEL15']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
# Train the model (Logistic Regression)
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Train the model (Random Forest Classifier)
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

# Train the model (XGBClassifier)
XGBClassifier_model = RandomForestClassifier()
XGBClassifier_model.fit(X_train, y_train)

RandomForestClassifier()

In [57]:
# Test the model (Logistic Regression)
y_pred_lr = logistic_regression_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print('Logistic Regression accuracy:', accuracy_lr)

Logistic Regression accuracy: 0.9308241694061108


In [58]:
# Test the model (Random Forest Classifier)
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print('Random Forest Classifier accuracy:', accuracy_rf)

Random Forest Classifier accuracy: 0.9375156440335742


In [59]:
# Test the model (XGBClassifier)
y_pred_rf = XGBClassifier_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print('XGBClassifier_model accuracy:', accuracy_rf)

XGBClassifier_model accuracy: 0.9375907353947303
