In [14]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load your data
data = pd.read_csv('flight_data.csv')

In [3]:
data.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,DEST,DEP_DELAY,CRS_ELAPSED_TIME,DISTANCE,CRS_DEP_M,...,Humidity,Wind,Wind Speed,Wind Gust,Pressure,Condition,sch_dep,sch_arr,TAXI_OUT,Is_delayed
0,11,1,5,B6,N828JB,CHS,-1,124,636,324,...,58,W,25,38,29.86,Fair / Windy,9,17,14,0
1,11,1,5,B6,N992JB,LAX,-7,371,2475,340,...,58,W,25,38,29.86,Fair / Windy,9,17,15,0
2,11,1,5,B6,N959JB,FLL,40,181,1069,301,...,58,W,25,38,29.86,Fair / Windy,9,17,22,1
3,11,1,5,B6,N999JQ,MCO,-2,168,944,345,...,58,W,25,38,29.86,Fair / Windy,9,17,12,0
4,11,1,5,DL,N880DN,ATL,-4,139,760,360,...,58,W,24,35,29.91,Fair / Windy,9,17,13,0


In [4]:
data.dropna(inplace=True)
data["Dew Point"] = data["Dew Point"].astype("int64")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8825 entries, 0 to 8824
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MONTH              8825 non-null   int64  
 1   DAY_OF_MONTH       8825 non-null   int64  
 2   DAY_OF_WEEK        8825 non-null   int64  
 3   OP_UNIQUE_CARRIER  8825 non-null   object 
 4   TAIL_NUM           8825 non-null   object 
 5   DEST               8825 non-null   object 
 6   DEP_DELAY          8825 non-null   int64  
 7   CRS_ELAPSED_TIME   8825 non-null   int64  
 8   DISTANCE           8825 non-null   int64  
 9   CRS_DEP_M          8825 non-null   int64  
 10  DEP_TIME_M         8825 non-null   int64  
 11  CRS_ARR_M          8825 non-null   int64  
 12  Temperature        8825 non-null   int64  
 13  Dew Point          8825 non-null   int64  
 14  Humidity           8825 non-null   int64  
 15  Wind               8825 non-null   object 
 16  Wind Speed         8825 

In [5]:
features = ['CRS_ELAPSED_TIME', 'DISTANCE', 'CRS_DEP_M', 'DEP_TIME_M', 'CRS_ARR_M',
            'Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust',
            'Pressure', 'sch_dep', 'sch_arr', 'TAXI_OUT']

In [6]:
X = data[features]
y = data['Is_delayed']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Save the model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [15]:
from sklearn.model_selection import GridSearchCV

parameters = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}
grid_search_log_reg = GridSearchCV(model, parameters, cv=5)
grid_search_log_reg.fit(X, y)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

log_reg_predictions = grid_search_log_reg.predict(X_test)

In [19]:
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
log_reg_precision = precision_score(y_test, log_reg_predictions)
log_reg_recall = recall_score(y_test, log_reg_predictions)

Logistic Regression:
Accuracy: 0.9031161473087819
Precision: 1.0
Recall: 0.03389830508474576


In [21]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

parameters_rf = {'max_depth': range(1, 21)}

grid_search_rf = GridSearchCV(rf_model, parameters_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X, y)

In [22]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()

parameters_xgb = {'max_depth': range(1, 21)}

grid_search_xgb = GridSearchCV(xgb_model, parameters_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X, y)

In [23]:
rf_predictions = grid_search_rf.predict(X_test)

xgb_predictions = grid_search_xgb.predict(X_test)

In [27]:
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)

xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)

print("Evaluation results:")
print("Logistic Regression:")
print("Accuracy:", log_reg_accuracy)
print("Precision:", log_reg_precision)
print("Recall:", log_reg_recall)
print("\nRandom Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("\nXGBoost:")
print("Accuracy:", xgb_accuracy)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)

Evaluation results:
Logistic Regression:
Accuracy: 0.9031161473087819
Precision: 1.0
Recall: 0.03389830508474576

Random Forest:
Accuracy: 0.8997167138810198
Precision: 0.0
Recall: 0.0

XGBoost:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
