In [26]:
# Import Dependencies
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data
airlines_df = pd.read_csv(Path('./Resources/updated_flights_data.csv'))
airlines_df.head()

Unnamed: 0,carrier_code,flight_number,tail_number,destination_airport,scheduled_date,scheduled_departure_time,actual_departure_time,departure_delay,wheels_off,taxi_out_time,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircraft_arrival,departure_from,is_delayed
0,UA,438,N39297,EWR,1/4/22,10:00:00,10:19:00,19,10:38:00,19,8,0,0,0,8,ATL,1
1,UA,593,N17233,EWR,1/4/22,21:45:00,21:55:00,10,22:10:00,15,0,0,0,0,0,ATL,0
2,UA,1249,,EWR,1/4/22,5:45:00,0:00:00,0,0:00:00,0,0,0,0,0,0,ATL,0
3,UA,1575,N815UA,EWR,1/4/22,16:00:00,15:59:00,-1,16:12:00,13,0,0,45,0,0,ATL,1
4,UA,1680,N809UA,EWR,1/4/22,12:20:00,13:11:00,51,13:24:00,13,2,0,26,0,49,ATL,1


In [7]:
# Add column to create binary target based on departure_delay column
def create(row):
    if row['departure_delay'] <= 0:
        val = 0
    elif row['departure_delay'] > 0:
        val = 1
    else:
        val = -1
    return val

airlines_df['delay_outcome'] = airlines_df.apply(create, axis = 1)
airlines_df.head()

Unnamed: 0,carrier_code,flight_number,tail_number,destination_airport,scheduled_date,scheduled_departure_time,actual_departure_time,departure_delay,wheels_off,taxi_out_time,delay_carrier,delay_weather,delay_national_aviation_system,delay_security,delay_late_aircraft_arrival,departure_from,is_delayed,delay_outcome
0,UA,438,N39297,EWR,1/4/22,10:00:00,10:19:00,19,10:38:00,19,8,0,0,0,8,ATL,1,1
1,UA,593,N17233,EWR,1/4/22,21:45:00,21:55:00,10,22:10:00,15,0,0,0,0,0,ATL,0,1
2,UA,1249,,EWR,1/4/22,5:45:00,0:00:00,0,0:00:00,0,0,0,0,0,0,ATL,0,0
3,UA,1575,N815UA,EWR,1/4/22,16:00:00,15:59:00,-1,16:12:00,13,0,0,45,0,0,ATL,1,0
4,UA,1680,N809UA,EWR,1/4/22,12:20:00,13:11:00,51,13:24:00,13,2,0,26,0,49,ATL,1,1


In [10]:
# Check to see null values
airlines_df.isnull().sum()

carrier_code                        0
flight_number                       0
tail_number                       246
destination_airport                 0
scheduled_date                      0
scheduled_departure_time            0
actual_departure_time               0
departure_delay                     0
wheels_off                          0
taxi_out_time                       0
delay_carrier                       0
delay_weather                       0
delay_national_aviation_system      0
delay_security                      0
delay_late_aircraft_arrival         0
departure_from                      0
is_delayed                          0
delay_outcome                       0
dtype: int64

In [11]:
# Drop rows with null values
airlines_df = airlines_df.dropna()

In [13]:
# Check dtypes
airlines_df.dtypes

carrier_code                      object
flight_number                      int64
tail_number                       object
destination_airport               object
scheduled_date                    object
scheduled_departure_time          object
actual_departure_time             object
departure_delay                    int64
wheels_off                        object
taxi_out_time                      int64
delay_carrier                      int64
delay_weather                      int64
delay_national_aviation_system     int64
delay_security                     int64
delay_late_aircraft_arrival        int64
departure_from                    object
is_delayed                         int64
delay_outcome                      int64
dtype: object

In [18]:
# Create our features
X = airlines_df.copy()
X = X.drop(["departure_delay", "delay_carrier", "delay_weather", "delay_national_aviation_system", "delay_security", "delay_late_aircraft_arrival", "departure_from", "is_delayed", "delay_outcome"], axis = 1)
X = pd.get_dummies(X)

# Create our target
y = airlines_df["delay_outcome"]

In [19]:
# Look at X
X.head()

Unnamed: 0,flight_number,taxi_out_time,carrier_code_AA,carrier_code_B6,carrier_code_DL,carrier_code_F9,carrier_code_NK,carrier_code_UA,carrier_code_WN,tail_number_N101DQ,...,wheels_off_9:50:00,wheels_off_9:51:00,wheels_off_9:52:00,wheels_off_9:53:00,wheels_off_9:54:00,wheels_off_9:55:00,wheels_off_9:56:00,wheels_off_9:57:00,wheels_off_9:58:00,wheels_off_9:59:00
0,438,19,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,593,15,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1575,13,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1680,13,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,225,14,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Split the Data into Training and Testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

In [27]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
X.shape

(131062, 6929)

In [29]:
# Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [30]:
# Train the data
classifier.fit(X_train, y_train)

In [31]:
# Predict outcomes for test data set
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,1,0
1,0,0
2,1,1
3,1,1
4,1,1


In [32]:
# Calculate the balance accuracy score
print(accuracy_score(y_test, y_pred))

0.6836354757980834
