In [None]:
import pandas as pd
# Load the raw CSV (path is relative to the working directory) into `df`;
# the data-cleaning cell selects its columns from this frame.
df = pd.read_csv('2018.csv')

In [None]:
#Data Cleaning

#keep only the columns used for the model features / target
important_columns = ['FL_DATE', 'OP_CARRIER', 'ORIGIN', 'CRS_DEP_TIME', 'DEP_DELAY']

#remove rows with a null in any of those columns
df_cleaned = df[important_columns].dropna(subset=important_columns)

#seeded down-sample to at most 100000 rows; capping at the number of available
#rows keeps sample() from raising ValueError on a smaller input file.
#.copy() makes df_cleaned an independent frame so the column assignments below
#do not trigger SettingWithCopyWarning.
df_cleaned = df_cleaned.sample(n=min(100000, len(df_cleaned)), random_state=1).copy()

#cleaning data types
df_cleaned['FL_DATE'] = pd.to_datetime(df_cleaned['FL_DATE'])
df_cleaned['CRS_DEP_TIME'] = df_cleaned['CRS_DEP_TIME'].astype(int)
df_cleaned['DEP_DELAY'] = df_cleaned['DEP_DELAY'].astype(float)

#derive month and day of week (dt.dayofweek: Monday=0 ... Sunday=6) from the flight date
df_cleaned['MONTH'] = df_cleaned['FL_DATE'].dt.month
df_cleaned['DAY_OF_WEEK'] = df_cleaned['FL_DATE'].dt.dayofweek



In [None]:
# Checking for correlation between arr_delay and all other variables

# corr_matrix = df.corr()

# # Check correlation with 'arr_delay'
# arr_delay_corr = corr_matrix['arr_delay']

# print("Correlation with arr_delay:")
# print(arr_delay_corr)

# print(df.head(3))

In [None]:
#Linear Regression Algorithm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


#integer-code airlines and airport origins in place. The codes are arbitrary
#ordinals (alphabetical rank of the label), which is acceptable for the
#tree-based models that are trained on X in later cells.
#NOTE: the single encoder is re-fit for ORIGIN, so afterwards encoder.classes_
#only holds the ORIGIN labels.
encoder = LabelEncoder()
df_cleaned['OP_CARRIER'] = encoder.fit_transform(df_cleaned['OP_CARRIER'])
df_cleaned['ORIGIN'] = encoder.fit_transform(df_cleaned['ORIGIN'])

#model features
X = df_cleaned[['MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'ORIGIN', 'CRS_DEP_TIME']]
#model target
y = df_cleaned['DEP_DELAY']

#A linear model would read the integer codes above as ordered magnitudes
#(carrier 7 = 7 x carrier 1), so give it one binary column per airline/airport
#instead. drop_first avoids a redundant column alongside the intercept;
#int8 keeps the wide frame small in memory.
X_onehot = pd.get_dummies(X, columns=['OP_CARRIER', 'ORIGIN'], drop_first=True, dtype='int8')

#split into training and testing sets. Passing X, X_onehot and y in one call
#applies the same row partition to all three, so X_train/X_test/y_train/y_test
#are exactly what later cells expect and the *_lr frames line up with them.
X_train, X_test, X_train_lr, X_test_lr, y_train, y_test = train_test_split(
    X, X_onehot, y, test_size=0.2, random_state=1
)

#train model on the one-hot features
model_lr = LinearRegression()
model_lr.fit(X_train_lr, y_train)

#apply model to test set
y_pred_lr = model_lr.predict(X_test_lr)

#model evaluation
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression Mean Squared Error: {mse_lr}")
print(f"Linear Regression R^2 Score: {r2_lr}")


In [None]:
# Random Forest Algorithm
from sklearn.ensemble import RandomForestRegressor

#fit a 100-tree random forest regressor (seeded) on the training split
model_rf = RandomForestRegressor(n_estimators=100, random_state=1).fit(X_train, y_train)

#predicted departure delays for the held-out test rows
y_pred_rf = model_rf.predict(X_test)

#score the predictions against the true test delays
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)
print(f"Random Forest Regression Mean Squared Error: {mse_rf}")
print(f"Random Forest Regression R^2 Score: {r2_rf}")

In [None]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#binary target: 1 when DEP_DELAY is greater than 0, otherwise 0
y_class = df_cleaned['DEP_DELAY'].gt(0).astype(int)

#80/20 train/test split of the features and the binary target (seeded)
X_train, X_test, y_train_class, y_test_class = train_test_split(
    X,
    y_class,
    test_size=0.2,
    random_state=1,
)

#fit the classifier on the training split
model_class = RandomForestClassifier(random_state=1).fit(X_train, y_train_class)

#predicted class for each held-out test row
y_pred_class = model_class.predict(X_test)

#model evaluation: fraction of test rows classified correctly
class_accuracy = accuracy_score(y_test_class, y_pred_class)
print(f'Classification Accuracy: {class_accuracy}')
