In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Load the preprocessed flight table; the relative path is resolved against
# the notebook's current working directory.
data1 = pd.read_csv(filepath_or_buffer="csv-flight/preprocessed_df.csv")

In [None]:
# Preview the first rows of the freshly loaded frame. Only `data1` exists at
# this point in the notebook (`data` is first assigned in a later cell), so the
# original `data.head()` raised NameError on a fresh kernel.
data1.head()

In [None]:
# Build the modelling frame from the raw table, leaving out the columns
# listed below. `data1` itself is not modified.
data = data1.drop(
    columns=[
        'FlightDate',
        'Tail_Number',
        'Flight_Number_Reporting_Airline',
        'OriginCityName',
        'OriginState',
        'DestCityName',
        'DestState',
        'MinSince',
    ]
)


In [None]:
# One-hot encode the frame. With no `columns` argument pandas converts every
# object/string/category column and passes the other columns through as-is.
data = pd.get_dummies(data=data)

In [None]:
# Binary target: 1 when the departure was delayed by 15 minutes or more, else 0.
# The original condition (`1 if x < 15.0 else 0`) was inverted: it labelled
# flights that left on time (or early) as delayed. A missing DepDelay compares
# False and is labelled 0, exactly as it was before.
data['Delay'] = (data['DepDelay'] >= 15.0).astype(int)


In [None]:
# The label was derived from DepDelay, so the raw value must not stay among
# the features.
data = data.drop('DepDelay', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the binary label.
X = data.drop('Delay', axis=1)
y = data['Delay']

# Hold out 30% of the rows for evaluation. The original `test_size=0.7`
# trained on only 30% of the data and tested on 70%, which looks like a
# swapped train/test fraction (the later multi-class split holds out 20%).
# NOTE(review): confirm the intended ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [None]:

# Build the multi-class dataset: positive departure delays are bucketed into
# five severity classes, categorical columns are integer-encoded, and the
# training portion is oversampled to balance the classes.

#remove any rows with DepDelay <= 0
df_new = data1[data1['DepDelay'] > 0]

#Keep these columns
df_new = df_new[['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Reporting_Airline', 'Origin', 'Dest', 'CRSDepTime', 'DepDelay', 'CRSElapsedTime', 'Distance', 'is_holiday_week', 'TotalDensity', 'Visibility', 'WindSpeed', 'SevereWeather', 'BadWeather']]

#turn all non-numerical data into numerical data, using .factorize()
object_cols = df_new.select_dtypes(include='object').columns
df_nums = df_new.copy()
for col in object_cols:
    # pd.factorize returns (codes, uniques); only the integer codes are kept.
    # The original `df_nums[col],  = pd.factorize(...)` unpacked that 2-tuple
    # into a single target and raised ValueError.
    df_nums[col] = pd.factorize(df_new[col])[0]

#convert DepDelay into bins
# An open-ended last edge keeps the edges strictly increasing even when the
# largest delay in the data is 300 minutes or less (the original used
# DepDelay.max() as the last edge, which makes pd.cut raise in that case).
bins = [15, 30, 60, 120, 300, np.inf]
labels = [0, 1, 2, 3, 4]
# include_lowest=True keeps a delay of exactly 15 minutes in class 0, matching
# the 15-minute threshold used for the binary label earlier in the notebook.
df_nums['DepDelay'] = pd.cut(df_nums['DepDelay'], bins=bins, labels=labels, include_lowest=True)
#drop rows with NaN
# Delays below 15 minutes fall outside every bin and become NaN, so they are
# removed here together with rows that have any other missing value.
df_nums = df_nums.dropna()

from imblearn.over_sampling import RandomOverSampler

X = df_nums.drop(columns=['DepDelay'])
# pd.cut yields a categorical column; hand the estimators plain integer labels.
y = df_nums['DepDelay'].astype(int)

#split into training and testing
# Split BEFORE oversampling. Oversampling first (as the original did) puts
# copies of the same row in both train and test, which leaks test data into
# training and inflates the reported accuracy. stratify=y keeps the class
# proportions of the untouched test set representative.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32, stratify=y
)

#oversample the data to balance the classes
# Only the training portion is resampled; the test set keeps its real
# class distribution.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Cross-validate a single XGBoost configuration (the grid has one point per
# hyper-parameter) and score the refit model on the held-out test set.
param_grid = dict(
    max_depth=[3],
    learning_rate=[0.2],
    n_estimators=[100],
    gamma=[0.2],
)

xgb_classifier = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_classifier, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter set and the estimator GridSearchCV refit with it.
best_params, best_xgb_classifier = grid_search.best_params_, grid_search.best_estimator_

y_pred = best_xgb_classifier.predict(X_test)
testing_accuracy = accuracy_score(y_test, y_pred)
print(testing_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for the XGBoost predictions (`y_pred` from the XGBoost cell).
# The original passed `predictions`, which is only created later by the Random
# Forest cell: on a fresh kernel that is a NameError, and after out-of-order
# execution it silently plotted the Random Forest results under this title.
conf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
# Rows are true labels, columns are predicted labels; cells show raw counts.
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()


In [None]:
# Rank the training columns by the importance XGBoost assigned to them,
# largest first, and print one "name: score" line per column.
feature_importances = best_xgb_classifier.feature_importances_
top_feature_indices = np.argsort(feature_importances)[::-1]

top_features = list(
    zip(X_train.columns[top_feature_indices], feature_importances[top_feature_indices])
)

print("Top features:")
for column_name, score in top_features:
    print(f"{column_name}: {score}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fit a Random Forest with default hyper-parameters and report test accuracy.
# random_state is fixed so the bootstrap samples and feature sub-sampling are
# reproducible across kernel restarts; the original estimator was unseeded
# while every other stochastic step in the notebook pins its seed.
rf_classifier = RandomForestClassifier(random_state=0)

rf_classifier.fit(X_train, y_train)

predictions = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

print("Random Forest Classifier Accuracy:", accuracy)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for the Random Forest predictions, drawn as an annotated
# heatmap of raw counts (rows: true labels, columns: predicted labels).
conf_matrix = confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()


In [None]:
# Display the current training feature frame (whichever split was assigned to
# X_train most recently) to inspect its columns and shape.
X_train

In [None]:
# Print the Random Forest features from most to least important. Features are
# identified by their positional index in X_train, not by column name.
importances = rf_classifier.feature_importances_
indices = importances.argsort()[::-1]

print("Feature ranking:")

n_features = X_train.shape[1]
for rank in range(1, n_features + 1):
    feature_idx = indices[rank - 1]
    print("%d. Feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Cross-validate one RBF-kernel SVM configuration and score it on the test set.
svm_classifier = SVC()
param_grid = {'C': [1.0], 'kernel': ['rbf'], 'gamma': ['scale']}

# fit() returns the search object itself, so construction and fitting chain.
grid_search = GridSearchCV(
    svm_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

best_params = grid_search.best_params_
best_svm_classifier = grid_search.best_estimator_

y_pred = best_svm_classifier.predict(X_test)
testing_accuracy = accuracy_score(y_test, y_pred)
print(testing_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Cross-validate one L2-penalised liblinear logistic regression and report
# its accuracy on the held-out test set.
logistic_classifier = LogisticRegression()

param_grid = dict(C=[1.0], solver=['liblinear'], penalty=['l2'])

search_settings = dict(estimator=logistic_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search = GridSearchCV(**search_settings)
grid_search.fit(X_train, y_train)

# Selected parameters and the estimator refit with them.
best_params = grid_search.best_params_
best_logistic_classifier = grid_search.best_estimator_

y_pred = best_logistic_classifier.predict(X_test)
testing_accuracy = accuracy_score(y_test, y_pred)
print(testing_accuracy)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Gaussian Naive Bayes baseline: fit on the training split, then print the
# overall accuracy followed by the per-class precision/recall report.
nb_classifier = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator

y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:

# Approximate per-feature importance for the fitted Gaussian Naive Bayes model.
#
# The original score was each feature's share of the summed per-class variance.
# That only reflects the numeric scale of a feature (a column measured in large
# units dominates) and never used the class means it fetched. A feature helps
# this model when its class means are far apart relative to the spread inside
# each class, so the score below is the between-class variance of the means
# divided by the within-class variance, normalised to percentages.

# Per-class Gaussian parameters, one row per class and one column per feature
# (per the scikit-learn GaussianNB attribute documentation).
means = nb_classifier.theta_
variances = nb_classifier.var_
# Class priors as a column so they broadcast across the feature axis.
class_weights = nb_classifier.class_prior_[:, np.newaxis]

# Kept for backward compatibility with the original cell; scale dependent and
# no longer used for the ranking.
total_variance = variances.sum(axis=0)

overall_mean = (class_weights * means).sum(axis=0)
between_class = (class_weights * (means - overall_mean) ** 2).sum(axis=0)
within_class = (class_weights * variances).sum(axis=0)

# Guard against a zero within-class variance (constant feature) instead of
# emitting inf/NaN.
separation = np.divide(
    between_class,
    within_class,
    out=np.zeros_like(between_class, dtype=float),
    where=within_class > 0,
)

separation_total = separation.sum()
if separation_total > 0:
    feature_importance = (separation / separation_total) * 100
else:
    # No feature separates the classes at all; report zeros rather than NaN.
    feature_importance = np.zeros_like(separation)

for i, importance in enumerate(feature_importance):
    print(f"Feature {i+1}: {importance:.2f}%")
