In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, confusion_matrix, make_scorer, accuracy_score

from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, log_loss

In [2]:
with open('dec_flights_model_copy.pickle','rb') as read_file:
    dec_flights = pickle.load(read_file)

In [3]:
dec_flights.drop(columns=['carrier_delay','weather_delay','nas_delay','security_delay','late_aircraft_delay', 'dep_delay_indict','arr_delayed_indict'],axis=1,inplace=True)
dec_flights.columns

Index(['air_time', 'num_flights', 'distance', 'delay_level',
       'scheduled_arr_hr', 'airline_AA', 'airline_AS', 'airline_B6',
       'airline_DL', 'airline_EV',
       ...
       'dest_SFO', 'dest_SHD', 'dest_SLC', 'dest_SLN', 'dest_SPN', 'dest_STS',
       'dest_SWO', 'dest_TPA', 'dest_UIN', 'dest_VEL'],
      dtype='object', length=152)

### Train_test_spilt

In [4]:
X1, y1 = dec_flights.drop('delay_level',axis=1), dec_flights['delay_level']

from imblearn.under_sampling import RandomUnderSampler
X, y = RandomUnderSampler(random_state=42).fit_sample(X1,y1)

# hold out 20% of the data for final testing
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=10)

#further spilt into 60% and 20% for train and validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)

Using TensorFlow backend.


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
#initialise model
model = ['Logistic']
accuracy = []
f1 = []
auc = []
recall = []

In [None]:
#fine tuned
model_2 = ['Logistic']
accuracy_2 = []
f1_2 = []
auc_2 = []
recall_2 = []

### Logistic Regression

In [None]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
print("The score for logistic regression is")
print("Training: {:6.2f}%".format(100*logistic.score(X_train, y_train)))
print("Validate: {:6.2f}%".format(100*logistic.score(X_val, y_val)))
print("Test set: {:6.2f}%".format(100*logistic.score(X_test, y_test)))
print("Log-loss: {:6.4f}".format(log_loss(y_test, logistic.predict_proba(X_test))))

In [None]:
logistic_pred = logistic.predict(X_val)
print('Logistic Results:')
print(confusion_matrix(y_val,logistic_pred))
print(classification_report(y_val, logistic_pred))
logistic_score = logistic.fit(X_train, y_train).score(X_val, y_val)
print('Logistic score: %f' % logistic_score)

In [None]:
logistic_pred = logistic.predict(X_test)
logistic_f1 = f1_score(logistic_pred, y_test)

In [None]:
logistic_y_score = logistic.predict_proba(X_val)[:, 1]

#calculate roc curve
logistic_fpr, logistic_tpr, logistic_auc_thresholds = roc_curve(y_val, logistic_y_score)

#calculate auc
auc_logistic = roc_auc_score(y_val, logistic_y_score)
print('AUC: %.3f' % auc_logistic)

In [None]:
recall_logistic = recall_score(y_train, logistic.predict(X_train))
print('Recall: %.3f' % recall_logistic)

In [None]:
accuracy.append(logistic_score)
print('Accuracy: ',accuracy)

f1.append(logistic_f1)
print('F1: ', f1)

auc.append(auc_logistic)
print('AUC: ', auc)

recall.append(recall_logistic)
print('Recall: ', recall)

#### Randomised Search Logistic Regression

In [None]:
est = LogisticRegression()

rf_p_dist={
           'C':[0.1,0.5,1,5,10],
           'fit_intercept':[True,False],
           'verbose':[0.1,0.5,1.0,3,5],
           'max_iter':[1,5,10,20,50]
          }

def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                  n_jobs=-1, n_iter=nbr_iter, cv=5)
    #CV = Cross-Validation ( here using Stratified KFold CV)
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [None]:
hypertuning_rscv(est, rf_p_dist, 40, X, y)

In [None]:
logistic2 = LogisticRegression(C=10, verbose=5, max_iter=50, fit_intercept=True)
logistic2.fit(X_train, y_train)
print("The score for logistic regression is")
print("Training: {:6.2f}%".format(100*logistic2.score(X_train, y_train)))
print("Validate: {:6.2f}%".format(100*logistic2.score(X_val, y_val)))
print("Test set: {:6.2f}%".format(100*logistic2.score(X_test, y_test)))
print("Log-loss: {:6.4f}".format(log_loss(y_test, logistic2.predict_proba(X_test))))

In [None]:
logistic2_pred = logistic2.predict(X_val)
print('Logistic Results:')
print(confusion_matrix(y_val,logistic2_pred))
print(classification_report(y_val, logistic2_pred))
logistic2_score = logistic2.fit(X_train, y_train).score(X_val, y_val)
print('Logistic score: %f' % logistic2_score)

In [None]:
logistic2_pred = logistic2.predict(X_test)
f1_score(logistic2_pred, y_test)

In [None]:
logistic2_y_score = logistic2.predict_proba(X_val)[:, 1]
#calculate roc curve
logistic_fpr, logistic_tpr, logistic_auc_thresholds = roc_curve(y_val, logistic2_y_score)

#calculate auc
auc_logistic2 = roc_auc_score(y_val, logistic2_y_score)
print('AUC: %.3f' % auc_logistic2)

In [None]:
recall_logistic2 = recall_score(y_train, logistic2.predict(X_train))
print('Recall: %.3f' % recall_logistic2)

In [None]:
accuracy_2.append(logistic2_score)
print('Accuracy: ',accuracy_2)

logistic2_f1 = f1_score(logistic2_pred, y_test)
f1_2.append(logistic2_f1)
print('F1: ', f1_2)

auc_2.append(auc_logistic2)
print('AUC: ', auc_2)

recall_2.append(recall_logistic2)
print('Recall: ', recall_2)

### Graphs

In [None]:
#initialise model
model = ['Logistic', 'Bernoulli', 'Forest']
accuracy = []
f1 = []
auc = []
recall = []

In [None]:
pwd

In [None]:
accuracy = [0.9132170538698307, 0.9132170538698307, 0.8978068081388852]

plt.figure(figsize=(12,7))
sns.despine()
ax = sns.barplot(x=model, y=accuracy,color='#85cbb3')

# plt.title('Accuracy Score', fontsize=20, fontweight='bold')

plt.xlabel('Model', fontsize=16, fontweight='bold')
plt.xticks(fontsize=14)

#plt.ylabel('Score', fontsize=14, fontweight='bold')
plt.yticks(fontsize=14)
plt.ylim([0,1.0])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)

plt.savefig('accuracy_score_refined.jpg', transparent=True)
;

In [None]:
plt.figure(figsize=(12,7))
sns.despine()
ax = sns.barplot(x=model, y=f1,color='#85cbb3')

# plt.title('F1 Score', fontsize=20, fontweight='bold')

plt.xlabel('Model', fontsize=16, fontweight='bold')
plt.xticks(fontsize=14)

#plt.ylabel('Score', fontsize=14, fontweight='bold')
plt.yticks(fontsize=14)
plt.ylim([0,1.0])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)

# plt.savefig('f1_score_refined.jpg', transparent=True)
;

In [None]:
plt.figure(figsize=(12,7))
sns.despine()
ax = sns.barplot(x=model, y=auc,color='#85cbb3')

plt.title('Area Under Curve(AUC)', fontsize=20, fontweight='bold')

plt.xlabel('Model', fontsize=16, fontweight='bold')
plt.xticks(fontsize=14)

#plt.ylabel('Score', fontsize=14, fontweight='bold')
plt.yticks(fontsize=14)
plt.ylim([0,1.0])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)
;

In [None]:
plt.figure(figsize=(12,7))
sns.despine()
ax = sns.barplot(x=model, y=recall,color='#85cbb3')

# plt.title('Recall Score', fontsize=20, fontweight='bold')

plt.xlabel('Model', fontsize=16, fontweight='bold')
plt.xticks(fontsize=14)

#plt.ylabel('Score', fontsize=14, fontweight='bold')
plt.yticks(fontsize=14)
plt.ylim([0,1.0])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['left'].set_visible(True)

plt.savefig('recall_refined.jpg', transparent=True)
;