# Flight Delay Prediction

**Abstract:** 


Finding and measuring factors affecting aircraft delays on the ground and in the air and developing machine learning algorithms to optimize airline and airport operations based on the factors responsible for the flight delay

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#directory where our dataset present
data_dir = "/content/drive/My Drive/Colab Notebooks/Airline Delay Prediction/Project2"

import os
print(os.listdir(data_dir)) #folders inside dataset directory

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
data= pd.read_csv(data_dir+"/data_final.csv")
data.head()

Before Beginning Data Transformtion, let us first split the dataset into Train-Test and then seperately perform the transformations on the datasets.

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.33,random_state = 1)
print(train.shape)
print(test.shape)

test.head()


**Target Distribution for our Regression Problem**

In [None]:
_ = train['arrival_delay_converted'].plot(kind = 'density', title = 'Distribution of Arrival Delays', fontsize=14, figsize=(10, 6))

In [None]:
_ = train['departure_delay_converted'].plot(kind = 'density', title = 'Distribution of Departure Delays', fontsize=14, figsize=(10, 6))

Observation: Since both distributions show almost a gaussian curve (no skewness), no other transformations like logarithmic transformations needed.

In [None]:
#Check if Data is Imbalanced

In [None]:
# selecting rows based on condition 
train_delayed_flights = train[train['arrival_delay_converted'] > 0] 
train_delayed_flights.shape

In [None]:
train_early_flights = train[train['arrival_delay_converted'] <= 0] 
train_early_flights.shape

In [None]:
print(f'Percentage of Flights which got delayed is: {100*2684/(2684+6710)}')
print(f'Percentage of Flights which arrived early is: {100*6710/(2684+6710)}')


Hence it is an imbalanced dataset.

**Variable Datatypes:**

In [None]:
train.dtypes

In [None]:
#year column should be categorical not numeric.
train['year'] = train['year'].astype('object')
test['year'] = test['year'].astype('object')

**Data Transformations on Numeric and Categorical Features:**



1.Handling Categorical Features:

'Carrier'

'Airplane Type'

'Year'

'Month'

'Departure Airport'

'Arrival Airport'


We will apply one hot encoding on 'Carrier','Airplane Type','Departure Airport', 'Arrival Airport' and 'year' since less than 6 categories for each columns are there.






In [None]:
train_transformed= pd.get_dummies(train)

test_transformed= pd.get_dummies(test)

test_transformed.head()

#month not considered in this one hot encoding (as we haven't converted it to object from int64 type). 
#We are handling it by another encoding as shown below.

Handling Month Column:

In [None]:
#Mean Encoding of Month Variable according to Target (Arrival_Delay)

#calculate mean of target in train data according to month
Mean_encoded_month = train_transformed.groupby(['month'])['arrival_delay_converted'].mean().to_dict() 


print(Mean_encoded_month)

print("===========================================================================================")

#use the SAME train mean value for both train,test encoding to avoid data leakage
train_transformed['month_encoded'] =  train_transformed['month'].map(Mean_encoded_month) 
test_transformed['month_encoded'] =  test_transformed['month'].map(Mean_encoded_month) 

#somewhat follows trend we saw while doing EDA of average delayed flights per month

train_transformed.head()

In [None]:
test_transformed.head()

In [None]:
#drop column after encoding
train_transformed.drop('month',axis=1,inplace=True)
test_transformed.drop('month',axis=1,inplace=True)

In [None]:
#ID_COL= ''
TARGET_COL='arrival_delay_converted'

features = [c for c in train_transformed.columns if c not in [TARGET_COL]]
print(f'\nThe train dataset contains {len(features)} input features')

#also let's drop the ID_COL from dataframe right now itself
#data= data.drop([ID_COL],axis=1)

In [None]:
features

In [None]:
print(train_transformed.shape)
print(test_transformed.shape)


Get Features and Predictor Variable Vales:

In [None]:
#Input to our model will be the features
X_trn, X_test = train_transformed[features], test_transformed[features]

#Output of our model will be the TARGET_COL
y_trn, y_test = train_transformed[TARGET_COL], test_transformed[TARGET_COL]

In [None]:
X_trn.head()

In [None]:
y_trn #actual values of target in train dataset

**List of all columns after Categorical Variables are encoded**

In [None]:

cols=X_trn.columns.tolist()
cols

In [None]:

num_cols=[ 'month_encoded',
 'D_DewPointC', 'D_WindGustKmph', 'D_cloudcover', 'D_humidity', 'D_precipMM', 'D_pressure', 'D_tempC', 'D_visibility','D_winddirDegree', 'D_windspeedKmph', 
 'A_DewPointC', 'A_WindGustKmph','A_cloudcover', 'A_humidity', 'A_precipMM', 'A_pressure', 'A_tempC','A_visibility', 'A_winddirDegree', 'A_windspeedKmph',
 'duration_converted', 'departure_delay_converted', ]

cat_cols=[
       'Airplane Type_A20N','Airplane Type_A320', 'Airplane Type_B737', 'Airplane Type_B738',
       'Carrier_Air Asia', 'Carrier_Air India', 'Carrier_Go Air','Carrier_Indigo', 'Carrier_Spicejet', 'Carrier_Vistara',
       'Departure Airport_BLR', 'Departure Airport_BOM','Departure Airport_CCU', 'Departure Airport_DEL', 
       'Arrival Airport_BOM','Arrival Airport_DEL', 'Arrival Airport_HYD', 
       'year_2018', 'year_2019','year_2020']

In [None]:
num_cols_index=[X_trn.columns.get_loc(c) for c in num_cols if c in X_trn]
num_cols_index

In [None]:
cat_cols_index=[X_trn.columns.get_loc(c) for c in cat_cols if c in X_trn]
cat_cols_index

2. StandardScaler on numerical features


In [None]:
#Standardize numeirc features by removing the mean and scaling to unit variance. This is necessary for Logistic Regression.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
_ = scaler.fit(X_trn)

X_trn = scaler.transform(X_trn)
X_test = scaler.transform(X_test)

In [None]:
print(X_trn.shape)
print(X_test.shape)

In [None]:
X_trn[0] #1st row in train dataset

In [None]:
X_test #test dataset after transformation

**Metric for our Regression Problem:**


1. **RMSE**
2. **R2 Value**

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

def rmse(y_true, y_pred):
  return np.sqrt(mean_squared_error(y_true, y_pred))

def rmsle(y_true, y_pred):
  return np.sqrt(mean_squared_log_error(y_true, y_pred))


In [None]:
y_trn #actual values of target in train dataset

**Baseline Model 1: Predicting Every instance as Mean of Target Column**

In [None]:
#predictions on train data
preds_train = pd.Series([y_trn.mean()] * len(X_trn))
print(preds_train)

#performance on train data
rmse_score = rmse(y_trn,preds_train)
#rmsle_score = rmsle(y_trn, preds_target)

print(f'RMSE metric score for Train Data is: {rmse_score}')
#print(f'RMSLE metric score is: {rmsle_score}')

In [None]:
#predictions on Test data (using mean of target in train data)

preds_test = pd.Series([y_trn.mean()] * len(X_test))
print(preds_test)


#performance on Test Data
rmse_score = rmse(y_test,preds_test)

print(f'RMSE metric score for Test Data is: {rmse_score}')


**Baseline Model 2: Predicting Every instance as Median of Target Column**

In [None]:
#predictions on train data
preds_train = pd.Series([y_trn.median()] * len(X_trn))
print(preds_train)

#performance on train data
rmse_score = rmse(y_trn,preds_train)

print(f'RMSE metric score for Train Data is: {rmse_score}')

In [None]:
#predictions on Test data (using median of target in train data)

preds_test = pd.Series([y_trn.median()] * len(X_test))
print(preds_test)


#performance on Test Data
rmse_score = rmse(y_test,preds_test)

print(f'RMSE metric score for Test Data is: {rmse_score}')

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
rmse_scores={}
coeff_det={}


# Model 1: Linear Regression

a) Only Numeric Columns


In [None]:
X_trn[:, num_cols_index]


In [None]:
X_trn[:, num_cols_index].shape

In [None]:
lr1 = LinearRegression()

_ = lr1.fit(X_trn[:, num_cols_index], y_trn)




preds_test = lr1.predict(X_test[:, num_cols_index])

rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score for Test Data is: {rmse_score}')



b) Only Categorical Columns


In [None]:
lr2 = LinearRegression()

_ = lr2.fit(X_trn[:, cat_cols_index], y_trn)




preds_test = lr2.predict(X_test[:, cat_cols_index])

rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score for Test Data is: {rmse_score}')



c) Both Numeric and Categorical Columns

In [None]:
lr3 = LinearRegression()

_ = lr3.fit(X_trn, y_trn)

#Predictions on Test Data
preds_test = lr3.predict(X_test)


In [None]:
print('Performance Metrics for Linear Regression on Test Data: \n')


# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
# The coefficients
print('Coefficients of Linear Regression: \n', lr3.coef_)


In [None]:
rmse_scores['lr3']= 6.609363296368543
coeff_det['lr3']=0.80


In [None]:
#Plot:

In [None]:
plt.scatter(y_pred, y_test)
plt.plot(np.linspace(0,400,400), np.linspace(0,400,400), c = 'orange', linestyle='--')
plt.xlabel('prediction')
plt.ylabel('true values')
plt.xlim(0,400)
plt.ylim(0,400)
plt.title('Predicted vs True values')

# Model 2: SVM Regressor

a) Linear SVR


In [None]:
from sklearn.svm import SVR
lr_svr = SVR(kernel='linear')

_ = lr_svr.fit(X_trn[:, num_cols_index], y_trn)



#get predictions on test data
preds_test = lr_svr.predict(X_test[:, num_cols_index])


In [None]:
print('Performance Metrics for Support Vector Regression on Test Data: \n')


# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
from sklearn.model_selection import GridSearchCV

hyperparam_combs = {
    
    'C':[0.125, 0.5, 1, 2, 8, 16]

    }


lr_svr = GridSearchCV(SVR(kernel='linear'),
                         hyperparam_combs,
                         scoring='neg_root_mean_squared_error',
                         )

search = lr_svr.fit(X_trn, y_trn)

search.best_params_

In [None]:
best_params = {'C':0.5
}

lr_svr_tuned = SVR(**best_params)

_ = lr_svr_tuned.fit(X_trn, y_trn)



#Predictions on Test Data
preds_test = lr_svr_tuned.predict(X_test)

In [None]:
print('Performance Metrics for Support Vector Regression on Test Data: \n')


# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
rmse_scores['linear_svr']= 8.568946496064715
coeff_det['linear_svr']=0.67



b) Kernel SVR

In [None]:
from sklearn.svm import SVR
kernel_svr = SVR(kernel='rbf')

_ = kernel_svr.fit(X_trn[:, num_cols_index], y_trn)

#get predictions on test data
preds_test = kernel_svr.predict(X_test[:, num_cols_index])


In [None]:
print('Performance Metrics for Kernal SVR on Test Data: \n')


# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
hyperparam_combs={

'C': [0.1, 0.5, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]

}

kernel_svr = GridSearchCV(SVR(kernel='rbf'),
                         hyperparam_combs,
                         scoring='neg_root_mean_squared_error',
                         )

search = kernel_svr.fit(X_trn, y_trn)

search.best_params_

In [None]:
best_params = {'C':100, 'gamma': 0.01
}

kernel_svr_tuned = SVR(kernel='rbf',**best_params)

_ = kernel_svr_tuned.fit(X_trn, y_trn)


#Predictions on Test Data
preds_test = kernel_svr_tuned.predict(X_test)

In [None]:
print('Performance Metrics for Support Vector Regression on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
rmse_scores['kernel_svr']= 4.165074172142831
coeff_det['kernel_svr']=0.92


# Model 3: Decision Tree Regressor

In [None]:
dt = DecisionTreeRegressor(random_state=2)

_ = dt.fit(X_trn, y_trn)


#Predictions on Test Data
preds_test = dt.predict(X_test)


In [None]:
print('Performance Metrics for Decision Tree on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
dt.get_params()

In [None]:
rmse_scores['dt']= 7.579939863342771
coeff_det['dt']=0.74


**Hyperparameter Tuning on Decision Tree Regressor**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

hyperparam_combs = {
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 10, 20, 30, 40],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128],
}


dt = RandomizedSearchCV(DecisionTreeRegressor(),
                         hyperparam_combs,
                         scoring='neg_root_mean_squared_error',
                         random_state=2,
                         n_iter=30)

search = dt.fit(X_trn, y_trn)

search.best_params_

In [None]:
best_params = {'max_depth': 10,
 'max_features': 0.6,
 'max_leaf_nodes': 128,
 'min_samples_split': 20,
}

dt_tuned = DecisionTreeRegressor(random_state = 2,**best_params)

_ = dt_tuned.fit(X_trn, y_trn)



#Predictions on Test Data
preds_test = dt_tuned.predict(X_test)

In [None]:
print('Performance Metrics for Decision Tree on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

# Model 4: Random Forest Regressor

In [None]:
#creation of random forrest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=2) #by default no of estimators=10
rf.fit(X_trn, y_trn)


#Predictions on Test Data
preds_test = rf.predict(X_test)


In [None]:
print('Performance Metrics for Random Forrest on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
rmse_scores['rf']= 5.432977750205356
coeff_det['rf']=0.87


In [None]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
print(rf.get_params())

In [None]:
hyperparam_combs = {
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 10, 20, 30, 40],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128],
}

rf = RandomizedSearchCV(RandomForestRegressor(),
                         hyperparam_combs,
                         scoring='neg_root_mean_squared_error',
                         random_state=2,
                         n_iter=10)

search = rf.fit(X_trn, y_trn)

search.best_params_

In [None]:
best_params = {'max_depth': 10,
 'max_features': 0.6,
 'max_leaf_nodes': 128,
 'min_samples_split': 20,
}

rf_tuned = RandomForestRegressor(random_state = 2,**best_params)

_ = rf_tuned.fit(X_trn, y_trn)



#Predictions on Test Data

preds_test = rf_tuned.predict(X_test)

In [None]:
print('Performance Metrics for Random Forest on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
#Visualizing Feature Importance

def plot_feature_importances(model):
      features = cols #cols varible we made above has list of all columns in our training data
      importances = model.feature_importances_
      indices = np.argsort(importances)

      plt.figure(figsize=(12,12))

      plt.title('Feature Importances')
      plt.barh(range(len(indices)), importances[indices], color='r', align='center')
      plt.yticks(range(len(indices)), [features[i] for i in indices])
      plt.ylabel("Feature")
      plt.xlabel('Relative Importance')
      plt.show()
          

plot_feature_importances(rf_tuned)   

# Model 5: GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbdt = GradientBoostingRegressor(random_state=2)
gbdt.fit(X_trn, y_trn)


#Predictions on Test Data
preds_test = gbdt.predict(X_test)

In [None]:
print('Performance Metrics for Gradient Boost Decision Tree on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
#Hyperparameter Tuning


param_grid = {'max_depth': np.arange(5,8,1), \
             'n_estimators':np.arange(10,170,10),
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_split': [2, 10, 20, 30, 40],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1],
    'max_leaf_nodes': [8, 16, 32, 64, 128],

}


gbdt = RandomizedSearchCV(GradientBoostingRegressor(),
                         param_grid,
                         scoring='neg_root_mean_squared_error',
                         random_state=2,
                         n_iter=50)

search = gbdt.fit(X_trn, y_trn)

search.best_params_

In [None]:
best_params = {'max_depth': 10,
 'max_features': 0.4,
 'max_leaf_nodes': 32,
 'min_samples_split': 40,
 'n_estimators': 140
}

gbdt_tuned = GradientBoostingRegressor(random_state = 2,**best_params)

_ = gbdt_tuned.fit(X_trn, y_trn)



#Predictions on Test Data

preds_test = gbdt_tuned.predict(X_test)

In [None]:
print('Performance Metrics for Gradient Boost Decision Tree on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
plot_feature_importances(gbdt_tuned)   

In [None]:
rmse_scores['gbdt']= 4.447249461638161
coeff_det['gbdt']=0.91


# Model 6: More Advanced Models

a)LightGBM


In [None]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(random_state=2)
lgbm.fit(X_trn, y_trn)

#Predictions on Test Data
preds_test = lgbm.predict(X_test)


In [None]:
print('Performance Metrics for LightGBM on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')
class
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:

plot_feature_importances(lgbm)   

In [None]:
#taking random parameter values 
lgbm2 = LGBMRegressor(n_estimators = 1000,
                        learning_rate = 0.05,
                        colsample_bytree = 0.76, #each tree won't consider all columns. only 76% features will be used by each tree!
                        metric = 'None',
                      random_state=2
                        )
lgbm2.fit(X_trn, y_trn)

#Predictions on Test Data
preds_test = lgbm2.predict(X_test)


In [None]:
print('Performance Metrics for LightGBM on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:

plot_feature_importances(lgbm2)   

In [None]:
rmse_scores['lgb']= 4.240479118593097
coeff_det['lgb']=0.92



In [None]:
#Hyperparameter Tuning

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

parameters={ 
            "eval_metric" : 'rmse', 
            'verbose': 100,
}

parameter_tuning ={
             'max_depth': sp_randint(10,50),
             'num_leaves': sp_randint(6, 50), 
             'learning_rate ': [0.1,0.01,0.001],
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
}




lgbm = LGBMRegressor(n_estimators = 1000, n_jobs=-1, metric="None", silent=True, random_state=2)

lgbm = RandomizedSearchCV(lgbm,
                         param_distributions= parameter_tuning,
                         scoring='neg_root_mean_squared_error',
                         random_state=2,
                         n_iter=50, verbose=False)

search = lgbm.fit(X_trn, y_trn, **parameters)


In [None]:
print('Best score : {} with parameters: {} '.format(search.best_score_, search.best_params_))

In [None]:
best_parameters = search.best_params_
best_parameters

In [None]:
lgbm_tuned = LGBMRegressor(**best_parameters)
lgbm_tuned.set_params(**best_parameters)

In [None]:
#Predictions on Test Data
preds_test = lgbm_tuned.predict(X_test)

In [None]:
print('Performance Metrics for XGBoost on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

b)XGBoost

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=2)
xgb.fit(X_trn, y_trn)

#Predictions on Test Data
preds_test = xgb.predict(X_test)


In [None]:
print('Performance Metrics for XGBoost on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
#Taking some random parameters

In [None]:
xgb = XGBRegressor(n_estimators = 1000,
                    max_depth = 6,
                    learning_rate = 0.05,
                    colsample_bytree = 0.5,
                    random_state=2,
                    )

xgb.fit(X_trn, y_trn)

#Predictions on Test Data
preds_test = xgb.predict(X_test)


In [None]:
print('Performance Metrics for XGBoost on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [None]:
rmse_scores['xgboost']= 4.271406328482151
coeff_det['xgboost']=0.92


In [None]:
#Hyperparameter Tuning of XGBoost


c)CatBoost

# Comparing all Models:

In [None]:
rmse_scores

In [None]:
coeff_det

In [None]:
print('\n                                   RMSE      r2_Score')
print('------------------------------------------------------------------------------')
print('Linear Regression:                 {:.04}     {:.04}'.format(rmse_scores['lr3'] ,coeff_det['lr3']))

print('Linear SVR:                        {:.04}     {:.04} '.format(rmse_scores['linear_svr'],\
                                                        coeff_det['linear_svr']))

print('RBF-Kernel SVR:                    {:.04}     {:.04} '.format(rmse_scores['kernel_svr'],\
                                                          coeff_det['kernel_svr']))

print('Decision Tree Regressor:           {:.04}      {:.04} '.format(rmse_scores['dt'],\
                                                      coeff_det['dt']))

print('Random Forest Regressor:           {:.04}     {:.04} '.format(rmse_scores['rf'],\
                                                           coeff_det['rf']))

print('Gradient Boosting DT Regressor:    {:.04}     {:.04} '.format(rmse_scores['gbdt'],\
                                                        coeff_det['gbdt']))

print('XGBoost Regressor:                 {:.04}     {:.04} '.format(rmse_scores['xgboost'],\
                                                        coeff_det['xgboost']))

print('LightGBM Regressor:                {:.04}      {:.04} '.format(rmse_scores['lgb'],\
                                                        coeff_det['lgb']))

Saving the Models:

In [None]:
import pickle

# save the kernel-svr model to disk
filename = "/content/drive/My Drive/Colab Notebooks/Airline Delay Prediction/Project2/rbfkernelsvr.sav"
pickle.dump(kernel_svr, open(filename, 'wb'))

# save the lightgbm model to disk
filename = "/content/drive/My Drive/Colab Notebooks/Airline Delay Prediction/Project2/lightgmb.sav"
pickle.dump(lgbm2, open(filename, 'wb'))

# save the xgboost model to disk
filename = "/content/drive/My Drive/Colab Notebooks/Airline Delay Prediction/Project2/xgb.sav"
pickle.dump(xgb, open(filename, 'wb'))


In [None]:
# load the model from disk

filename = "/content/drive/My Drive/Colab Notebooks/Airline Delay Prediction/Project2/rbfkernelsvr.sav"

loaded_model = pickle.load(open(filename, 'rb'))



#Predictions on Test Data
preds_test = loaded_model.predict(X_test)


print('Performance Metrics for LightGBM on Test Data: \n')

# The Root Mean Squared error
rmse_score = rmse(y_test,preds_test)
print(f'RMSE metric score is: {rmse_score}')

# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, preds_test))

# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'% r2_score(y_test, preds_test))

In [1]:
import requests
import json

In [31]:
result = requests.get("https://randomuser.me/api/")

In [32]:
result.status_code

200

In [33]:
text = result.text

In [34]:
text

'{"results":[{"gender":"female","name":{"title":"Ms","first":"Carol","last":"Sutton"},"location":{"street":{"number":1006,"name":"O\'Connell Street"},"city":"Ashbourne","state":"Wexford","country":"Ireland","postcode":54114,"coordinates":{"latitude":"20.6173","longitude":"163.5112"},"timezone":{"offset":"-9:00","description":"Alaska"}},"email":"carol.sutton@example.com","login":{"uuid":"2ef1190f-7d6e-4914-a52b-195b82fd7029","username":"silverbutterfly615","password":"bowman","salt":"3y6Mgrku","md5":"6979fcc99dd3be217d264e305fd2af35","sha1":"2e79ecf9adb1a15c1c0430b6370e613b4416e7e3","sha256":"a29072ff8fba235b103aa50251220c90d66768c728cd29136ef4fd8c48714c58"},"dob":{"date":"1963-05-04T05:30:38.118Z","age":58},"registered":{"date":"2015-05-02T17:45:04.099Z","age":6},"phone":"021-984-0159","cell":"081-850-2924","id":{"name":"PPS","value":"9758250T"},"picture":{"large":"https://randomuser.me/api/portraits/women/48.jpg","medium":"https://randomuser.me/api/portraits/med/women/48.jpg","thumbna

In [35]:
if 'json' in result.headers.get('Content-Type'):
    js = result.json()
else:
    print('Response content is not in JSON format.')
    js = 'spam'

In [7]:
from urllib import request

In [8]:
url = "https://www.expedia.com/api/flight/search?departureDate=2021-03-03&departureAirport=HYD&arrivalAirport=BOM"
url1="https://www.wikipedia.org/"
#URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [14]:
resp = request.urlopen(url1)


In [15]:
type(resp)

http.client.HTTPResponse

In [16]:
resp.code

200

In [17]:
resp.peek()

b'<!DOCTYPE html>\n<html lang="mul" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title>\n<meta name="description" content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.">\n<script>\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)no-js(\\s|$)/, "$1js-enabled$2" );\n</script>\n<meta name="viewport" content="initial-scale=1,user-scalable=yes">\n<link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png">\n<link rel="shortcut icon" href="/static/favicon/wikipedia.ico">\n<link rel="license" href="//creativecommons.org/licenses/by-sa/3.0/">\n<style>\n.sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-46c49284.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg-MediaWiki-logo_siste

In [18]:
data = resp.read()
print(type(data))
print(len(data))

<class 'bytes'>
66728


In [19]:
html = data.decode("UTF-8")

In [20]:
html

'<!DOCTYPE html>\n<html lang="mul" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title>\n<meta name="description" content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation.">\n<script>\ndocument.documentElement.className = document.documentElement.className.replace( /(^|\\s)no-js(\\s|$)/, "$1js-enabled$2" );\n</script>\n<meta name="viewport" content="initial-scale=1,user-scalable=yes">\n<link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png">\n<link rel="shortcut icon" href="/static/favicon/wikipedia.ico">\n<link rel="license" href="//creativecommons.org/licenses/by-sa/3.0/">\n<style>\n.sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-46c49284.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg-MediaWiki-logo_sister

In [21]:
from urllib import parse
dir(parse)

['DefragResult',
 'DefragResultBytes',
 'MAX_CACHE_SIZE',
 'ParseResult',
 'ParseResultBytes',
 'Quoter',
 'ResultBase',
 'SplitResult',
 'SplitResultBytes',
 '_ALWAYS_SAFE',
 '_ALWAYS_SAFE_BYTES',
 '_DefragResultBase',
 '_NetlocResultMixinBase',
 '_NetlocResultMixinBytes',
 '_NetlocResultMixinStr',
 '_ParseResultBase',
 '_ResultMixinBytes',
 '_ResultMixinStr',
 '_SplitResultBase',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_asciire',
 '_checknetloc',
 '_coerce_args',
 '_decode_args',
 '_encode_result',
 '_hexdig',
 '_hextobyte',
 '_hostprog',
 '_implicit_encoding',
 '_implicit_errors',
 '_noop',
 '_parse_cache',
 '_portprog',
 '_safe_quoters',
 '_splitattr',
 '_splithost',
 '_splitnetloc',
 '_splitnport',
 '_splitparams',
 '_splitpasswd',
 '_splitport',
 '_splitquery',
 '_splittag',
 '_splittype',
 '_splituser',
 '_splitvalue',
 '_to_bytes',
 '_typeprog',
 'clear_cache',
 'collections',
 'namedtuple',

In [22]:
params={"departureDate":"2021-03-03","departureAirport":"HYD","arrivalAirport":"DEL"}

In [23]:
querystring = parse.urlencode(params)

In [24]:
querystring

'departureDate=2021-03-03&departureAirport=HYD&arrivalAirport=DEL'

In [25]:
url = "https://www.expedia.com/api/flight/search"+"?"+querystring

In [26]:
requests.get(url, headers = {'User-agent': 'your bot 0.1'})

<Response [429]>

In [27]:
#to check if we still have connecton with the server
resp.isclosed()

True

In [28]:
resp.code

200

In [29]:
#From Skyscanner
import requests

url = "https://skyscanner-skyscanner-flight-search-v1.p.rapidapi.com/apiservices/browseroutes/v1.0/US/USD/en-US/SFO-sky/ORD-sky/2021-03-03"

querystring = {"inboundpartialdate":"2021-03-03"}

headers = {
    'x-rapidapi-key': "e4f45619a7msh00ab3d2d7433e6ep15bb05jsnb3bc14c37ccb",
    'x-rapidapi-host': "skyscanner-skyscanner-flight-search-v1.p.rapidapi.com"
    }

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)

{
  "Quotes" : [ {
    "QuoteId" : 1,
    "MinPrice" : 120,
    "Direct" : false,
    "OutboundLeg" : {
      "CarrierIds" : [ 1793 ],
      "OriginId" : 81727,
      "DestinationId" : 73076,
      "DepartureDate" : "2021-03-03T00:00:00"
    },
    "QuoteDateTime" : "2021-03-01T01:37:00"
  }, {
    "QuoteId" : 2,
    "MinPrice" : 347,
    "Direct" : true,
    "OutboundLeg" : {
      "CarrierIds" : [ 1793 ],
      "OriginId" : 81727,
      "DestinationId" : 73076,
      "DepartureDate" : "2021-03-03T00:00:00"
    },
    "QuoteDateTime" : "2021-03-01T07:49:00"
  } ],
  "Carriers" : [ {
    "CarrierId" : 1793,
    "Name" : "United"
  } ],
  "Places" : [ {
    "Name" : "Chicago O'Hare International",
    "Type" : "Station",
    "PlaceId" : 73076,
    "IataCode" : "ORD",
    "SkyscannerCode" : "ORD",
    "CityName" : "Chicago",
    "CityId" : "CHIA",
    "CountryName" : "United States"
  }, {
    "Name" : "San Francisco International",
    "Type" : "Station",
    "PlaceId" : 81727,
    "Iat

In [30]:
import requests

url = "https://www.expedia.com/api/flight/search?departureDate=2021-03-03&departureAirport=HYD&arrivalAirport=BLR"

#querystring = {"aircraftCode":"<REQUIRED>","flightNumber":"<REQUIRED>","carrierCode":"<REQUIRED>","departureTime":"<REQUIRED>","duration":"<REQUIRED>","originLocationCode":"<REQUIRED>","arrivalDate":"<REQUIRED>","destinationLocationCode":"<REQUIRED>","arrivalTime":"<REQUIRED>","departureDate":"<REQUIRED>"}
querystring = {"departureDate":"2021-03-03","departureAirport":"HYD","arrivalAirport":"DEL"}
headers = {
  "activity-id": "<!--tlactivity-id: e19bd8bf-6abd-4611-bb5c-faf0c54964be-->",
  "content-encoding": "gzip",
  "content-language": "en-US",
  "content-length": "9761",
  "content-type": "application/json;charset=UTF-8",
  "date": "Mon, 01 Mar 2021 06:23:45 GMT",
  "p3p": "policyref=\"/w3c/p3p.xml\", CP=\"CAO DSP IND COR ADM CONo CUR CUSi DEV PSA PSD DELi OUR COM NAV PHY ONL PUR UNI\"",
  "same_site_supported": "true",
  "server": "Expedia WebServer",
  "strict-transport-security": "max-age=2592000; includeSubDomains;",
  "trace-id": "e19bd8bf-6abd-4611-bb5c-faf0c54964be",
  "vary": "Accept-Encoding",
  "x-app-info": "expweb,release-2021-02-r4.12419.2344752,us-west-2:expweb",
  "x-b3-traceid": "e19bd8bf6abd4611bb5cfaf0c54964be",
  "x-cgp-info": "noJvmRouteSet;a9c29634-7a56-11eb-b1c2-0242b5f86d60",
  "x-content-type-options": "nosniff",
  "x-edgeconnect-cache-status": "0",
  "x-page-id": "page.MobileFlightApiSearch,F,20",
  "x-ua-compatible": "IE=Edge",
  "x-xss-protection": "1"
}

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)

<HTML><HEAD>
<TITLE>Access Denied</TITLE>
</HEAD><BODY>
<H1>Access Denied</H1>
 
You don't have permission to access "http&#58;&#47;&#47;www&#46;expedia&#46;com&#47;api&#47;flight&#47;search&#63;" on this server.<P>
Reference&#32;&#35;18&#46;9e01d517&#46;1614605395&#46;2be299c
</BODY>
</HTML>

