In [2]:
# Import 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score ,classification_report, mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV

# Import models
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, SGDRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from xgboost import XGBClassifier, XGBRegressor

from tqdm.notebook import tqdm,trange
import time 

# Import scripts
from optional.feature_engineering import *
from optional.prepare_flight_data import *
from optional.dummies import *
from optional.predict import *

# Random seed passed as `random_state` to the train/test splits below so they are reproducible.
RSEED = 42

In [3]:
# Read the prepared training table; the first CSV column is used as the row index.
train_df = pd.read_csv(
    'data/final_train.csv',
    index_col=[0],
)
# Preview the first rows as a sanity check on the load.
train_df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,...,delayed,domestic,dep_hour,dep_weekday,duration_min,arr_hour,flight_month,flight_month_name,year,distance
0,train_id_15674,2016-01-01 00:00:00,TU 0564,NKC,TUN,2016-01-01 00:15:00,2016-01-01 04:30:00,ATA,TU 320IMV,0.0,...,0,0,0,Friday,255.0,4,1,January,2016,3298.067996
1,train_id_15676,2016-01-01 00:00:00,TU 0714,JED,TUN,2016-01-01 00:55:00,2016-01-01 05:30:00,ATA,TU 332IFM,195.0,...,1,0,0,Friday,275.0,5,1,January,2016,3256.052105
2,train_id_15675,2016-01-01 00:00:00,TU 0614,DKR,TUN,2016-01-01 01:20:00,2016-01-01 05:55:00,ATA,TU 320IMU,49.0,...,1,0,1,Friday,275.0,5,1,January,2016,3678.974557
3,train_id_30980,2016-01-01 00:00:00,UG 0002,TUN,DJE,2016-01-01 06:15:00,2016-01-01 07:15:00,SCH,UG AT7LBD,0.0,...,0,1,6,Friday,60.0,7,1,January,2016,333.916459
4,train_id_7179,2016-01-01 00:00:00,TU 0880,TUN,AMS,2016-01-01 06:30:00,2016-01-01 09:20:00,ATA,TU 736IOP,36.0,...,1,0,6,Friday,170.0,9,1,January,2016,1770.371959


## Feature Engineering
First, we drop the columns that should not be used for modeling. Then we split the data set into the features and the response variable that we want to predict.

In [4]:
# Columns excluded from the feature matrix of the regression model.
drop_features_reg = [
    # identifier and raw date/time columns
    'DATOP', 'ID', 'FLTID', 'STD', 'STA',
    # `*_DEP` columns
    'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP', 'subd_DEP',
    'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP', 'tz_DEP',
    # `*_ARR` columns
    'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
    'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
    # other columns left out of the features
    'arr_hour', 'flight_month', 'delay_or_onTime',
    # regression response (becomes y_reg), so it must not remain a feature
    'target',
]

In [7]:
# Columns excluded from the feature matrix of the classification model.
drop_features_class = [
    # identifier and raw date/time columns
    'DATOP', 'ID', 'FLTID', 'STD', 'STA',
    # `target` column (used as the regression response elsewhere in this notebook)
    'target',
    # `*_DEP` columns
    'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP', 'subd_DEP',
    'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP', 'tz_DEP',
    # `*_ARR` columns
    'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
    'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
    # other columns left out of the features
    'arr_hour', 'flight_month', 'delay_or_onTime',
    # classification label (becomes y_class), so it must not remain a feature
    'delayed',
]

In [8]:
# Classification: all rows are used; the label is the `delayed` column.
X_class = drop_column(train_df, drop_features_class)
y_class = train_df['delayed']

# Regression: restricted to rows with delayed == 1; the response is the `target` column.
X_reg = drop_column(train_df.loc[train_df['delayed'] == 1], drop_features_reg)
y_reg = train_df.loc[train_df['delayed'] == 1, 'target']

## Split Data

In [10]:
# Classification split: 80/20, stratified on y_class so both partitions keep
# the same class proportions.
(X_train_class, X_test_class,
 y_train_class, y_test_class) = train_test_split(
    X_class, y_class,
    test_size=0.2, stratify=y_class, random_state=RSEED,
)

# Regression split: 80/20, not stratified.
(X_train_reg, X_test_reg,
 y_train_reg, y_test_reg) = train_test_split(
    X_reg, y_reg,
    test_size=0.2, random_state=RSEED,
)

# Reset the indices of the four feature frames.
# NOTE(review): the return value is discarded, so reset_indices presumably works
# in place; the y_* objects are not passed — confirm that is intended.
reset_indices(X_train_class, X_test_class, X_train_reg, X_test_reg)

# Report the number of rows in each partition.
print('Train data')
print(f'# Training data for classification:     {X_train_class.shape[0]}')
print(f'# Training data for regression     {X_train_reg.shape[0]}')
print('==================')
print('Test data')
print(f'# Test data for classification:     {X_test_class.shape[0]}')
print(f'# Test data for regression:     {X_test_reg.shape[0]}')


Train data
# Training data for classification:     81848
# Training data for regression     46072
Test data
# Test data for classification:     20462
# Test data for regression:     11518


### Regression

In [11]:
# Baseline predictions from the project's dummy regressor helper, which is meant to always predict the same value.
# NOTE(review): only the test features are passed in, so the constant is not derived from
# y_train_reg in this cell — confirm where dummie_reg gets its prediction value.
y_pred_dumm_reg = dummie_reg(X_test_reg)

In [12]:
# Report the baseline score: RMSE of the dummy regressor's predictions
# (y_pred_dumm_reg) on the regression test set. The heading names the dummy
# model because no linear regression has been fitted at this point.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while np.sqrt(MSE) yields the same value on every version.
print("                 Dummy Regression Model (baseline):")
print("==="*20)
print("                 RMSE:", np.sqrt(mean_squared_error(y_test_reg, y_pred_dumm_reg)))
print("==="*20)

                 Linear Regression Model:
                 RSME: 140.09043602473963
