In [4]:
# Import 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import figure

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import  mean_squared_error, make_scorer, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Import models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import  DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,  AdaBoostRegressor

import time 

# Import scripts
from optional.feature_engineering import *
from optional.prepare_flight_data import *
from optional.dummies import *
from optional.predict import *

RSEED = 42

In [17]:
df = pd.read_csv('data/final_train.csv', index_col=[0])
df_test = pd.read_csv('data/final_test.csv')
df_test.head()

Unnamed: 0.1,Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,...,icao_ARR,iata_ARR,name_ARR,city_ARR,subd_ARR,country_ARR,elevation_ARR,lat_ARR,lon_ARR,tz_ARR
0,0,test_id_86,2016-05-01,TU 0714,JED,TUN,2016-05-01 00:30:00,2016-05-01 05.25.00,ATA,TU 332IFM,...,DTTA,TUN,Tunis Carthage International Airport,Tunis,Tunis,TN,22.0,36.851002,10.2272,Africa/Tunis
1,1,test_id_85,2016-05-01,TU 6033,TUN,JED,2016-05-01 00:50:00,2016-05-01 05.15.00,ATA,TU 320IMV,...,OEJN,JED,King Abdulaziz International Airport,Jeddah,Makkah,SA,48.0,21.6796,39.156502,Asia/Riyadh
2,2,test_id_87,2016-05-01,TU 6032,JED,SFA,2016-05-01 00:50:00,2016-05-01 05.35.00,ATA,TU 320IMR,...,DTTX,SFA,Sfax Thyna International Airport,Sfax,Safaqis,TN,85.0,34.717999,10.691,Africa/Tunis
3,3,test_id_1799,2016-05-01,TU 0440,MIR,ORY,2016-05-01 05:05:00,2016-05-01 07.30.00,ATA,TU 31AIMJ,...,LFPO,ORY,Paris-Orly Airport,Paris,Ile-de-France,FR,291.0,48.7253,2.35944,Europe/Paris
4,4,test_id_1800,2016-05-01,TU 0480,DJE,NTE,2016-05-01 05:15:00,2016-05-01 08.05.00,ATA,TU 736IOQ,...,LFRS,NTE,Nantes Atlantique Airport,Nantes,Pays-de-la-Loire,FR,90.0,47.153198,-1.61073,Europe/Paris


In [6]:
df.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC', 'target', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
       'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
       'delay_or_onTime', 'delayed', 'domestic', 'dep_hour', 'dep_weekday',
       'duration_min', 'arr_hour', 'flight_month', 'flight_month_name', 'year',
       'distance'],
      dtype='object')

# Data Cleaning 


In [7]:
# convert time to unix 
df['STA'] = df['STA'].str.replace('.', ':', regex=False)
df['STA'] = pd.to_datetime(df['STA']).map(pd.Timestamp.timestamp)
df['STD'] = pd.to_datetime(df['STD']).map(pd.Timestamp.timestamp)

In [8]:
# Drop features by list

drop_arr = [
    'ID', 'DATOP', 'FLTID', 'AC', 'STATUS', 
    'DEPSTN', 'ARRSTN', 'flight_month_name', 
    'delayed', 'year', 'delay_or_onTime', 
    'city_ARR', 'country_ARR'
    ]

try: 
    df = df.drop(labels=drop_arr, axis=1)
    df_test = df_test.drop(labels=drop_arr, axis=1)
    df.head()
except KeyError:
    df.head()

In [9]:
# create Target Vector

X = df.drop('target', axis=1)
y = df['target']

X_test = df.drop('target', axis=1)
y_test = df['target']

print(f'Values in Feature Vector: {X.shape[0]}\nValues in Target Vector: {y.shape[0]}')

Values in Feature Vector: 102310
Values in Target Vector: 102310


In [10]:
X.columns

Index(['STD', 'STA', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'subd_ARR',
       'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR', 'domestic', 'dep_hour',
       'dep_weekday', 'duration_min', 'arr_hour', 'flight_month', 'distance'],
      dtype='object')

In [14]:
test = X.subd_ARR
test.head()

0            Tunis
1            Tunis
2            Tunis
3          Madanin
4    North Holland
Name: subd_ARR, dtype: object

## Feature Engineering

In [None]:
# Spliting Features types

arr = X.columns

cat_feats = [x for x in arr if X[x].dtype == 'object']
numeric_feats = [x for x in arr if X[x].dtype == 'float64']

arr = X_test.columns

cat_feats = [x for x in arr if X_test[x].dtype == 'object']
numeric_feats = [x for x in arr if X_test[x].dtype == 'float64']

In [None]:
# Create Pipelines

cat_pipe = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

num_pipe = Pipeline([
    ('std_scaler', StandardScaler())
])

In [None]:
preprocessor = ColumnTransformer([
     ('cat', cat_pipe, cat_feats), 
    ('num', num_pipe, numeric_feats)
])

### Regression

#### Baseline

In [None]:
pipe_linreg = Pipeline([
    ('preprocessor', preprocessor),
    ('linreg', LinearRegression())
])

pipe_forestreg = Pipeline([
    ('preprocessor', preprocessor),
    ('forestreg', RandomForestRegressor())
])

pipe_decreg = Pipeline([
    ('preprocessor', preprocessor),
    ('decreg', DecisionTreeRegressor())
])

pipe_svr = Pipeline([
    ('preprocessor', preprocessor),
    ('svr',SVR())
])

In [None]:

param_decreg = {
            "decreg__max_depth" : [5,7,9],
            "decreg__min_samples_leaf":[6,7,8],
            "decreg__min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
            "decreg__max_features":["log2","sqrt",None],
            "decreg__max_leaf_nodes":[10,20,30]
            }


grid_decreg = GridSearchCV(pipe_decreg, param_grid=param_decreg, cv=5, scoring='neg_mean_squared_error',
            verbose=1, n_jobs=-1)


In [None]:
grid_decreg.fit(X, y)

In [None]:
best_dec_model = grid_decreg.best_estimator_
best_dec_model


In [None]:
print('Best score:\n{:.2f}'.format(grid_decreg.best_score_))
print("Best parameters:\n{}".format(grid_decreg.best_params_))

In [None]:
best_dec_model.fit(X,y)

In [None]:
pipe_decreg.fit(X,y)

In [None]:
param_svr = {
            "svr__kernel" : ('linear', 'poly', 'rbf', 'sigmoid'),
            "svr__degree":[2,3,4,5,6]
            }


grid_svr = GridSearchCV(pipe_svr, param_grid=param_svr, cv=5, scoring='neg_mean_squared_error',
            verbose=1, n_jobs=-1)

In [None]:
grid_svr.fit(X, y)

In [None]:
best_svr_model = grid_svr.best_estimator_
best_svr_model

In [None]:

pipe_svr.fit(X, y)

In [None]:
y_hat_dec_untune = pipe_decreg.predict(X_test)
y_hat_dec_tune = best_dec_model.predict(X_test)

In [None]:
y_hat_svr_untune = pipe_svr.predict(X_test)
y_hat_svr_tune = best_svr_model.predict(X_test)

In [None]:
print('\n')
print('_'*20)
print('SVR')
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_svr_untune)))
print("Tuned Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_svr_tune)))

print('\n')
print('_'*20)
print('DecisionTree Regression')
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_dec_untune)))
print("Tuned Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_dec_tune)))