In [1]:
# Import 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import figure

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score ,classification_report, mean_squared_error, make_scorer, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Import models
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import  DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,  AdaBoostRegressor

import time 

# Import scripts
from optional.feature_engineering import *
from optional.prepare_flight_data import *
from optional.dummies import *
from optional.predict import *

RSEED = 42

In [2]:
df = pd.read_csv('data/final_train.csv', index_col=[0])
df_test = pd.read_csv('data/final_test.csv')
df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,...,delayed,domestic,dep_hour,dep_weekday,duration_min,arr_hour,flight_month,flight_month_name,year,distance
0,train_id_15674,2016-01-01 00:00:00,TU 0564,NKC,TUN,2016-01-01 00:15:00,2016-01-01 04:30:00,ATA,TU 320IMV,0.0,...,0,0,0,Friday,255.0,4,1,January,2016,3298.067996
1,train_id_15676,2016-01-01 00:00:00,TU 0714,JED,TUN,2016-01-01 00:55:00,2016-01-01 05:30:00,ATA,TU 332IFM,195.0,...,1,0,0,Friday,275.0,5,1,January,2016,3256.052105
2,train_id_15675,2016-01-01 00:00:00,TU 0614,DKR,TUN,2016-01-01 01:20:00,2016-01-01 05:55:00,ATA,TU 320IMU,49.0,...,1,0,1,Friday,275.0,5,1,January,2016,3678.974557
3,train_id_30980,2016-01-01 00:00:00,UG 0002,TUN,DJE,2016-01-01 06:15:00,2016-01-01 07:15:00,SCH,UG AT7LBD,0.0,...,0,1,6,Friday,60.0,7,1,January,2016,333.916459
4,train_id_7179,2016-01-01 00:00:00,TU 0880,TUN,AMS,2016-01-01 06:30:00,2016-01-01 09:20:00,ATA,TU 736IOP,36.0,...,1,0,6,Friday,170.0,9,1,January,2016,1770.371959


In [3]:
df.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC', 'target', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
       'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
       'delay_or_onTime', 'delayed', 'domestic', 'dep_hour', 'dep_weekday',
       'duration_min', 'arr_hour', 'flight_month', 'flight_month_name', 'year',
       'distance'],
      dtype='object')

# Data Cleaning 


In [4]:
# convert time to unix 
df['STA'] = df['STA'].str.replace('.', ':', regex=False)
df['STA'] = pd.to_datetime(df['STA']).map(pd.Timestamp.timestamp)
df['STD'] = pd.to_datetime(df['STD']).map(pd.Timestamp.timestamp)

In [5]:
# Drop features by list

drop_arr = [
    'ID', 'DATOP', 'FLTID', 'AC', 'STATUS', 
    'DEPSTN', 'ARRSTN', 'flight_month_name', 
    'delayed', 'year', 'delay_or_onTime', 
    'city_ARR', 'country_ARR']

try: 
    df = df.drop(labels=drop_arr, axis=1)
    df_test = df_test.drop(labels=drop_arr, axis=1)
    df.head()
except KeyError:
    df.head()

In [6]:
# create Target Vector

X = df.drop('target', axis=1)
y = df['target']

X_test = df.drop('target', axis=1)
y_test = df['target']

print(f'Values in Feature Vector: {X.shape[0]}\nValues in Target Vector: {y.shape[0]}')

Values in Feature Vector: 102310
Values in Target Vector: 102310


## Feature Engineering

In [7]:
# Spliting Features types

arr = X.columns

cat_feats = [x for x in arr if X[x].dtype == 'object']

numeric_feats = [x for x in arr if X[x].dtype == 'float64']

arr = X_test.columns

cat_feats = [x for x in arr if X_test[x].dtype == 'object']

numeric_feats = [x for x in arr if X_test[x].dtype == 'float64']

In [8]:
# Create Pipelines

cat_pipe = Pipeline([
    ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

num_pipe = Pipeline([
    ('std_scaler', StandardScaler())
])

In [9]:
preprocessor = ColumnTransformer([
     ('cat', cat_pipe, cat_feats), 
    ('num', num_pipe, numeric_feats)
])

### Regression

#### Baseline

In [10]:
pipe_linreg = Pipeline([
    ('preprocessor', preprocessor),
    ('linreg', LinearRegression())
])

pipe_forestreg = Pipeline([
    ('preprocessor', preprocessor),
    ('forestreg', RandomForestRegressor())
])

pipe_decreg = Pipeline([
    ('preprocessor', preprocessor),
    ('decreg', DecisionTreeRegressor())
])

pipe_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('svm',SVC())
])

In [11]:

param_forestreg = {}

grid_forestreg = GridSearchCV(pipe_forestreg, param_grid=param_forestreg, cv=5, scoring='neg_mean_squared_error', 
                           verbose=5, n_jobs=-1)

In [12]:
# best_Forest_model = grid_forestreg.best_estimator_
# best_Forest_model

In [13]:

param_decreg = {'criterion': ('gini', 'entropy', 'log_loss'), 
                'max_depth':(3, 5, 7, 9)
                }

grid_decreg = GridSearchCV(pipe_decreg, param_grid=param_decreg, cv=5, scoring='neg_mean_squared_error', 
                           verbose=5, n_jobs=-1)

In [14]:
best_dec_model = grid_decreg.best_estimator_
best_dec_model

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [15]:
param_svm = {'svm__kernel':('linear','poly', 'rbf'),
                'svm__C': [0.01, 0.1, 1, 10],
                'svm__degree': [1,2,3]
               }

grid_svm = GridSearchCV(pipe_svm, param_grid=param_svm, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

In [16]:
best_svm_model = grid_svm.best_estimator_
best_svm_model

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
pipe_linreg.fit(X, y)

In [None]:
y_hat_lin = pipe_linreg.predict(X_test)
y_hat_log = pipe_forestreg.predict(X_test)
y_hat_dec = best_dec_model.predict(X_test)
y_hat_svm = best_svm_model.predict(X_test)

print('Linear Regression')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_lin)))

print('Log Regression')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_log)))

print('DecisionTree Regression')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_dec)))

print('SVM')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_hat_svm)))

print('ANN Because i have no life ...')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_test)))