In [1]:
# Import 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import figure

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score ,classification_report, mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV

# Import models
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, SGDRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from xgboost import XGBClassifier, XGBRegressor

from tqdm.notebook import tqdm,trange
import time 

# Import scripts
from optional.feature_engineering import *
from optional.prepare_flight_data import *
from optional.dummies import *
from optional.predict import *

RSEED = 42

In [2]:
train_df = pd.read_csv('data/final_train.csv', index_col=[0])
train_df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,...,delayed,domestic,dep_hour,dep_weekday,duration_min,arr_hour,flight_month,flight_month_name,year,distance
0,train_id_15674,2016-01-01 00:00:00,TU 0564,NKC,TUN,2016-01-01 00:15:00,2016-01-01 04:30:00,ATA,TU 320IMV,0.0,...,0,0,0,Friday,255.0,4,1,January,2016,3298.067996
1,train_id_15676,2016-01-01 00:00:00,TU 0714,JED,TUN,2016-01-01 00:55:00,2016-01-01 05:30:00,ATA,TU 332IFM,195.0,...,1,0,0,Friday,275.0,5,1,January,2016,3256.052105
2,train_id_15675,2016-01-01 00:00:00,TU 0614,DKR,TUN,2016-01-01 01:20:00,2016-01-01 05:55:00,ATA,TU 320IMU,49.0,...,1,0,1,Friday,275.0,5,1,January,2016,3678.974557
3,train_id_30980,2016-01-01 00:00:00,UG 0002,TUN,DJE,2016-01-01 06:15:00,2016-01-01 07:15:00,SCH,UG AT7LBD,0.0,...,0,1,6,Friday,60.0,7,1,January,2016,333.916459
4,train_id_7179,2016-01-01 00:00:00,TU 0880,TUN,AMS,2016-01-01 06:30:00,2016-01-01 09:20:00,ATA,TU 736IOP,36.0,...,1,0,6,Friday,170.0,9,1,January,2016,1770.371959


## Feature Engineering
Firstly we drop a number of columns that we should not use for modeling. Afterwards we separate the data set into features and the response that we are going to predict. 

In [3]:
train_df.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC', 'target', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
       'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
       'delay_or_onTime', 'delayed', 'domestic', 'dep_hour', 'dep_weekday',
       'duration_min', 'arr_hour', 'flight_month', 'flight_month_name', 'year',
       'distance'],
      dtype='object')

In [4]:
# List of dropping feature for regression
drop_features_reg = ['ID', 'DATOP', 'FLTID', 'AC', 'STATUS', 'DEPSTN', 'ARRSTN', 'flight_month_name', 'delayed', 'year','delay_or_onTime','city_ARR','country_ARR']

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102310 entries, 0 to 107832
Data columns (total 41 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ID                 102310 non-null  object 
 1   DATOP              102310 non-null  object 
 2   FLTID              102310 non-null  object 
 3   DEPSTN             102310 non-null  object 
 4   ARRSTN             102310 non-null  object 
 5   STD                102310 non-null  object 
 6   STA                102310 non-null  object 
 7   STATUS             102310 non-null  object 
 8   AC                 102310 non-null  object 
 9   target             102310 non-null  float64
 10  icao_DEP           102310 non-null  object 
 11  iata_DEP           102310 non-null  object 
 12  name_DEP           102310 non-null  object 
 13  city_DEP           102310 non-null  object 
 14  subd_DEP           100266 non-null  object 
 15  country_DEP        102310 non-null  object 
 16  el

In [6]:
train_df.columns

Index(['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS',
       'AC', 'target', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
       'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR',
       'delay_or_onTime', 'delayed', 'domestic', 'dep_hour', 'dep_weekday',
       'duration_min', 'arr_hour', 'flight_month', 'flight_month_name', 'year',
       'distance'],
      dtype='object')

In [7]:
# List of dropping feature for classification
drop_features_class = ['DATOP', 'ID','FLTID','STD', 'STA','target','icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'city_ARR', 'subd_ARR',
       'country_ARR', 'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR','arr_hour', 'flight_month','delay_or_onTime', 'delayed']

In [8]:
drop_features_reg

['ID',
 'DATOP',
 'FLTID',
 'AC',
 'STATUS',
 'DEPSTN',
 'ARRSTN',
 'flight_month_name',
 'delayed',
 'year',
 'delay_or_onTime',
 'city_ARR',
 'country_ARR']

In [9]:
# Creating list for categorical predictors/features 
# (dates are also objects so if you have them in your data you would deal with them first)
cat_features = [ 'icao_DEP', 'iata_DEP', 'name_DEP', 
       'subd_DEP', 'country_DEP',
       'tz_DEP' ,'tz_ARR']

num_features = ['elevation_ARR', 'distance', 'elevation_DEP']

In [10]:
scaler_reg = StandardScaler()
scaler_reg.fit(train_df[num_features])
train_df[num_features] = scaler_reg.transform(train_df[num_features])
#X_test_reg[num_features] = scaler_reg.transform(X_test_reg[num_features])

In [11]:
#col2 = OneHotEncoder_labels(train_df, cat_features)

#encoder_reg = OneHotEncoder(handle_unknown='ignore', sparse=False, drop='first')
#encoder_reg.fit(train_df[cat_features])

#X_train_dummie_columns = pd.DataFrame(encoder_reg.transform(train_df[cat_features]))
#X_train_reg = train_df.drop(cat_features, axis=1)
#X_train_reg = train_df.join(X_train_dummie_columns)
#X_train_reg.columns = col2

#X_test_dummie_columns = pd.DataFrame(encoder_reg.transform(train_df[cat_features]))
#X_test_reg = X_test_reg.drop(cat_features, axis=1)
#X_test_reg = X_test_reg.join(X_test_dummie_columns)
#X_test_reg.columns = col2

In [12]:
X = train_df.drop('target', axis=1)
y = train_df.target


In [13]:
X = X.drop(labels=drop_features_reg, axis=1)

In [14]:
X.columns

Index(['STD', 'STA', 'icao_DEP', 'iata_DEP', 'name_DEP', 'city_DEP',
       'subd_DEP', 'country_DEP', 'elevation_DEP', 'lat_DEP', 'lon_DEP',
       'tz_DEP', 'icao_ARR', 'iata_ARR', 'name_ARR', 'subd_ARR',
       'elevation_ARR', 'lat_ARR', 'lon_ARR', 'tz_ARR', 'domestic', 'dep_hour',
       'dep_weekday', 'duration_min', 'arr_hour', 'flight_month', 'distance'],
      dtype='object')

In [15]:
X = pd.get_dummies(train_df, columns=cat_features, drop_first=True)

Split Data 

In [16]:
# Split the 'features' and 'target' data into training and testing sets for classification
#X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.2, stratify=y_class, random_state=RSEED)
# Split the 'features' and 'target' data into training and testing sets for classification
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y, test_size=0.2, random_state=RSEED)



# Print shape of the test and train data
print('Train data')
#print('# Training data for classification:     {}'.format(X_train_class.shape[0]))
print('# Training data for regression     {}'.format(X_train_reg.shape[0]))
print('==================')
print('Test data')
#print('# Test data for classification:     {}'.format(X_test_class.shape[0]))
print('# Test data for regression:     {}'.format(X_test_reg.shape[0]))


Train data
# Training data for regression     81848
Test data
# Test data for regression:     20462


### Regression

In [17]:
X['STA'] = X['STA'].str.replace('.', ':', regex=False)
X['STA'] = pd.to_datetime(X['STA']).map(pd.Timestamp.timestamp)
X['STD'] = pd.to_datetime(X['STD']).map(pd.Timestamp.timestamp)

In [18]:
)

KeyError: "['ID'] not found in axis"

In [20]:
# Fit linear regression model
print(X.info()

lin_reg2 = LinearRegression()
lin_reg2.fit(X, y)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102310 entries, 0 to 107832
Columns: 701 entries, ID to tz_ARR_Europe/Zurich
dtypes: float64(11), int64(6), object(17), uint8(667)
memory usage: 96.4+ MB


ValueError: could not convert string to float: 'train_id_15674'

In [None]:
# Calculate r-squared 
y_hat2 = lin_reg2.predict(X2)
print("R-squared:", r2_score(y2, y_hat2).round(3))

In [None]:
print("                 Linear Regression Model:")
print("==="*20)
print("                 RSME:",mean_squared_error(y_test_reg, y_pred_dumm_reg,squared=False))
print("==="*20)