# Load Feature Engineered Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
sns.set() # Setting seaborn as default style even if use only matplotlib
pd.set_option('display.max_rows', None)

In [None]:
import pickle

with open(r'..\data\flight_data_engineered.pickle','rb') as flight_data_file:
     df_flights = pickle.load(flight_data_file)

In [None]:
df_flights = df_flights.reset_index(drop=True)

In [None]:
df_flights.head()

# Check for Missing Values

In [None]:
sum(df_flights.isnull().sum())

# Fix DataTypes

In [None]:
df_flights.info()

In [None]:
# fix datatypes for categorical data

df_flights['mkt_carrier_fl_num'] = df_flights['mkt_carrier_fl_num'].astype(str)
df_flights['op_carrier_fl_num'] = df_flights['op_carrier_fl_num'].astype(str)
df_flights['origin_airport_id'] = df_flights['origin_airport_id'].astype(str)
df_flights['dest_airport_id'] = df_flights['dest_airport_id'].astype(str)

In [None]:
# df_flights.drop(columns='fl_date', inplace = True)
# df_flights.drop(columns='dup', inplace = True)
# df_flights.drop(columns='flights', inplace = True)

In [None]:
sum(df_flights.isnull().sum())

# Detect / Treat Outliers

In [None]:
df_flights.describe()

In [None]:
df_flights.head()

In [None]:
def uni_analyis_continuous(df, feature, feature_title):
    
    import matplotlib.pyplot as plt
    import matplotlib.transforms as mtrans

    fig, axes = plt.subplots(1,2, sharex=False, figsize=(10,5))
    fig.suptitle(f'Univariate Analysis - {feature_title}')

    sns.boxplot(ax=axes[0], data=df[feature])
    axes[0].set_title(f'{feature_title} Boxplot')
    sns.histplot(ax=axes[1], data=df[feature])
    axes[1].set_title(f'{feature_title} Histogram')

    Minimum = min(df[feature])
    Median = df[feature].median()
    Mean = df[feature].mean()
    Maximum = max(df[feature])
    
    trans = mtrans.blended_transform_factory(fig.transFigure, mtrans.IdentityTransform())

    txt = fig.text(0.5,10, f'Minimum : {Minimum}, Median : {Median}, Mean : {Mean}, Maximum : {Maximum}', ha='center', va='center_baseline')
    txt.set_transform(trans)

In [None]:
for feat in ['crs_elapsed_time','distance','arr_delay']:
    uni_analyis_continuous(df_flights, feature=feat, feature_title=feat)

In [None]:
# check if the outliers seem like mistakes, or if they could be possible. If mistakes, then drop, otherwise keep.

In [None]:
from scipy import stats
import numpy as np

#Finding Z Score on Column
stats.zscore(df_flights['crs_elapsed_time'])

# Turning Absolute
np.abs(stats.zscore(df_flights['crs_elapsed_time']))

# view the outliers
outliers = df_flights['crs_elapsed_time'][(np.abs(stats.zscore(df_flights['crs_elapsed_time'])) > 3)]
outliers.max()
# # drop outliers
# df_flights.drop(outliers.index, inplace=True)

In [None]:
# 11 hour flights are possible - so keep

In [None]:
# df_flights.shape

In [None]:
from scipy import stats
import numpy as np

#Finding Z Score on Column
stats.zscore(df_flights['distance'])

# Turning Absolute
np.abs(stats.zscore(df_flights['distance']))

# view the outliers
outliers = df_flights['distance'][(np.abs(stats.zscore(df_flights['distance'])) > 3)]
outliers.max()
# # # drop outliers
# df_flights.drop(outliers.index, inplace=True)

In [None]:
# 4983 mile flights are possible, so keep outliers

In [None]:
df_flights.shape

In [None]:
from scipy import stats
import numpy as np

#Finding Z Score on Column
stats.zscore(df_flights['arr_delay'])

# Turning Absolute
np.abs(stats.zscore(df_flights['arr_delay']))

# view the outliers
outliers = df_flights['arr_delay'][(np.abs(stats.zscore(df_flights['arr_delay'])) > 3)]
outliers.max()
# # drop outliers
# df_flights.drop(outliers.index, inplace=True)

In [None]:
# a 25 hour delay seems possible, so keep

In [None]:
df_flights = df_flights.reset_index(drop=True)

In [None]:
# do log transformation of 'crs_elapsed_time','distance','arr_delay'

# df_flights['crs_elapsed_time_log'] = np.log(df_flights['crs_elapsed_time'])
# df_flights['distance_log'] = np.log(df_flights['distance'])
# df_flights['arr_delay_log'] = np.log(df_flights['arr_delay'])

In [None]:
# df_flights.drop('crs_elapsed_time', axis = 1, inplace = True)
# df_flights.drop('distance', axis = 1, inplace = True)
# df_flights.drop('arr_delay', axis = 1, inplace = True)


In [None]:
sum(df_flights.isnull().sum())

In [None]:
# for feat in ['crs_elapsed_time_log','distance_log','arr_delay']:
#     uni_analyis_continuous(df_flights, feature=feat, feature_title=feat)

# Transform Categorical Variables into Numeric Data

In [None]:
df_flights.info()

In [None]:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 93123 entries, 0 to 100997
# Data columns (total 36 columns):
#  #   Column                Non-Null Count  Dtype         
# ---  ------                --------------  -----         
#  0   mkt_unique_carrier    93123 non-null  object        
#  1   branded_code_share    93123 non-null  object        
#  2   mkt_carrier           93123 non-null  object        
#  3   mkt_carrier_fl_num    93123 non-null  object        
#  4   op_unique_carrier     93123 non-null  object        
#  5   tail_num              93123 non-null  object        
#  6   op_carrier_fl_num     93123 non-null  object        
#  7   origin_airport_id     93123 non-null  object        
#  8   origin                93123 non-null  object        
#  9   origin_city_name      93123 non-null  object        
#  10  dest_airport_id       93123 non-null  object        
#  11  dest                  93123 non-null  object        
#  12  dest_city_name        93123 non-null  object        
#  13  crs_dep_time          93123 non-null  datetime64[ns]
#  14  crs_arr_time          93123 non-null  datetime64[ns]
#  15  year                  93123 non-null  int64         
#  16  month                 93123 non-null  int64         
#  17  day                   93123 non-null  int64         
#  18  mean_arr_delay        93123 non-null  float64       
#  19  median_arr_delay      93123 non-null  float64       
#  20  season                93123 non-null  object        
#  21  day_of_week           93123 non-null  object        
#  22  is_weekend            93123 non-null  bool          
#  23  is_holiday            93123 non-null  bool          
#  24  crs_dep_time_hour     93123 non-null  float64       
#  25  crs_dep_time_of_day   93123 non-null  category      
#  26  crs_arr_time_hour     93123 non-null  float64       
#  27  crs_arr_time_of_day   93123 non-null  category      
#  28  origin_state          93123 non-null  object        
#  29  dest_state            93123 non-null  object        
#  30  origin_region         93123 non-null  object        
#  31  dest_region           93123 non-null  object        
#  32  haul                  93123 non-null  category      
#  33  crs_elapsed_time_log  93123 non-null  float64       
#  34  distance_log          93123 non-null  float64       
#  35  arr_delay_log         33133 non-null  float64       
# dtypes: bool(2), category(3), datetime64[ns](2), float64(7), int64(3), object(19)
# memory usage: 25.2+ MB

In [None]:
# df_flights = df_flights.drop(columns=['mkt_carrier','mkt_carrier_fl_num','tail_num','op_carrier_fl_num','origin_airport_id','origin','origin_city_name','dest_airport_id','dest','dest_city_name','crs_dep_time','crs_arr_time'],axis=1)

In [None]:
# mkt_unique_carrier unique values: 10
# branded_code_share unique values: 15
# mkt_carrier unique values: 10             drop
# mkt_carrier_fl_num unique values: 6770    drop
# op_unique_carrier unique values: 27
# tail_num unique values: 6150              drop
# op_carrier_fl_num unique values: 6772     drop
# origin_airport_id unique values: 368      drop
# origin unique values: 368                 drop
# origin_city_name unique values: 361       drop
# dest_airport_id unique values: 370        drop
# dest unique values: 370                   drop
# dest_city_name unique values: 363         drop
# cancelled unique values: 1                drop
# diverted unique values: 1                 drop
# dup unique values: 1                      drop

In [None]:
df_flights.info()

In [None]:
#  0   mkt_unique_carrier    93123 non-null  object  -dummy
#  1   branded_code_share    93123 non-null  object  -dummy
#  2   op_unique_carrier     93123 non-null  object  -dummy
#  3   arr_delay             93123 non-null  float64 
#  4   year                  93123 non-null  int64   
#  5   month                 93123 non-null  int64   
#  6   day                   93123 non-null  int64   
#  7   mean_arr_delay        93123 non-null  float64 
#  8   median_arr_delay      93123 non-null  float64 
#  9   season                93123 non-null  object  -dummy
#  10  day_of_week           93123 non-null  object  -dummy
#  11  is_weekend            93123 non-null  bool    -map 0 to false, 1 to true
#  12  is_holiday            93123 non-null  bool    -map 0 to false, 1 to true
#  13  crs_dep_time_hour     93123 non-null  float64 
#  14  crs_dep_time_of_day   93123 non-null  category-dummy
#  15  crs_arr_time_hour     93123 non-null  float64 
#  16  crs_arr_time_of_day   93123 non-null  category-dummy
#  17  origin_state          93123 non-null  object  -dummy
#  18  dest_state            93123 non-null  object  -dummy
#  19  origin_region         93123 non-null  object  -dummy
#  20  dest_region           93123 non-null  object  -dummy
#  21  haul                  93123 non-null  category-dummy
#  22  crs_elapsed_time_log  93123 non-null  float64 
#  23  distance_log          93123 non-null  float64

In [None]:
sum(df_flights.isnull().sum())

In [None]:
# transform nominal variables into Dummy Variables

cat_feats = df_flights.dtypes[(df_flights.dtypes == 'object') | (df_flights.dtypes == 'category') ].index.tolist()
df_dummy = pd.get_dummies(df_flights[cat_feats])
df_dummy.head()

In [None]:
sum(df_dummy.isnull().sum())

In [None]:
df_dummy.shape

In [None]:
# Map values onto numbers for bool variables

df_flights.replace({
    "is_weekend" : {True : 1, False : 0}, 
    "is_holiday" : {True : 1, False : 0},
    }, inplace=True)

In [None]:
# drop the nominal variables from the original dataset

numeric_df = df_flights.drop(columns=cat_feats)


In [None]:
numeric_df.shape

In [None]:
# scale the numeric data
#use MinMaxScaler to scale data into a given range ((0,1) by default)

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(numeric_df), columns=numeric_df.columns)
scaled_data.head()

In [None]:
scaled_data.shape

In [None]:
scaled_data['year'].value_counts()

In [None]:
scaled_data['month'].value_counts()

In [None]:
len(scaled_data['day'].value_counts())

In [None]:
sum(scaled_data.isnull().sum())

In [None]:
transformed_df = scaled_data.merge(df_dummy, how='outer', left_index = True, right_index = True)
transformed_df.head(50)
transformed_df.shape

In [None]:
# merge the scaled and dummy variables into one dataset

transformed_df = pd.concat([scaled_data, df_dummy], axis=1)
transformed_df.head(50)

In [None]:
transformed_df.shape

In [None]:
transformed_df.isnull().sum()

In [None]:
# make a pickle of the feature engineered data
import pickle

#Do Some Feature Engineering
feature_engineered__trans_data = transformed_df

#Dump it(save it in binary format)
with open(r'..\data\flight_data_engineered_trans.pickle','wb') as flight_data_file:
     pickle.dump(feature_engineered__trans_data, flight_data_file)

# Iteration 2

### Load Feature Engineered Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
sns.set() # Setting seaborn as default style even if use only matplotlib
pd.set_option('display.max_rows', None)

In [2]:
import pickle

with open(r'..\data\flight_data_engineered2.pickle','rb') as flight_data_file:
     df_flights = pickle.load(flight_data_file)

In [3]:
df_flights = df_flights.reset_index(drop=True)

In [4]:
df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98363 entries, 0 to 98362
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   op_unique_carrier          98363 non-null  object  
 1   crs_elapsed_time           98363 non-null  int64   
 2   arr_delay                  98363 non-null  float64 
 3   mean_arr_delay             98363 non-null  float64 
 4   season                     98363 non-null  object  
 5   day_of_week                98363 non-null  object  
 6   is_weekend                 98363 non-null  bool    
 7   is_holiday                 98363 non-null  bool    
 8   crs_dep_time_of_day        98363 non-null  category
 9   crs_arr_time_of_day        98363 non-null  category
 10  origin_region              98363 non-null  object  
 11  dest_region                98363 non-null  object  
 12  haul                       98363 non-null  category
 13  mean_origin_airport_delay  9836

### Check for Missing Values

In [5]:
sum(df_flights.isnull().sum())

0

### Fix DataTypes

In [6]:
df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98363 entries, 0 to 98362
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   op_unique_carrier          98363 non-null  object  
 1   crs_elapsed_time           98363 non-null  int64   
 2   arr_delay                  98363 non-null  float64 
 3   mean_arr_delay             98363 non-null  float64 
 4   season                     98363 non-null  object  
 5   day_of_week                98363 non-null  object  
 6   is_weekend                 98363 non-null  bool    
 7   is_holiday                 98363 non-null  bool    
 8   crs_dep_time_of_day        98363 non-null  category
 9   crs_arr_time_of_day        98363 non-null  category
 10  origin_region              98363 non-null  object  
 11  dest_region                98363 non-null  object  
 12  haul                       98363 non-null  category
 13  mean_origin_airport_delay  9836

In [7]:
# all good

### Transform Categorical Variables into Numeric Data

In [9]:
# transform nominal variables into Dummy Variables

cat_feats = df_flights.dtypes[(df_flights.dtypes == 'object') | (df_flights.dtypes == 'category') ].index.tolist()
df_dummy = pd.get_dummies(df_flights[cat_feats])
df_dummy.head()

Unnamed: 0,op_unique_carrier_9E,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,op_unique_carrier_B6,op_unique_carrier_C5,op_unique_carrier_CP,op_unique_carrier_DL,op_unique_carrier_EM,...,origin_region_South,origin_region_West,dest_region_Midwest,dest_region_Northeast,dest_region_Other,dest_region_South,dest_region_West,haul_short,haul_medium,haul_long
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0


In [10]:
# Map values onto numbers for bool variables

df_flights.replace({
    "is_weekend" : {True : 1, False : 0}, 
    "is_holiday" : {True : 1, False : 0},
    }, inplace=True)

In [11]:
# drop the nominal variables from the original dataset

numeric_df = df_flights.drop(columns=cat_feats)


In [13]:
# scale the numeric data

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(numeric_df), columns=numeric_df.columns)
scaled_data.head()

Unnamed: 0,crs_elapsed_time,arr_delay,mean_arr_delay,is_weekend,is_holiday,mean_origin_airport_delay,mean_dest_airport_delay
0,0.016631,0.281114,-0.968084,-0.604196,6.414322,0.051481,1.330524
1,0.156155,0.837591,-0.968084,-0.604196,6.414322,0.051481,0.269255
2,0.156155,-0.295973,-0.968084,-0.604196,6.414322,0.051481,0.269255
3,-1.169323,-0.172311,-0.968084,-0.604196,6.414322,0.051481,-0.266687
4,-1.169323,0.652099,-0.968084,-0.604196,6.414322,0.051481,-0.266687


In [22]:
# merge the scaled and dummy variables into one dataset

transformed_df = scaled_data.merge(df_dummy, how='outer', left_index = True, right_index = True)
transformed_df.head()

Unnamed: 0,crs_elapsed_time,arr_delay,mean_arr_delay,is_weekend,is_holiday,mean_origin_airport_delay,mean_dest_airport_delay,op_unique_carrier_9E,op_unique_carrier_9K,op_unique_carrier_AA,...,origin_region_South,origin_region_West,dest_region_Midwest,dest_region_Northeast,dest_region_Other,dest_region_South,dest_region_West,haul_short,haul_medium,haul_long
0,0.016631,0.281114,-0.968084,-0.604196,6.414322,0.051481,1.330524,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.156155,0.837591,-0.968084,-0.604196,6.414322,0.051481,0.269255,0,0,0,...,0,1,1,0,0,0,0,0,1,0
2,0.156155,-0.295973,-0.968084,-0.604196,6.414322,0.051481,0.269255,0,0,0,...,0,1,1,0,0,0,0,0,1,0
3,-1.169323,-0.172311,-0.968084,-0.604196,6.414322,0.051481,-0.266687,0,0,0,...,0,1,0,0,0,0,1,1,0,0
4,-1.169323,0.652099,-0.968084,-0.604196,6.414322,0.051481,-0.266687,0,0,0,...,0,1,0,0,0,0,1,1,0,0


### Make a pickle of the transformed data

In [21]:
import pickle

feature_engineered_trans_data2 = transformed_df

with open(r'..\data\flight_data_engineered_trans2.pickle','wb') as flight_data_file:
      pickle.dump(feature_engineered_trans_data2, flight_data_file)

# Iteration 3

### Load Feature Engineered Data

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
sns.set() # Setting seaborn as default style even if use only matplotlib
pd.set_option('display.max_rows', None)

In [24]:
import pickle

with open(r'..\data\flight_data_engineered3.pickle','rb') as flight_data_file:
     df_flights = pickle.load(flight_data_file)

In [25]:
df_flights = df_flights.reset_index(drop=True)

In [26]:
df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99023 entries, 0 to 99022
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   mean_flight_num_delay  99023 non-null  float64
 1   arr_delay              99023 non-null  float64
dtypes: float64(2)
memory usage: 1.5 MB


### Check for Missing Values

In [27]:
sum(df_flights.isnull().sum())

0

### Fix DataTypes

In [28]:
df_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99023 entries, 0 to 99022
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   mean_flight_num_delay  99023 non-null  float64
 1   arr_delay              99023 non-null  float64
dtypes: float64(2)
memory usage: 1.5 MB


In [None]:
# all good

### Transform Categorical Variables into Numeric Data

In [30]:
# # transform nominal variables into Dummy Variables

# cat_feats = df_flights.dtypes[(df_flights.dtypes == 'object') | (df_flights.dtypes == 'category') ].index.tolist()
# df_dummy = pd.get_dummies(df_flights[cat_feats])
# df_dummy.head()

In [None]:
# # Map values onto numbers for bool variables

# df_flights.replace({
#     "is_weekend" : {True : 1, False : 0}, 
#     "is_holiday" : {True : 1, False : 0},
#     }, inplace=True)

In [None]:
# drop the nominal variables from the original dataset

# numeric_df = df_flights.drop(columns=cat_feats)


In [None]:
# scale the numeric data

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(numeric_df), columns=numeric_df.columns)
scaled_data.head()

Unnamed: 0,crs_elapsed_time,arr_delay,mean_arr_delay,is_weekend,is_holiday,mean_origin_airport_delay,mean_dest_airport_delay
0,0.016631,0.281114,-0.968084,-0.604196,6.414322,0.051481,1.330524
1,0.156155,0.837591,-0.968084,-0.604196,6.414322,0.051481,0.269255
2,0.156155,-0.295973,-0.968084,-0.604196,6.414322,0.051481,0.269255
3,-1.169323,-0.172311,-0.968084,-0.604196,6.414322,0.051481,-0.266687
4,-1.169323,0.652099,-0.968084,-0.604196,6.414322,0.051481,-0.266687


In [None]:
# merge the scaled and dummy variables into one dataset

transformed_df = scaled_data.merge(df_dummy, how='outer', left_index = True, right_index = True)
transformed_df.head()

Unnamed: 0,crs_elapsed_time,arr_delay,mean_arr_delay,is_weekend,is_holiday,mean_origin_airport_delay,mean_dest_airport_delay,op_unique_carrier_9E,op_unique_carrier_9K,op_unique_carrier_AA,...,origin_region_South,origin_region_West,dest_region_Midwest,dest_region_Northeast,dest_region_Other,dest_region_South,dest_region_West,haul_short,haul_medium,haul_long
0,0.016631,0.281114,-0.968084,-0.604196,6.414322,0.051481,1.330524,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.156155,0.837591,-0.968084,-0.604196,6.414322,0.051481,0.269255,0,0,0,...,0,1,1,0,0,0,0,0,1,0
2,0.156155,-0.295973,-0.968084,-0.604196,6.414322,0.051481,0.269255,0,0,0,...,0,1,1,0,0,0,0,0,1,0
3,-1.169323,-0.172311,-0.968084,-0.604196,6.414322,0.051481,-0.266687,0,0,0,...,0,1,0,0,0,0,1,1,0,0
4,-1.169323,0.652099,-0.968084,-0.604196,6.414322,0.051481,-0.266687,0,0,0,...,0,1,0,0,0,0,1,1,0,0


### Make a pickle of the transformed data

In [None]:
import pickle

feature_engineered_trans_data2 = transformed_df

with open(r'..\data\flight_data_engineered_trans2.pickle','wb') as flight_data_file:
      pickle.dump(feature_engineered_trans_data2, flight_data_file)