# Prep procedure

### Import Modules

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import datetime
import numpy as np
from sklearn import preprocessing 

### Import data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
#Import data
#path = '/content/drive/My Drive/flights_data/'
path = '/content/drive/My Drive/PERSO/LIGHTHOUSE LABS/MIDTERM/data/'
df = pd.read_csv(str(path)+"flights_sample_10.csv",low_memory=False)

## Prep Data

In [None]:
print(df.columns)

Index(['Unnamed: 0', 'fl_date', 'mkt_unique_carrier', 'branded_code_share',
       'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime', 'no_name'],
      dtype='object')


In [None]:
dfF = df[['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time','dep_delay',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance','arr_delay','carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay','cancelled','taxi_in', 'taxi_out','diverted']].copy()

In [None]:
total = dfF.isnull().sum().sort_values(ascending=False)
percent = (dfF.isnull().sum()/dfF.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
late_aircraft_delay,1291631,0.8111
security_delay,1291631,0.8111
nas_delay,1291631,0.8111
weather_delay,1291631,0.8111
carrier_delay,1291631,0.8111
arr_delay,31241,0.019618
taxi_in,28187,0.0177
taxi_out,27433,0.017227
dep_delay,26581,0.016692
tail_num,5008,0.003145


In [112]:
#dfF = dfF[dfF.dep_delay.notnull()]
#dfF = dfF[dfF.arr_delay.notnull()]

## Feature engineering

### Convert delays to Bool

In [None]:
#list of each delay type
delaytypes = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

#creates new bool column for each delay type, where 1 if that delay type is the max of all delays AND is > 0
for delay in delaytypes:
    dfF['is'+delay] = np.where((dfF[delaytypes].idxmax(axis=1) == delay)&(dfF[delay]>0), 1, 0)


In [None]:
# column that shows which delay it is (if any)

dfF['delay_type'] = df[delaytypes].idxmax(axis="columns")

In [None]:
delay_dict = {'carrier_delay':1, 'weather_delay':2, 'nas_delay':3, 'security_delay':4, 'late_aircraft_delay':5}
dfF['delay_type'] = dfF['delay_type'].map(delay_dict)


### Create flight number

In [None]:
dfF['flight_number'] = dfF['mkt_unique_carrier']+dfF['mkt_carrier_fl_num'].astype(str)+dfF['origin']+dfF['dest']

In [None]:
dfF['flight_number'].value_counts()

UA1768IAHTPA    98
NK446LASORD     95
UA778IADMCO     95
UA773DENSMF     93
HA64LIHLAX      93
                ..
WN4802SFOSAN     1
DL496SLCSEA      1
AA4958PHLALB     1
WN670BWIMSY      1
WN415SJCONT      1
Name: flight_number, Length: 280824, dtype: int64

In [None]:
print('Number of different flight_numbers in train set : ' +str(len(dfF['flight_number'].value_counts())))

Number of different flight_numbers in train set : 280824


In [None]:
dfF.flights.describe()

count    1592444.0
mean           1.0
std            0.0
min            1.0
25%            1.0
50%            1.0
75%            1.0
max            1.0
Name: flights, dtype: float64

### Convert fly_date to Month / Day / Day of year / Day of Week / Dep Hour and Arr Hour

In [None]:
dfF.fl_date = pd.to_datetime(dfF.fl_date, format='%Y-%m-%d')

dfF['fl_month'] = pd.DatetimeIndex(dfF.fl_date).month
dfF['fl_day'] = pd.DatetimeIndex(dfF.fl_date).day
dfF['fl_day_of_year'] = dfF.fl_date.dt.dayofyear
dfF['fl_day_of_week'] = dfF.fl_date.dt.dayofweek

dfF['crs_dep_time'] = dfF['crs_dep_time'].apply(lambda x: '{0:0>4}'.format(x))
dfF['crs_arr_time'] = dfF['crs_arr_time'].apply(lambda x: '{0:0>4}'.format(x))

dfF['dep_hour'] = dfF['crs_dep_time'].str[:2]
dfF['arr_hour'] = dfF['crs_arr_time'].str[:2]

### Create short, med, long haul duration type

In [None]:
# duration type (eg short, med, long haul)
bins = [-100, 180, 360, 2000]
labels = [1,2,3]
dfF['duration_type'] = pd.cut(dfF['crs_elapsed_time'], bins=bins, labels=labels)

In [None]:
#dfF

## Day section

In [None]:
hour_to_daysection = {'00': 0,
                      '01': 0,
                      '02': 0,
                      '03': 0,
                      '04': 0,
                      '05': 0,
                      '06': 1,
                      '07': 1,
                      '08': 1,
                      '09': 1,
                      '10': 1,
                      '11': 2,
                      '12': 2,
                      '13': 2,
                      '14': 2,
                      '15': 2,
                      '16': 2,
                      '17': 3,
                      '18': 3,
                      '19': 3,
                      '20': 4,
                      '21': 4,
                      '22': 4,
                      '23': 4,
                      '24': 4}

# Night = 0, Morning rush = 1, Midday = 2, Afternoon rush = 3, Evening = 4

dfF['dep_daysection'] = dfF['dep_hour'].map(hour_to_daysection)

dfF['arr_daysection'] = dfF['arr_hour'].map(hour_to_daysection)

## Season

In [None]:
month_to_season = {1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 1}
dfF['fl_season'] = dfF['fl_month'].map(month_to_season)

In [None]:
#dfF

## Origin and Destination State

In [None]:
dfF['origin_state'] = dfF['origin_city_name'].str[-2:]
dfF['dest_state'] = dfF['dest_city_name'].str[-2:]

In [None]:
#dfF

# Aggregate statistics values

## Carrier Volume


In [None]:
# total volume at origin and dest airport
dfF['origin_airport_vol'] = dfF.groupby(['origin_airport_id'])['flights'].transform(np.sum)
dfF['dest_airport_vol'] = dfF.groupby(['origin_airport_id'])['flights'].transform(np.sum)

# how much the airline operates out of that airport
dfF['carrier_origin_apt_vol'] = dfF.groupby(['origin_airport_id','op_unique_carrier'])['flights'].transform(np.sum)
dfF['carrier_dest_apt_vol'] = dfF.groupby(['dest_airport_id','op_unique_carrier'])['flights'].transform(np.sum)

# how much the airline operates out of that airport as pct
dfF['carrier_origin_apt_pct'] = dfF['carrier_origin_apt_vol'] / dfF['origin_airport_vol'] * 100
dfF['carrier_dest_apt_pct'] = dfF['carrier_dest_apt_vol'] / dfF['dest_airport_vol'] * 100

## Mean Carrier Delay

In [None]:
dfF['mean_carrier_arr_delay'] = dfF.groupby(['op_unique_carrier'])['arr_delay'].transform(np.mean)


## Carrier Delay per month

In [115]:
stattype = [('mean', np.mean), ('median', np.median), ('std',np.std)]

for stat in stattype:
  # arr per airline per month
  dfF[stat[0]+'_carrier_arr_delay_month'] = dfF.groupby(['op_unique_carrier','fl_month'])['arr_delay'].transform(stat[1])

  # dep per airline per month
  dfF[stat[0]+'_carrier_dep_delay_month'] = dfF.groupby(['op_unique_carrier','fl_month'])['dep_delay'].transform(stat[1])

In [116]:
## Mean Carrier-Airport-(DEP and ARR) delay per (DEP and ARR) hour

stattype = [('mean', np.mean), ('median', np.median), ('std',np.std)]

for stat in stattype:
  dfF[stat[0]+'_carrier_origin_dep-hour_dep_delay'] = dfF.groupby(['op_unique_carrier','origin','dep_hour'])['dep_delay'].transform(stat[1])
  dfF[stat[0]+'_carrier_dest_fl-hour_delay'] = dfF.groupby(['op_unique_carrier','dest','arr_hour'])['arr_delay'].transform(stat[1])

In [117]:
## Mean DEP and ARR delay for Carrier routes per month per day of the week per day section. 

stattype = [('mean', np.mean), ('median', np.median), ('std',np.std)]

for stat in stattype:
  dfF[stat[0]+'_carrier_route_month_dow_dayS_dep-delay'] = dfF.groupby(['op_unique_carrier','origin','dest','fl_month','fl_day_of_week','dep_daysection'])['dep_delay'].transform(stat[1])
  dfF[stat[0]+'_carrier_route_month_dow_dayS_arr-delay'] = dfF.groupby(['op_unique_carrier','origin','dest','fl_month','fl_day_of_week','arr_daysection'])['arr_delay'].transform(stat[1])



In [None]:
#dfF

In [118]:
#means
stattype = [('mean', np.mean), ('median', np.median), ('std',np.std)]

for stat in stattype:
  #by month
  dfF[stat[0]+'_route_arr_delay_month'] = dfF.groupby(['origin','dest','fl_month'])['arr_delay'].transform(stat[1])
  dfF[stat[0]+'_route_dep_delay_month'] = dfF.groupby(['origin','dest','fl_month'])['dep_delay'].transform(stat[1])
  
  #by day of week
  dfF[stat[0]+'_route_arr_delay_dow'] = dfF.groupby(['origin','dest','fl_day_of_week'])['arr_delay'].transform(stat[1])
  dfF[stat[0]+'_route_dep_delay_dow'] = dfF.groupby(['origin','dest','fl_day_of_week'])['dep_delay'].transform(stat[1])

  #by departure day section
  dfF[stat[0]+'_route_arr_delay_dep_hour'] = dfF.groupby(['origin','dest','dep_daysection'])['arr_delay'].transform(stat[1])
  dfF[stat[0]+'_route_dep_delay_dep_hour'] = dfF.groupby(['origin','dest','dep_daysection'])['dep_delay'].transform(stat[1])

  #by arrival day section
  dfF[stat[0]+'_route_arr_delay_arr_hour'] = dfF.groupby(['origin','dest','arr_daysection'])['arr_delay'].transform(stat[1])
  dfF[stat[0]+'_route_dep_delay_arr_hour'] = dfF.groupby(['origin','dest','arr_daysection'])['dep_delay'].transform(stat[1])


## Delay by Route/Time of Year

In [None]:
## Origin weather delay per month PCT
dfF['origin_month_weather_delay_pct'] = dfF.groupby(['origin','fl_month'])['isweather_delay'].transform(np.sum) / dfF.groupby(['origin','fl_month'])['flights'].transform(np.sum) * 100
## Dest weather delay per month PCT 
dfF['dest_month_weather_delay_pct'] = dfF.groupby(['dest','fl_month'])['isweather_delay'].transform(np.sum) / dfF.groupby(['dest','fl_month'])['flights'].transform(np.sum) * 100
## Route weather delay per month PCT
dfF['route_month_weather_delay_pct'] = dfF.groupby(['origin','dest','fl_month'])['isweather_delay'].transform(np.sum) / dfF.groupby(['origin','dest','fl_month'])['flights'].transform(np.sum) * 100

## Delay by Time of Day

In [None]:
#list of each delay type
delaytypes = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
times = ['arr_daysection', 'dep_daysection']
#creates feature for % of each delay type for time of day
for delay in delaytypes:
  for time in times:
    dfF[time+'_'+delay+'_pct'] = dfF.groupby([time])['is'+delay].transform(np.sum) / dfF.groupby([time])['flights'].transform(np.sum) * 100

#dropping arrival time vs security and late aircraft delay
dfF = dfF.drop(['arr_daysection_security_delay_pct','arr_daysection_late_aircraft_delay_pct'],axis=1)

In [113]:
#dfF

## Delay Type vs Carrier

In [None]:
#list of each delay type
delaytypes = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

#creates feature for % of each delay type for each carrier
for delay in delaytypes:
    dfF['carrier_'+delay+'_pct'] = dfF.groupby(['op_unique_carrier'])['is'+delay].transform(np.sum) / dfF.groupby(['op_unique_carrier'])['flights'].transform(np.sum) * 100



In [None]:
#list of each delay type
delaytypes = ['carrier_delay', 'late_aircraft_delay']
#creates fields for each delay type
for delay in delaytypes:
    dfF['carrier_route_month_'+delay] = dfF.groupby(['op_unique_carrier','origin','dest','fl_month'])['is'+delay].transform(np.sum) / dfF.groupby(['op_unique_carrier','origin','dest','fl_month'])['flights'].transform(np.sum) * 100

In [120]:
#dfF

## Origin and destination and route month weekday daysection NAS delay PCT


In [None]:
#list of each delay type
delaytypes = ['nas_delay']
#creates fields for each delay type
for delay in delaytypes:
    dfF['origin_month_weekday_daysection_'+delay] = dfF.groupby(['origin','fl_month','fl_day_of_week','dep_daysection'])['is'+delay].transform(np.sum) / dfF.groupby(['origin','fl_month','fl_day_of_week','dep_daysection'])['flights'].transform(np.sum) * 100
for delay in delaytypes:
    dfF['dest_month_weekday_daysection_'+delay] = dfF.groupby(['dest','fl_month','fl_day_of_week','arr_daysection'])['is'+delay].transform(np.sum) / dfF.groupby(['dest','fl_month','fl_day_of_week','arr_daysection'])['flights'].transform(np.sum) * 100
for delay in delaytypes:
    dfF['route_month_weekday_daysection_'+delay] = dfF.groupby(['origin','dest','fl_month','fl_day_of_week','dep_daysection'])['is'+delay].transform(np.sum) / dfF.groupby(['origin','dest','fl_month','fl_day_of_week','dep_daysection'])['flights'].transform(np.sum) * 100

# Columns dropping stars here
## Convert categorical data

### Get dummies

In [None]:
#Get dummies
df_dummy = pd.get_dummies(dfF[["op_unique_carrier"]])
df_dummy.shape
# Merge into DF
dfF = pd.concat([dfF,df_dummy],axis=1).drop(["op_unique_carrier",'op_unique_carrier_9E'],axis=1)

#Get dummies
#df_dummy = pd.get_dummies(dfFT[["op_unique_carrier"]])
#df_dummy.shape
# Merge into DF
#dfFT = pd.concat([dfFT,df_dummy],axis=1).drop(["op_unique_carrier",'op_unique_carrier_9E'],axis=1)

KeyError: ignored

### Drop columns

In [None]:
list(dfF.columns)

['fl_date',
 'mkt_unique_carrier',
 'branded_code_share',
 'mkt_carrier',
 'mkt_carrier_fl_num',
 'tail_num',
 'op_carrier_fl_num',
 'origin_airport_id',
 'origin',
 'origin_city_name',
 'dest_airport_id',
 'dest',
 'dest_city_name',
 'crs_dep_time',
 'dep_delay',
 'crs_arr_time',
 'dup',
 'crs_elapsed_time',
 'flights',
 'distance',
 'arr_delay',
 'carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay',
 'cancelled',
 'taxi_in',
 'taxi_out',
 'diverted',
 'iscarrier_delay',
 'isweather_delay',
 'isnas_delay',
 'issecurity_delay',
 'islate_aircraft_delay',
 'delay_type',
 'flight_number',
 'fl_month',
 'fl_day',
 'fl_day_of_year',
 'fl_day_of_week',
 'dep_hour',
 'arr_hour',
 'duration_type',
 'dep_daysection',
 'arr_daysection',
 'fl_season',
 'origin_state',
 'dest_state',
 'origin_airport_vol',
 'dest_airport_vol',
 'carrier_origin_apt_vol',
 'carrier_dest_apt_vol',
 'carrier_origin_apt_pct',
 'carrier_dest_apt_pct',
 'mean_carrier_arr_delay',
 'me

In [None]:
dfF1 = dfF

In [None]:
#dfF1 = dfF1.select_dtypes('number').dropna(axis='columns')

In [None]:
#dfF1

In [None]:
dfF1 = dfF1.drop([
'carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay',                             
'iscarrier_delay',
 'isweather_delay',
 'isnas_delay',
 'issecurity_delay',
 'islate_aircraft_delay'],axis=1)

## Deal with Naans

In [None]:
total = dfF1.isnull().sum().sort_values(ascending=False)
percent = (dfF1.isnull().sum()/dfF1.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(50)

Unnamed: 0,Total,Percent
delay_type,1291631,0.8111
std_carrier_route_month_dow_dayS_arr-delay,677141,0.425221
std_carrier_route_month_dow_dayS_dep-delay,656026,0.411962
arr_delay,31241,0.019618
taxi_in,28187,0.0177
taxi_out,27433,0.017227
dep_delay,26581,0.016692
mean_carrier_route_month_dow_dayS_arr-delay,14157,0.00889
median_carrier_route_month_dow_dayS_arr-delay,14157,0.00889
median_carrier_route_month_dow_dayS_dep-delay,11801,0.007411


In [None]:
# Remove Nans
dfF1 = dfF1[dfF1['delay_type'].notnull()]
dfF1 = dfF1[dfF1['duration_type'].notnull()]
# dfF1 = dfF1[dfF1['crs_elapsed_time'].notnull()]

In [None]:
dfF1 = dfF1[dfF1['taxi_in'].notnull()]
dfF1 = dfF1[dfF1['taxi_out'].notnull()]
dfF1 = dfF1[dfF1['dep_delay'].notnull()]
dfF1 = dfF1[dfF1['mean_carrier_route_month_dow_dayS_dep-delay'].notnull()]
dfF1 = dfF1[dfF1['median_carrier_route_month_dow_dayS_dep-delay'].notnull()]

In [None]:
# Keep numerical data only
dfF1 = dfF1.select_dtypes('number')
# Remove Columns with Nans
dfF1 = dfF1.select_dtypes('number').dropna(axis='columns')

In [None]:
y = dfF1.delay_type
df_numeric = dfF1.drop(["delay_type"],axis=1)

from sklearn.feature_selection import VarianceThreshold
# remove features with small variance
vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_numeric)
selected_columns = df_numeric.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_transformed = pd.DataFrame(df_transformed, columns = selected_columns)

In [None]:
# remove highly correlated pairs
df_corr = df_transformed.corr().abs()
indices = np.where(df_corr > 0.9)
indices = [(df_corr.index[x], df_corr.columns[y]) for x, y in zip(*indices)
              if x != y and x < y]
for idx in indices:
    try:
        df_transformed.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

In [None]:
# forward regression
from sklearn.feature_selection import f_regression, SelectKBest
skb = SelectKBest(f_regression, k=30)
X = skb.fit_transform(df_transformed, y)
X = pd.DataFrame(X,columns=df_transformed.columns[skb.get_support()])
X.columns

Index(['mkt_carrier_fl_num', 'origin_airport_id', 'crs_elapsed_time',
       'taxi_out', 'dep_daysection', 'arr_daysection', 'origin_airport_vol',
       'carrier_origin_apt_vol', 'carrier_dest_apt_vol',
       'carrier_origin_apt_pct', 'carrier_dest_apt_pct',
       'mean_carrier_arr_delay_month',
       'mean_carrier_origin_dep-hour_dep_delay',
       'mean_carrier_dest_fl-hour_delay',
       'median_carrier_origin_dep-hour_dep_delay',
       'median_carrier_dest_fl-hour_delay', 'mean_route_arr_delay_dep_hour',
       'mean_route_arr_delay_arr_hour', 'median_route_arr_delay_dep_hour',
       'median_route_dep_delay_dep_hour', 'median_route_arr_delay_arr_hour',
       'median_route_dep_delay_arr_hour', 'route_month_weather_delay_pct',
       'arr_daysection_carrier_delay_pct', 'arr_daysection_nas_delay_pct',
       'dep_daysection_nas_delay_pct', 'carrier_late_aircraft_delay_pct',
       'carrier_route_month_carrier_delay',
       'carrier_route_month_late_aircraft_delay',
       'rou

In [121]:
#X = X.drop('dep_delay',axis=1)

# Modelling

In [None]:
# Find 50 features


In [None]:
# Assign X and y
#X = dfF1.drop('delay_type',axis=1)
#y = dfF1['delay_type']

In [None]:
# Data test train Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.80,random_state=1,stratify=y)


In [None]:
X_train

Unnamed: 0,mkt_carrier_fl_num,origin_airport_id,crs_elapsed_time,taxi_out,dep_daysection,arr_daysection,origin_airport_vol,carrier_origin_apt_vol,carrier_dest_apt_vol,carrier_origin_apt_pct,carrier_dest_apt_pct,mean_carrier_arr_delay_month,mean_carrier_origin_dep-hour_dep_delay,mean_carrier_dest_fl-hour_delay,median_carrier_origin_dep-hour_dep_delay,median_carrier_dest_fl-hour_delay,mean_route_arr_delay_dep_hour,mean_route_arr_delay_arr_hour,median_route_arr_delay_dep_hour,median_route_dep_delay_dep_hour,median_route_arr_delay_arr_hour,median_route_dep_delay_arr_hour,route_month_weather_delay_pct,arr_daysection_carrier_delay_pct,arr_daysection_nas_delay_pct,dep_daysection_nas_delay_pct,carrier_late_aircraft_delay_pct,carrier_route_month_carrier_delay,carrier_route_month_late_aircraft_delay,route_month_weekday_daysection_nas_delay
20723,5712.0,14747.0,134.0,13.0,2.0,3.0,37819.0,2827.0,1322.0,7.475079,3.495597,1.760079,14.549206,-1.094595,-2.0,-10.5,3.685333,-0.913420,-6.0,-2.0,-5.0,-2.0,1.020408,5.153119,7.189542,6.209534,8.345150,15.789474,5.263158,0.000000
222819,1813.0,14100.0,138.0,24.0,2.0,3.0,31102.0,9824.0,2466.0,31.586393,7.928751,2.299194,10.069079,19.680000,-3.0,3.0,-0.991170,5.053942,-9.0,-2.0,-4.0,0.0,0.000000,5.153119,7.189542,6.209534,7.184755,9.677419,12.903226,0.000000
128664,494.0,11057.0,58.0,30.0,4.0,4.0,50167.0,19588.0,2013.0,39.045588,4.012598,5.091603,16.526633,12.518717,0.0,0.0,17.042553,17.042553,3.0,1.0,3.0,1.0,0.000000,6.273976,6.307453,4.593159,7.184755,2.040816,2.040816,100.000000
43872,4323.0,14831.0,145.0,17.0,2.0,3.0,13688.0,6688.0,13926.0,48.860316,101.738749,6.584201,10.250000,10.073544,1.0,0.0,4.077236,-1.072289,-6.0,-1.0,-8.0,-1.0,0.000000,5.153119,7.189542,6.209534,8.783435,3.448276,13.793103,16.666667
189093,4515.0,14100.0,71.0,33.0,3.0,3.0,31102.0,3772.0,6626.0,12.127837,21.304096,7.606678,7.529880,10.739336,-4.0,-7.0,14.062500,6.154639,-9.5,-2.0,-11.0,-3.0,0.000000,5.153119,7.189542,6.870966,6.325297,0.000000,0.000000,50.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280229,2410.0,10397.0,137.0,27.0,2.0,3.0,78303.0,48800.0,1128.0,62.322006,1.440558,-1.367736,6.801622,15.835616,-1.0,3.0,24.194444,29.652542,1.0,5.0,7.0,11.0,1.000000,5.153119,7.189542,6.209534,4.222974,0.000000,2.127660,33.333333
60166,1114.0,14683.0,195.0,11.0,2.0,2.0,8459.0,3334.0,14168.0,39.413642,167.490247,6.968599,8.188755,5.642066,0.0,-5.0,-6.312500,-7.232877,-8.0,-1.0,-11.0,-1.0,0.000000,4.571698,5.329035,6.209534,8.783435,15.000000,20.000000,0.000000
44028,3166.0,11298.0,53.0,60.0,3.0,3.0,58492.0,2642.0,52.0,4.516857,0.088901,8.128602,18.772727,44.571429,-4.0,12.0,42.882353,12.656716,12.0,-3.0,-3.0,-4.0,0.000000,5.153119,7.189542,6.870966,7.525156,20.000000,0.000000,100.000000
201778,572.0,13891.0,65.0,6.0,2.0,2.0,4470.0,2408.0,14345.0,53.870246,320.917226,5.650779,13.384615,6.054254,3.0,-3.0,6.247059,3.000000,-4.0,1.0,-5.0,0.0,4.347826,4.571698,5.329035,6.209534,8.783435,0.000000,13.043478,0.000000


## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

regr = RandomForestClassifier(n_estimators=60, max_depth=10, random_state=0)
regr.fit(X_train,y_train)
#LAST PARAMS : 50/10

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
y_pred=regr.predict(X_test)
y_pred

array([5, 3, 3, ..., 3, 3, 3])

In [None]:
#Accuracy Score
regr.score(X_test, y_pred)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

0.7837114156466205
              precision    recall  f1-score   support

           1       0.79      0.57      0.67     16130
           2       0.96      0.22      0.36      1969
           3       0.81      0.99      0.89     18391
           4       1.00      0.02      0.03       124
           5       0.75      0.82      0.78     23514

    accuracy                           0.78     60128
   macro avg       0.86      0.52      0.55     60128
weighted avg       0.79      0.78      0.77     60128

[[ 9244     2  1659     0  5225]
 [  495   434   238     0   802]
 [   15     0 18213     0   163]
 [   41     9     7     2    65]
 [ 1864     6  2414     0 19230]]


In [None]:
pd.Series(y_pred).describe()

count    60128.000000
mean         3.559606
std          1.467066
min          1.000000
25%          3.000000
50%          3.000000
75%          5.000000
max          5.000000
dtype: float64

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [None]:
#train model
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
#Accuracy Score
regr.score(X_test, y_pred)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

0.6925059872272485
              precision    recall  f1-score   support

         1.0       0.58      0.64      0.61     16135
         2.0       0.22      0.18      0.20      2042
         3.0       0.80      0.89      0.84     18318
         4.0       0.00      0.02      0.01       108
         5.0       0.74      0.62      0.68     23525

    accuracy                           0.69     60128
   macro avg       0.47      0.47      0.47     60128
weighted avg       0.70      0.69      0.69     60128

[[10327   380  1484   117  3827]
 [  676   374   378    39   575]
 [  955   300 16253    75   735]
 [   50     1     9     2    46]
 [ 5779   667  2086   310 14683]]


##XGBOOST

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

data_dmatrix = xgb.DMatrix(data=X.astype(int),label=y.astype(int))



In [None]:
#create train/test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.astype(int), y.astype(int), test_size=0.2, random_state=123)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
xg_reg.fit(X_train,y_train)

y_preds = xg_reg.predict(X_test)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
print("RMSE: %f" % (rmse))

RMSE: 1.751816


In [None]:
#Accuracy Score
regr.score(X_test, y_pred)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

#Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

0.3143294305481639
              precision    recall  f1-score   support

           1       0.27      0.30      0.29     16130
           2       0.04      0.03      0.03      1969
           3       0.31      0.34      0.32     18391
           4       0.01      0.03      0.01       124
           5       0.39      0.33      0.36     23514

    accuracy                           0.31     60128
   macro avg       0.20      0.21      0.20     60128
weighted avg       0.32      0.31      0.32     60128

[[4854  442 5404  141 5289]
 [ 585   62  674   17  631]
 [5409  504 6190  177 6111]
 [  31    4   40    4   45]
 [6908  710 7902  204 7790]]


# Dealing with Ram

In [None]:

import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

#say a data frame is taking up a lot and it happens to be one you don't need anymore you can do:
#del df

[('dfF', 2602606480),
 ('df', 1542050435),
 ('dfFT', 1007882019),
 ('df_test', 507999370),
 ('dfF1', 216740560),
 ('X', 204255024),
 ('ohe_df', 140135216),
 ('transformed', 140135184),
 ('X_train', 129849528),
 ('dfFT1', 111647668),
 ('y', 66914128),
 ('X_test', 32462472),
 ('y_train', 19976864),
 ('df_dummy', 16514044),
 ('y_test', 4994240),
 ('y_pred', 2497208),
 ('airports', 3072),
 ('RandomForestRegressor', 2008),
 ('missing_data', 1369),
 ('percent', 1265),
 ('total', 1265),
 ('LabelBinarizer', 1192),
 ('OneHotEncoder', 1192),
 ('hour_to_daysection', 1176),
 ('LabelEncoder', 1064),
 ('MultiColumnLabelEncoder', 1064),
 ('Pipeline', 1064),
 ('fruit_data', 672),
 ('month_to_season', 640),
 ('defaultdict', 416),
 ('mean_absolute_error', 136),
 ('mean_squared_error', 136),
 ('r2_score', 136),
 ('train_test_split', 136),
 ('delaytypes', 96),
 ('bins', 88),
 ('labels', 80),
 ('np', 72),
 ('pd', 72),
 ('preprocessing', 72),
 ('c', 69),
 ('delay', 68),
 ('path', 63),
 ('carrier_encoder', 4

In [None]:
del dfF_encodertest
del dfF_encoded
del df_test
del dfF1