In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
import matplotlib.pyplot as pyplot
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import VarianceThreshold
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [4]:
# Read flights_processed.csv into a DataFrame and show its (rows, columns) shape.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
data = pd.read_csv(
    filepath_or_buffer='/Users/user/Documents/Github/flightstatus/data/dev/flights_processed.csv'
)
data.shape

(5819079, 58)

In [5]:
# Share of rows in each TARGET class: per-class counts divided by the total row count.
(
    data['TARGET']
    .value_counts()
    .div(len(data))
)

0    0.662225
1    0.337775
Name: TARGET, dtype: float64

In [20]:
# Tally how often each distinct value of ORIGIN_AIRPORT_INTL occurs.
data.loc[:, 'ORIGIN_AIRPORT_INTL'].value_counts()

0    5819079
Name: ORIGIN_AIRPORT_INTL, dtype: int64

In [23]:
# Show the last five rows of the origin flag next to the airport code and
# the airport name columns.
data.loc[
    :,
    ['ORIGIN_AIRPORT_INTL', 'ORIGIN_AIRPORT', 'AIRPORT_ORIGIN'],
].tail(5)

Unnamed: 0,ORIGIN_AIRPORT_INTL,ORIGIN_AIRPORT,AIRPORT_ORIGIN
5819074,0,LAX,Los Angeles International Airport
5819075,0,JFK,John F. Kennedy International Airport (New Yor...
5819076,0,JFK,John F. Kennedy International Airport (New Yor...
5819077,0,MCO,Orlando International Airport
5819078,0,JFK,John F. Kennedy International Airport (New Yor...


In [8]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,DATE,DATE_TRAFFIC,MONTH_TRAFFIC,AIRLINE_TRAFFIC,ORIGIN_AIRPORT_INTL,ORIGIN_AIRPORT_TRAFFIC,DESTINATION_AIRPORT_INTL,DESTINATION_AIRPORT_TRAFFIC,LONGITUDE_DIF,LATITUDE_DIF
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,2015-01-01,13950,469968,172521,0,32006.0,0,221806.0,-27.68688,13.72534
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,2015-01-01,13950,469968,725984,0,389369.0,0,45147.0,-38.31248,7.25938
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,2015-01-01,13950,469968,198715,0,295974.0,0,200646.0,-41.43171,2.40499
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,2015-01-01,13950,469968,725984,0,389369.0,0,138665.0,-38.11751,8.14929
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,2015-01-01,13950,469968,172521,0,221806.0,0,32006.0,27.68688,-13.72534


In [9]:
# List every column label of the loaded frame.
data.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'AIRPORT_ORIGIN', 'CITY_ORIGIN',
       'STATE_ORIGIN', 'LATITUDE_ORIGIN', 'LONGITUDE_ORIGIN',
       'AIRPORT_DESTINATION', 'CITY_DESTINATION', 'STATE_DESTINATION',
       'LATITUDE_DESTINATION', 'LONGITUDE_DESTINATION', 'HOLIDAY', 'TARGET',
       'AIRLINE_CAT', 'ORIGIN_AIRPORT_CAT', 'DESTINATION_AIRPORT_CAT',
       'HOLIDAY_FLAG', 'WEEKEND', 'DATE', 'DATE_TRAFFIC', 'MONTH_TRAFFIC',
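       'AIRLINE_TRAFFIC', 'ORIGIN_AIRPORT_INTL', 'ORIGIN_AIRPORT_TRAFFIC',
       'DESTINATION_AIRPORT_INTL', 'DESTINATION_AIRPORT_TRAFFIC',
       'LONGITUDE_DIF', 'LATITUDE_DIF'],
      dtype='object')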

In [None]:
# Column names flagged as not to be used.
# NOTE(review): 'ORIGIN_STATE', 'WEATHER_DELAY_CNT' and 'WEATHER_DELAY_FLAG'
# do not appear among the columns displayed for `data` — confirm they are
# still meaningful for this dataset.
NOT_USED = [
    'SCHEDULED_TIME',
    'WEATHER_DELAY',
    'CANCELLATION_REASON',
    'CANCELLED',
    'ORIGIN_STATE',
    'WEATHER_DELAY_CNT',
    'WEATHER_DELAY_FLAG',
]

In [12]:
# Names of all columns whose dtype pandas classifies as numeric, in frame order.
# NOTE(review): the resulting list contains 'TARGET'; confirm it is removed
# before NUM_VAR is used as a set of model features.
NUM_VAR = [
    col_name
    for col_name, col_dtype in data.dtypes.items()
    if is_numeric_dtype(col_dtype)
]

In [13]:
# Display the numeric column names collected in NUM_VAR.
NUM_VAR

['YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'FLIGHT_NUMBER',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'TAXI_OUT',
 'WHEELS_OFF',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'WHEELS_ON',
 'TAXI_IN',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'AIR_SYSTEM_DELAY',
 'SECURITY_DELAY',
 'AIRLINE_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'WEATHER_DELAY',
 'LATITUDE_ORIGIN',
 'LONGITUDE_ORIGIN',
 'LATITUDE_DESTINATION',
 'LONGITUDE_DESTINATION',
 'TARGET',
 'AIRLINE_CAT',
 'ORIGIN_AIRPORT_CAT',
 'DESTINATION_AIRPORT_CAT',
 'HOLIDAY_FLAG',
 'WEEKEND',
 'DATE_TRAFFIC',
 'MONTH_TRAFFIC',
 'AIRLINE_TRAFFIC',
 'ORIGIN_AIRPORT_INTL',
 'ORIGIN_AIRPORT_TRAFFIC',
 'DESTINATION_AIRPORT_INTL',
 'DESTINATION_AIRPORT_TRAFFIC',
 'LONGITUDE_DIF',
 'LATITUDE_DIF']

In [10]:
# Inspect the dtype of SCHEDULED_TIME. The previous line ended in a dangling
# `==` with no right-hand operand, which is a SyntaxError.
data['SCHEDULED_TIME'].dtype

dtype('float64')

In [None]:
# Hold out 30% of the rows for testing, stratifying on TARGET so both
# partitions keep the same class ratio; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    data,
    test_size=0.3,
    random_state=12,
    stratify=data['TARGET'],
)

In [15]:
# Fit a variance filter (threshold 0.0, the library default) on the numeric
# columns. Missing values are replaced by -1 before fitting — presumably
# because the selector cannot be fit on NaNs; confirm.
# NOTE(review): the fit uses the full `data` frame rather than `df_train`.
selector = VarianceThreshold(threshold=0.0)
selector.fit(data.loc[:, NUM_VAR].fillna(value=-1))

VarianceThreshold(threshold=0.0)

In [None]:
# Pair every numeric column with the selector's keep/drop decision.
# The selector was fit on data[NUM_VAR], so its support mask has exactly one
# entry per NUM_VAR element. The earlier version built the feature column from
# `data` after dropping 'target' and 'experiancustomerid_orig', which are not
# columns of this dataset (KeyError) and would not line up with the mask anyway.
variance_sel = pd.DataFrame({'feature': NUM_VAR,
                             'selected': selector.get_support()})
# Keep only the feature names the selector retained and report how many there are.
cols = variance_sel.loc[variance_sel['selected'], 'feature'].tolist()
len(cols)

In [16]:
# Boolean mask over the fitted columns: True where the selector keeps the column.
selector.get_support(indices=False)

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True])

In [19]:
# Numeric columns retained by the variance filter, in their original order.
# get_support() is evaluated once and paired with the names, instead of being
# recomputed for every index of NUM_VAR.
NUM_VAR_SELECTED = [
    name
    for name, keep in zip(NUM_VAR, selector.get_support())
    if keep
]
NUM_VAR_SELECTED

['MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'FLIGHT_NUMBER',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'TAXI_OUT',
 'WHEELS_OFF',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'WHEELS_ON',
 'TAXI_IN',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'AIR_SYSTEM_DELAY',
 'SECURITY_DELAY',
 'AIRLINE_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'WEATHER_DELAY',
 'LATITUDE_ORIGIN',
 'LONGITUDE_ORIGIN',
 'LATITUDE_DESTINATION',
 'LONGITUDE_DESTINATION',
 'TARGET',
 'AIRLINE_CAT',
 'ORIGIN_AIRPORT_CAT',
 'DESTINATION_AIRPORT_CAT',
 'HOLIDAY_FLAG',
 'WEEKEND',
 'DATE_TRAFFIC',
 'MONTH_TRAFFIC',
 'AIRLINE_TRAFFIC',
 'ORIGIN_AIRPORT_TRAFFIC',
 'DESTINATION_AIRPORT_TRAFFIC',
 'LONGITUDE_DIF',
 'LATITUDE_DIF']

In [18]:
# Show the full NUM_VAR list (the numeric columns before variance filtering).
NUM_VAR

['YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'FLIGHT_NUMBER',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'TAXI_OUT',
 'WHEELS_OFF',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'AIR_TIME',
 'DISTANCE',
 'WHEELS_ON',
 'TAXI_IN',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'AIR_SYSTEM_DELAY',
 'SECURITY_DELAY',
 'AIRLINE_DELAY',
 'LATE_AIRCRAFT_DELAY',
 'WEATHER_DELAY',
 'LATITUDE_ORIGIN',
 'LONGITUDE_ORIGIN',
 'LATITUDE_DESTINATION',
 'LONGITUDE_DESTINATION',
 'TARGET',
 'AIRLINE_CAT',
 'ORIGIN_AIRPORT_CAT',
 'DESTINATION_AIRPORT_CAT',
 'HOLIDAY_FLAG',
 'WEEKEND',
 'DATE_TRAFFIC',
 'MONTH_TRAFFIC',
 'AIRLINE_TRAFFIC',
 'ORIGIN_AIRPORT_INTL',
 'ORIGIN_AIRPORT_TRAFFIC',
 'DESTINATION_AIRPORT_INTL',
 'DESTINATION_AIRPORT_TRAFFIC',
 'LONGITUDE_DIF',
 'LATITUDE_DIF']