In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
def seed_everything(seed):
    """Seed the global Python and NumPy random generators with `seed`.

    Also exports PYTHONHASHSEED=str(seed) into the process environment.
    The hash seed of the already-running interpreter is not changed by this;
    the variable is only visible to code that reads the environment later.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(42)  # fix the seed for the whole notebook

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert the CSV at `csv_path` into `./<save_name>.parquet`.

    The file is written relative to the current working directory. The
    intermediate frame is released and a garbage collection is forced before
    the completion message is printed.
    """
    # The frame is never bound to a name, so it becomes collectable as soon
    # as the write finishes.
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')

In [None]:
# Google Drive folder that holds the competition files.
file_dir = '/content/drive/MyDrive/데이콘/데이콘 항공편 지연/데이콘_항공편_지연'
# Convert both CSV files to parquet (written to the current working directory).
for split_name in ('train', 'test'):
    csv_to_parquet(f'{file_dir}/{split_name}.csv', split_name)

train Done.
test Done.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a frame is displayed.
pd.set_option('display.max_columns',None)
# csv_to_parquet() writes to './<name>.parquet', so read from the same relative
# location instead of the Colab-only absolute path '/content/...'. Both resolve
# to the same file on Colab, where the working directory is /content.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
# The first CSV column is used as the index.
sample_submission = pd.read_csv(file_dir+'/sample_submission.csv', index_col = 0)

# 전처리 변경

In [None]:

import datetime
def to_time(time_list):
    """Convert HHMM clock tokens into 'HH:MM' strings.

    Parameters
    ----------
    time_list : iterable
        Clock values such as '740' or '1610' (zero-padded to four digits
        here), or a missing value (NaN / None / the string 'nan').

    Returns
    -------
    list
        One entry per input: an 'HH:MM' string, or ``np.nan`` for a missing
        input. '2400' is treated as midnight and becomes '00:00'.

    Raises
    ------
    ValueError
        If a non-missing token cannot be parsed as ``%H%M``.
    """
    formatted = []
    for raw in time_list:
        token = str(raw).zfill(4)
        # Detect missing values directly instead of relying only on the
        # zero-padded 'nan' string; `np.nan` is used because the `np.NaN`
        # alias was removed in NumPy 2.0.
        if pd.isna(raw) or token == '0nan':
            formatted.append(np.nan)
            continue
        if token == '2400':
            # strptime only accepts hours 00-23.
            token = '0000'
        formatted.append(datetime.datetime.strptime(token, '%H%M').strftime("%H:%M"))
    return formatted

def _as_clock_token(value):
    """Return `value` unchanged when it is NaN, otherwise its integer part as a string."""
    return value if str(value) == 'nan' else str(int(value))


# Turn the float HHMM columns (e.g. 740.0) into tokens such as '740', then
# into 'HH:MM' strings; missing times stay NaN.
time_list = list(map(_as_clock_token, train['Estimated_Departure_Time']))
time_list1 = list(map(_as_clock_token, train['Estimated_Arrival_Time']))
train['Estimated_Departure_Time_HH:MM'] = to_time(time_list)
train['Estimated_Arrival_Time_HH:MM'] = to_time(time_list1)
train


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,,07:40,10:24
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,,16:10,18:05
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,,09:05,17:35
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,,09:00,10:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,PHL,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,,09:36,12:43
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,,09:20,10:28
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,,08:00,13:40
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,ATL,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,,16:13,18:24


In [None]:

# Build the estimated flight time in minutes from the 'HH:MM' columns.
MINUTES_PER_DAY = 24 * 60


def _minutes_since_midnight(hhmm):
    """Convert an 'HH:MM' string into the number of minutes since 00:00."""
    clock = datetime.datetime.strptime(str(hhmm), "%H:%M")
    return clock.hour * 60 + clock.minute


train_est_time = []
for arrival, departure in zip(train['Estimated_Arrival_Time_HH:MM'], train['Estimated_Departure_Time_HH:MM']):
    if pd.isna(arrival) or pd.isna(departure):
        # Either time is missing, so the duration is unknown.
        # (`np.nan`, because the `np.NaN` alias was removed in NumPy 2.0.)
        train_est_time.append(np.nan)
        continue
    # An arrival clock time earlier than the departure means the flight lands
    # after midnight; the modulo adds the 24 hours back. Only clock times are
    # compared (no time zones), exactly as before.
    elapsed = (_minutes_since_midnight(arrival) - _minutes_since_midnight(departure)) % MINUTES_PER_DAY
    train_est_time.append(float(elapsed))

In [None]:

# Attach the per-row flight durations (minutes; NaN where a time is missing).
train['Estimated_Time'] = train_est_time

In [None]:
def time_gb(x):
  """Map an HHMM clock value (e.g. 740 or 1610.0) to its time-of-day bucket label.

  Buckets are '0001-0559', one bucket per clock hour from '0600-0659' to
  '2200-2259', and '2300-2400'. Anything that falls in none of them (0, NaN,
  minute parts above 59 outside the two wide buckets, values above 2400)
  yields None.
  """
  # The early-morning and late-night buckets span more than one clock hour.
  if 1 <= x <= 559:
    return '0001-0559'
  if 2300 <= x <= 2400:
    return '2300-2400'
  # Hourly buckets: split HHMM into its hour and minute parts.
  hour, minute = divmod(x, 100)
  if 6 <= hour <= 22 and minute <= 59:
    hour = int(hour)
    return f'{hour:02d}00-{hour:02d}59'
  return None

# Bucket the estimated departure and arrival times into time-of-day groups.
# dropna() skips rows with a missing time; the assignment aligns on the index,
# so those rows end up with NaN in the new columns.
train['Dep_time_gb'] = train['Estimated_Departure_Time'].dropna().apply(time_gb)
train['Arr_time_gb'] = train['Estimated_Arrival_Time'].dropna().apply(time_gb)
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Time,Dep_time_gb,Arr_time_gb
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,,,,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,,07:40,10:24,164.0,0700-0759,1000-1059
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,,16:10,18:05,115.0,1600-1659,1800-1859
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,,09:05,17:35,510.0,0900-0959,1700-1759
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,,09:00,10:19,79.0,0900-0959,1000-1059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,PHL,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,,09:36,12:43,187.0,0900-0959,1200-1259
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,,09:20,10:28,68.0,0900-0959,1000-1059
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,,08:00,13:40,340.0,0800-0859,1300-1359
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,ATL,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,,16:13,18:24,131.0,1600-1659,1800-1859


In [None]:

def distance_gb(x):
  """Classify a flight distance into a route-length category.

  Returns 'short_route' below 700, 'mid_route' from 700 up to (but not
  including) 3000, and 'long_route' from 3000 upwards. A NaN distance
  satisfies none of the comparisons and yields None.
  """
  if x < 700:
    return 'short_route'
  if x < 3000:
    return 'mid_route'
  if x >= 3000:
    return 'long_route'
  return None


# Bucket each flight's distance into a route-length category (short / mid / long).
train['route_gb'] = train['Distance'].apply(distance_gb)
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Time,Dep_time_gb,Arr_time_gb,route_gb
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,,,,,short_route
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,,07:40,10:24,164.0,0700-0759,1000-1059,mid_route
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,,16:10,18:05,115.0,1600-1659,1800-1859,short_route
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,,09:05,17:35,510.0,0900-0959,1700-1759,mid_route
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,,09:00,10:19,79.0,0900-0959,1000-1059,short_route
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,PHL,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,,09:36,12:43,187.0,0900-0959,1200-1259,short_route
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,,09:20,10:28,68.0,0900-0959,1000-1059,short_route
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,,08:00,13:40,340.0,0800-0859,1300-1359,mid_route
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,ATL,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,,16:13,18:24,131.0,1600-1659,1800-1859,short_route


In [None]:
# Build airport -> state lookup tables from the rows whose state is present,
# so that rows with a missing Origin_State / Destination_State can inherit the
# state seen on other rows of the same airport.
origin_state = (
    train[['Origin_Airport','Origin_Airport_ID','Origin_State']]
    .dropna()
    .drop_duplicates()
)
origin_state.columns = ['Origin_Airport','Origin_Airport_ID','F_Origin_State']
Destination_state = (
    train[['Destination_Airport','Destination_Airport_ID','Destination_State']]
    .dropna()
    .drop_duplicates()
)
Destination_state.columns = ['Destination_Airport','Destination_Airport_ID','F_Destination_State']

# Left joins keep every row of `train` in its original order (the previous
# outer joins regrouped the rows by airport). validate='many_to_one' raises a
# MergeError if an airport is ever listed with more than one state, instead of
# silently duplicating flights.
train = pd.merge(train, origin_state, on = ['Origin_Airport','Origin_Airport_ID'], how = 'left', validate = 'many_to_one')
train = pd.merge(train, Destination_state, on = ['Destination_Airport','Destination_Airport_ID'], how = 'left', validate = 'many_to_one')
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Time,Dep_time_gb,Arr_time_gb,route_gb,F_Origin_State,F_Destination_State
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,,,,,short_route,Oklahoma,Texas
1,TRAIN_000827,7,16,550.0,715.0,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N733SA,,05:50,07:15,85.0,0001-0559,0700-0759,short_route,Oklahoma,Texas
2,TRAIN_001452,1,13,1405.0,1535.0,0,0,OKC,13851,Oklahoma,HOU,12191,,419.0,Southwest Airlines Co.,WN,19393.0,N204WN,,14:05,15:35,90.0,1400-1459,1500-1559,short_route,Oklahoma,Texas
3,TRAIN_004015,5,17,1050.0,1215.0,0,0,OKC,13851,Oklahoma,HOU,12191,,419.0,Southwest Airlines Co.,WN,19393.0,N246LV,,10:50,12:15,85.0,1000-1059,1200-1259,short_route,Oklahoma,Texas
4,TRAIN_009325,8,14,600.0,725.0,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7706A,,06:00,07:25,85.0,0600-0659,0700-0759,short_route,Oklahoma,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_896461,11,26,1310.0,,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,,UA,20304.0,N915SW,,13:10,,,1300-1359,,short_route,North Dakota,North Dakota
999996,TRAIN_908847,1,5,1320.0,1408.0,0,0,JMS,12519,,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N928EV,Not_Delayed,13:20,14:08,48.0,1300-1359,1400-1459,short_route,North Dakota,North Dakota
999997,TRAIN_911180,4,5,1315.0,1403.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N951SW,,13:15,14:03,48.0,1300-1359,1400-1459,short_route,North Dakota,North Dakota
999998,TRAIN_947096,11,11,1310.0,1355.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N701BR,,13:10,13:55,45.0,1300-1359,1300-1359,short_route,North Dakota,North Dakota


In [None]:

# Fill the destination state for airport YNG (ID 16133), but only on rows
# where F_Destination_State is still missing.
# NOTE(review): YNG is, as far as I know, Youngstown-Warren Regional Airport
# in Ohio, not Indiana - confirm the value below against an airport reference.
mask = (train['Destination_Airport'] == 'YNG')&(train['Destination_Airport_ID'] == 16133)
value = 'Indiana'

train.loc[mask,'F_Destination_State'] = train.loc[mask,'F_Destination_State'].fillna(value) 

In [None]:

import itertools


master_dil = '/content/drive/MyDrive/데이콘/데이콘 항공편 지연/데이콘_항공편_지연/ReleasableAircraft.2018/MASTER.txt'
# Read the comma-separated file line by line. The line terminator is stripped
# so it does not end up inside the last field (and the last column name).
# Every value is kept as a string.
with open(master_dil, 'r', encoding = "utf-8-sig") as file:
    master_rows = [line.rstrip('\n').split(',') for line in file]

# The first line holds the column names; only the remaining lines are data.
# (Previously the header line was also loaded as the first data row.)
col = master_rows[0]
# NOTE(review): if some rows carry more fields than the header, truncate them
# to len(col) before building the frame.
df = pd.DataFrame(master_rows[1:], columns = col)
# NOTE(review): 'Tail_Number' here is the raw N-NUMBER without an 'N' prefix,
# whereas the tail numbers shown in `train` start with 'N' - confirm this
# lookup is meant to match.
Airline2 = df[['N-NUMBER', 'NAME']].rename(columns={'N-NUMBER' : 'Tail_Number', 'NAME' : 'NAME2'})

In [None]:
# Prefix every registry number with 'N' to form the tail number.
df['Tail_Number'] = df['N-NUMBER'].map(lambda number: 'N' + number)

# Registry lookup: tail number -> NAME column of the registry file.
Airline = df[['Tail_Number', 'NAME']]


# Attach the registry name through both lookups (with and without the 'N'
# prefix); left joins keep the flights that have no match.
train_ar = train.merge(Airline, on = 'Tail_Number', how = 'left')
ar = train_ar.merge(Airline2, on = 'Tail_Number', how = 'left')

In [None]:
# Rows whose estimated arrival time is missing. isna() tests for NaN directly
# instead of converting the whole column to strings and comparing with 'nan'.
train[train['Estimated_Arrival_Time'].isna()]

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Time,Dep_time_gb,Arr_time_gb,route_gb,F_Origin_State,F_Destination_State
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,,,,,short_route,Oklahoma,Texas
9,TRAIN_066479,10,12,1830.0,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,,N7722B,Not_Delayed,18:30,,,1800-1859,,short_route,Oklahoma,Texas
16,TRAIN_103796,3,26,550.0,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N245WN,Not_Delayed,05:50,,,0001-0559,,short_route,Oklahoma,Texas
24,TRAIN_140341,3,31,530.0,,0,0,OKC,13851,Oklahoma,HOU,12191,,419.0,Southwest Airlines Co.,WN,19393.0,N954WN,,05:30,,,0001-0559,,short_route,Oklahoma,Texas
25,TRAIN_143031,3,19,1955.0,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,,WN,19393.0,N477WN,,19:55,,,1900-1959,,short_route,Oklahoma,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999949,TRAIN_424438,4,6,1315.0,,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,,UA,,N939SW,,13:15,,,1300-1359,,short_route,North Dakota,North Dakota
999958,TRAIN_525763,3,29,2312.0,,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N971SW,,23:12,,,2300-2400,,short_route,North Dakota,North Dakota
999965,TRAIN_594461,7,26,1410.0,,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N987SW,Not_Delayed,14:10,,,1400-1459,,short_route,North Dakota,North Dakota
999975,TRAIN_672880,2,21,1420.0,,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N928EV,,14:20,,,1400-1459,,short_route,North Dakota,North Dakota


In [None]:
# Fallback values for rows where both estimated times are null: the median
# estimated departure / arrival time per (origin state, destination state).
# Several columns must be selected from a groupby with a list; the tuple form
# used before triggers a FutureWarning and is rejected by pandas >= 2.0.
# NOTE(review): train.dropna() drops every row that has a NaN in ANY column,
# not only in the two time columns - confirm that this is intended.
# NOTE(review): the times are HHMM-coded numbers, so a median can fall on a
# value that is not a valid clock time (e.g. 1233.5).
median_time = (
    train.dropna()
    .groupby(['F_Origin_State','F_Destination_State'])[['Estimated_Departure_Time', 'Estimated_Arrival_Time']]
    .agg('median')
    .rename(columns={'Estimated_Departure_Time' : 'N_DP_Time', 'Estimated_Arrival_Time' : 'N_AR_Time'})
    .reset_index()
)
median_time

  median_time = train.dropna().groupby(['F_Origin_State','F_Destination_State'])['Estimated_Departure_Time', 'Estimated_Arrival_Time'].agg('median')


Unnamed: 0,F_Origin_State,F_Destination_State,N_DP_Time,N_AR_Time
0,Alabama,Colorado,801.0,1011.0
1,Alabama,Florida,1610.0,1840.0
2,Alabama,Georgia,1135.0,1341.0
3,Alabama,Illinois,1435.0,1646.0
4,Alabama,Maryland,700.0,955.0
...,...,...,...,...
1280,Wyoming,Colorado,1125.0,1250.0
1281,Wyoming,Illinois,1233.5,1640.5
1282,Wyoming,New Jersey,1430.0,2046.0
1283,Wyoming,Texas,1230.0,1617.0


In [None]:
# Work on a copy so the merges that follow do not modify `train` itself.
train_dum = train.copy()

In [None]:
# Lookup used to fill a missing estimated departure time: for every
# (origin state, destination state, estimated arrival time) combination, the
# median estimated departure time of the flights that do have one.
# The grouping uses exactly the columns the lookup is later merged on, so it
# holds one row per merge key and a left merge cannot duplicate flights.
# (Grouping by the departure time itself, as before, produced one row per
# distinct departure time and multiplied the rows of the merged frame.)
F_Estimated_Departure_Time = (
    train[train['Estimated_Departure_Time'].notna()]
    .groupby(['F_Origin_State','F_Destination_State','Estimated_Arrival_Time'])['Estimated_Departure_Time']
    .median()
    .rename('NT_DP_Time')
    .reset_index()
)
F_Estimated_Departure_Time

Unnamed: 0,F_Origin_State,F_Destination_State,Estimated_Arrival_Time,NT_DP_Time
0,Alabama,Colorado,807.0,600.0
1,Alabama,Colorado,823.0,605.0
2,Alabama,Colorado,835.0,605.0
3,Alabama,Colorado,832.0,610.0
4,Alabama,Colorado,830.0,615.0
...,...,...,...,...
352090,Wyoming,Utah,1940.0,1820.0
352091,Wyoming,Utah,1937.0,1835.0
352092,Wyoming,Utah,1945.0,1843.0
352093,Wyoming,Washington,1439.0,1310.0


In [None]:
# Lookup used to fill a missing estimated arrival time: for every
# (origin state, destination state, estimated departure time) combination, the
# median estimated arrival time of the flights that do have one.
# One row per merge key, so a left merge on these three columns keeps the row
# count unchanged. (Grouping by the arrival time itself, as before, produced
# one row per distinct arrival time and multiplied rows on merge.)
F_Estimated_Arrival_Time = (
    train[train['Estimated_Arrival_Time'].notna()]
    .groupby(['F_Origin_State','F_Destination_State','Estimated_Departure_Time'])['Estimated_Arrival_Time']
    .median()
    .rename('NT_AR_Time')
    .reset_index()
)
F_Estimated_Arrival_Time

Unnamed: 0,F_Origin_State,F_Destination_State,Estimated_Departure_Time,NT_AR_Time
0,Alabama,Colorado,600.0,807.0
1,Alabama,Colorado,605.0,823.0
2,Alabama,Colorado,605.0,835.0
3,Alabama,Colorado,610.0,832.0
4,Alabama,Colorado,615.0,830.0
...,...,...,...,...
352090,Wyoming,Utah,1820.0,1940.0
352091,Wyoming,Utah,1835.0,1937.0
352092,Wyoming,Utah,1843.0,1945.0
352093,Wyoming,Washington,1310.0,1439.0


In [None]:
# Attach the per-state-pair fallback medians (N_DP_Time / N_AR_Time) to every flight.
train_dum = train_dum.merge(median_time, on = ['F_Origin_State','F_Destination_State'], how = 'left')

In [None]:
# Attach the departure-time candidate (NT_DP_Time) matched on origin state,
# destination state and estimated arrival time.
# NOTE(review): this is a left join, so a flight is repeated once for every
# lookup row that shares its key; compare len(train_dum) before and after to
# make sure the row count did not grow.
train_dum = pd.merge(train_dum, F_Estimated_Departure_Time, on =['F_Origin_State','F_Destination_State','Estimated_Arrival_Time'], how = 'left')

In [None]:
# Only this merge runs out of RAM on Colab; it probably needs to be checked on a local machine.
#train_dum = pd.merge(train_dum, F_Estimated_Arrival_Time, left_on =['F_Origin_State','F_Destination_State','Estimated_Departure_Time'], right_on =['F_Origin_State','F_Destination_State','Estimated_Departure_Time'], how = 'left')

In [None]:
# Inspect the merged frame, including the added N_DP_Time, N_AR_Time and NT_DP_Time columns.
train_dum

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Estimated_Departure_Time_HH:MM,Estimated_Arrival_Time_HH:MM,Estimated_Time,Dep_time_gb,Arr_time_gb,route_gb,F_Origin_State,F_Destination_State,N_DP_Time,N_AR_Time,NT_DP_Time
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,,,,,,,short_route,Oklahoma,Texas,1200.0,1327.5,
1,TRAIN_000827,7,16,550.0,715.0,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N733SA,,05:50,07:15,85.0,0001-0559,0700-0759,short_route,Oklahoma,Texas,1200.0,1327.5,545.0
2,TRAIN_000827,7,16,550.0,715.0,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N733SA,,05:50,07:15,85.0,0001-0559,0700-0759,short_route,Oklahoma,Texas,1200.0,1327.5,550.0
3,TRAIN_001452,1,13,1405.0,1535.0,0,0,OKC,13851,Oklahoma,HOU,12191,,419.0,Southwest Airlines Co.,WN,19393.0,N204WN,,14:05,15:35,90.0,1400-1459,1500-1559,short_route,Oklahoma,Texas,1200.0,1327.5,1401.0
4,TRAIN_001452,1,13,1405.0,1535.0,0,0,OKC,13851,Oklahoma,HOU,12191,,419.0,Southwest Airlines Co.,WN,19393.0,N204WN,,14:05,15:35,90.0,1400-1459,1500-1559,short_route,Oklahoma,Texas,1200.0,1327.5,1404.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2926514,TRAIN_994361,10,9,2315.0,2359.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N920SW,,23:15,23:59,44.0,2300-2400,2300-2400,short_route,North Dakota,North Dakota,1362.5,1404.5,2311.0
2926515,TRAIN_994361,10,9,2315.0,2359.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N920SW,,23:15,23:59,44.0,2300-2400,2300-2400,short_route,North Dakota,North Dakota,1362.5,1404.5,2312.0
2926516,TRAIN_994361,10,9,2315.0,2359.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N920SW,,23:15,23:59,44.0,2300-2400,2300-2400,short_route,North Dakota,North Dakota,1362.5,1404.5,2314.0
2926517,TRAIN_994361,10,9,2315.0,2359.0,0,0,JMS,12519,North Dakota,DVL,11447,North Dakota,83.0,SkyWest Airlines Inc.,UA,20304.0,N920SW,,23:15,23:59,44.0,2300-2400,2300-2400,short_route,North Dakota,North Dakota,1362.5,1404.5,2315.0


In [None]:
# Imputed estimated departure time:
#   - both times missing     -> median departure time of the state pair (N_DP_Time)
#   - only departure missing -> departure time matched via the arrival time (NT_DP_Time)
#   - otherwise              -> the original value
# isna() builds each mask once, instead of string-casting the columns several times.
dep_missing = train_dum['Estimated_Departure_Time'].isna()
arr_missing = train_dum['Estimated_Arrival_Time'].isna()
train_dum['N_EST_DT'] = np.where(dep_missing & arr_missing, train_dum['N_DP_Time'],
                        np.where(dep_missing, train_dum['NT_DP_Time'], train_dum['Estimated_Departure_Time']))

In [None]:
# Same idea for the estimated arrival time.
# NT_AR_Time only exists if the arrival lookup has been merged into train_dum.
# When it has not, attach it here so the column access cannot raise KeyError.
# The lookup is first reduced to one row per merge key (median of its NT_AR_Time
# values), so this left merge leaves the row count of train_dum unchanged.
arrival_keys = ['F_Origin_State','F_Destination_State','Estimated_Departure_Time']
if 'NT_AR_Time' not in train_dum.columns:
    arrival_lookup = F_Estimated_Arrival_Time.groupby(arrival_keys, as_index = False)['NT_AR_Time'].median()
    train_dum = pd.merge(train_dum, arrival_lookup, on = arrival_keys, how = 'left')

# Imputed estimated arrival time:
#   - both times missing   -> median arrival time of the state pair (N_AR_Time)
#   - only arrival missing -> arrival time matched via the departure time (NT_AR_Time)
#   - otherwise            -> the original value
dep_time_missing = train_dum['Estimated_Departure_Time'].isna()
arr_time_missing = train_dum['Estimated_Arrival_Time'].isna()
train_dum['N_EST_AT'] = np.where(dep_time_missing & arr_time_missing, train_dum['N_AR_Time'],
                        np.where(arr_time_missing, train_dum['NT_AR_Time'], train_dum['Estimated_Arrival_Time']))