In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import math
import json
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns
# import missingno as msnco

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_df = pd.read_csv('sample_submission.csv')

In [3]:
#Cancelled와 Diverted 데이터는 모든 데이터가 0으로 존재 --> drop 
train_df.drop(['Cancelled','Diverted'], axis=1, inplace= True) 

#delay 1, 0 으로 변환
dic = {'Not_Delayed' : 0, 'Delayed' : 1}
train_df['Delay'] = train_df['Delay'].map(dic) # delay float타입으로 변경

In [4]:
df = train_df[['ID','Estimated_Departure_Time','Estimated_Arrival_Time','Origin_State','Destination_State','Distance','Delay']]

In [5]:
df = df[df['Estimated_Arrival_Time']>df['Estimated_Departure_Time']] #도착시간이 출발시간보다 큰 경우만

In [6]:
df.dropna(subset=['Origin_State', 'Destination_State'], inplace=True)

In [7]:
df.isna().sum()

ID                               0
Estimated_Departure_Time         0
Estimated_Arrival_Time           0
Origin_State                     0
Destination_State                0
Distance                         0
Delay                       453129
dtype: int64

In [8]:
#시차 계산 함수. time_delay 딕셔너리에 시차 업로드
def find_time_lag(train_df, state):
    OA = train_df[train_df['Origin_State'] == state]
    predict_st = OA.Destination_State.unique().tolist()
    DA = train_df[train_df['Destination_State'] == state]
    time_delay_dict = {}
    for st in predict_st:
        st_oa = OA[OA['Destination_State'] == st]
        st_oa_diff = st_oa.Estimated_Arrival_Time.mean() - st_oa.Estimated_Departure_Time.mean()
        st_da = DA[DA['Origin_State'] == st]
        st_da_diff = st_da.Estimated_Arrival_Time.mean() - st_da.Estimated_Departure_Time.mean()
        time_delay = st_da_diff - st_oa_diff
        if not np.isnan(time_delay):
            time_delay_dict[state, st] = -round(time_delay/2/60)
    
    return time_delay_dict

time_delay = dict()
for i in train_df['Origin_State'].unique().tolist():
    time_delay.update(find_time_lag(train_df, i))
    
print(time_delay)

{('Oklahoma', 'Texas'): 0, ('Oklahoma', 'Colorado'): -2, ('Oklahoma', 'Georgia'): 1, ('Oklahoma', 'Illinois'): 0, ('Oklahoma', 'New Jersey'): 1, ('Oklahoma', 'California'): -3, ('Oklahoma', 'Virginia'): 1, ('Oklahoma', 'Florida'): 1, ('Oklahoma', 'Missouri'): 0, ('Oklahoma', 'Nevada'): -1, ('Oklahoma', 'North Carolina'): 2, ('Oklahoma', 'Minnesota'): 0, ('Oklahoma', 'Arizona'): -2, ('Oklahoma', 'Utah'): -2, ('Oklahoma', 'Maryland'): 1, ('Oklahoma', 'Michigan'): 1, ('Oklahoma', 'Pennsylvania'): 1, ('Oklahoma', 'Washington'): -3, ('Illinois', 'Utah'): -1, ('Illinois', 'Missouri'): 0, ('Illinois', 'Kentucky'): 0, ('Illinois', 'Tennessee'): 0, ('Illinois', 'Ohio'): 0, ('Illinois', 'Minnesota'): 0, ('Illinois', 'West Virginia'): 2, ('Illinois', 'Michigan'): 1, ('Illinois', 'Oklahoma'): 0, ('Illinois', 'Washington'): 0, ('Illinois', 'New York'): 1, ('Illinois', 'Texas'): 0, ('Illinois', 'California'): 0, ('Illinois', 'Indiana'): 1, ('Illinois', 'Illinois'): 0, ('Illinois', 'Pennsylvania'): 0

In [18]:
time_delay

{('Oklahoma', 'Texas'): 0,
 ('Oklahoma', 'Colorado'): -2,
 ('Oklahoma', 'Georgia'): 1,
 ('Oklahoma', 'Illinois'): 0,
 ('Oklahoma', 'New Jersey'): 1,
 ('Oklahoma', 'California'): -3,
 ('Oklahoma', 'Virginia'): 1,
 ('Oklahoma', 'Florida'): 1,
 ('Oklahoma', 'Missouri'): 0,
 ('Oklahoma', 'Nevada'): -1,
 ('Oklahoma', 'North Carolina'): 2,
 ('Oklahoma', 'Minnesota'): 0,
 ('Oklahoma', 'Arizona'): -2,
 ('Oklahoma', 'Utah'): -2,
 ('Oklahoma', 'Maryland'): 1,
 ('Oklahoma', 'Michigan'): 1,
 ('Oklahoma', 'Pennsylvania'): 1,
 ('Oklahoma', 'Washington'): -3,
 ('Illinois', 'Utah'): -1,
 ('Illinois', 'Missouri'): 0,
 ('Illinois', 'Kentucky'): 0,
 ('Illinois', 'Tennessee'): 0,
 ('Illinois', 'Ohio'): 0,
 ('Illinois', 'Minnesota'): 0,
 ('Illinois', 'West Virginia'): 2,
 ('Illinois', 'Michigan'): 1,
 ('Illinois', 'Oklahoma'): 0,
 ('Illinois', 'Washington'): 0,
 ('Illinois', 'New York'): 1,
 ('Illinois', 'Texas'): 0,
 ('Illinois', 'California'): 0,
 ('Illinois', 'Indiana'): 1,
 ('Illinois', 'Illinois'): 0,

In [9]:
# time_delay를 통해 time_diff를 계산하기 위해  먼저 time_diff 컬럼 생성 
df['time_diff'] = np.nan


# 구한 시차를 df에 time_diff 컬럼에 넣어주기 
for i in range(len(df)):   
    if (df['Origin_State'].iloc[i],df['Destination_State'].iloc[i]) in time_delay.keys() :
        df['time_diff'].iloc[i] = time_delay[df['Origin_State'].iloc[i],df['Destination_State'].iloc[i]]
    else : 
        df['time_diff'].iloc[i] == 'time'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_diff'].iloc[i] = time_delay[df['Origin_State'].iloc[i],df['Destination_State'].iloc[i]]


In [10]:
df = df.dropna(subset=['time_diff'])
df['time_diff'].unique()

array([ -1.,   0.,  -2.,   1.,  -4.,   3.,   2.,   4.,  -3.,   6.,  -7.,
        -5., -10.,   5.,  15.,  10.,  18.,  13.,   9.,   7.,  -6.,   8.,
        11.,  16.,  -9.,  12., -15.,  -8.,  14.])

In [11]:
#시차 고려하지 않은 도착시간-출발시간
df['flight_time'] = (df['Estimated_Arrival_Time']-df['Estimated_Departure_Time']) 

In [12]:
#도착시간-출발시간 분으로 표현
df['flight_time'] = (df['flight_time']//100)*60 + df['flight_time']%100   

In [13]:
#시차 포함
df['flight_time'] = df['flight_time'] - df['time_diff']*60  

In [16]:
df[df['time_diff']==18]

Unnamed: 0,ID,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_State,Destination_State,Distance,Delay,time_diff,flight_time
3777,TRAIN_003777,1953.0,2302.0,New York,New Mexico,1826.0,0.0,18.0,-851.0
27690,TRAIN_027690,1955.0,2309.0,New York,New Mexico,1826.0,,18.0,-846.0
47752,TRAIN_047752,2000.0,2305.0,New York,New Mexico,1826.0,,18.0,-895.0
94901,TRAIN_094901,1943.0,2254.0,New York,New Mexico,1826.0,,18.0,-889.0
97441,TRAIN_097441,2000.0,2305.0,New York,New Mexico,1826.0,,18.0,-895.0
99205,TRAIN_099205,1942.0,2254.0,New York,New Mexico,1826.0,,18.0,-888.0
151171,TRAIN_151171,2000.0,2305.0,New York,New Mexico,1826.0,,18.0,-895.0
159232,TRAIN_159232,1953.0,2302.0,New York,New Mexico,1826.0,1.0,18.0,-851.0
231202,TRAIN_231202,2000.0,2300.0,New York,New Mexico,1826.0,,18.0,-900.0
249306,TRAIN_249306,2000.0,2307.0,New York,New Mexico,1826.0,0.0,18.0,-893.0


In [None]:
#우선 flight_time 양수인 부분만 ???
df = df[df['flight_time']>0]

In [None]:
#거리 1마일당 비행시간 
distance_time = (df['flight_time']/df['Distance']).mean()
distance_time

In [None]:
#train_df에 df 시차와 비행시간 컬럼 넣어주기 
train_df = pd.merge(train_df, df, how='outer')
train_df

비행시간 컬럼 추가 

In [None]:
#거리 별 비행시간 - 평균비행시간 컬럼 추가 
train_df['flight_time_avg'] = round(train_df['Distance']*distance_time)

In [None]:
#비행시간차이 = 비행시간- 평균비행시간
train_df['flight_time_diff'] = train_df['flight_time'] - train_df['flight_time_avg']

In [None]:
train_df

In [None]:
#여기서 부터 .항공사별 비행시간과 비행시간차이 비교
#출발공항 기준
train_df.groupby('Origin_Airport').mean()

In [None]:
train_df.groupby('Destination_Airport').mean()

먼저 공항별로 group 묶고 -> 평균비행시간 -> 평균비행시간과 비행시간 차이 


In [None]:
#도착예정시간, 출발예정시간 둘 중 하나만 나와있는 걸로 거리가 있으니 데이터 채우기 