# Import

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42)

# csv to parquet
- 메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [3]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [4]:
csv_to_parquet('./dts/train.csv','train')
csv_to_parquet('./dts/test.csv','test')

train Done.
test Done.


# Data Load

In [5]:
train = pd.read_parquet('./train.parquet').drop('ID',axis=1)
test = pd.read_parquet('./test.parquet').drop('ID',axis=1)
sample_submission = pd.read_csv('./dts/sample_submission.csv', index_col=0)

# Data Pre-Processing
- 레이블(Delay)를 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다


In [6]:
train.head()

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int64  
 1   Day_of_Month              1000000 non-null  int64  
 2   Estimated_Departure_Time  890981 non-null   float64
 3   Estimated_Arrival_Time    890960 non-null   float64
 4   Cancelled                 1000000 non-null  int64  
 5   Diverted                  1000000 non-null  int64  
 6   Origin_Airport            1000000 non-null  object 
 7   Origin_Airport_ID         1000000 non-null  int64  
 8   Origin_State              890985 non-null   object 
 9   Destination_Airport       1000000 non-null  object 
 10  Destination_Airport_ID    1000000 non-null  int64  
 11  Destination_State         890921 non-null   object 
 12  Distance                  1000000 non-null  float64
 13  Airline                   89

In [9]:
train.describe(include='all')

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
count,1000000.0,1000000.0,890981.0,890960.0,1000000.0,1000000.0,1000000,1000000.0,890985,1000000,1000000.0,890921,1000000.0,891080,891010,891003.0,1000000,255001
unique,,,,,,,374,,52,375,,52,,28,11,,6430,2
top,,,,,,,ORD,,California,ORD,,California,,Southwest Airlines Co.,UA,,N483HA,Not_Delayed
freq,,,,,,,50436,,103482,50171,,104347,,182113,206211,,530,210001
mean,6.945156,15.764842,1341.153019,1493.295934,0.0,0.0,,12696.278484,,,12701.813986,,784.078499,,,19997.388093,,
std,3.462506,8.763515,489.814011,520.803494,0.0,0.0,,1514.938441,,,1515.213044,,590.790469,,,404.268639,,
min,1.0,1.0,1.0,1.0,0.0,0.0,,10135.0,,,10135.0,,16.0,,,19393.0,,
25%,4.0,8.0,925.0,1105.0,0.0,0.0,,11292.0,,,11292.0,,350.0,,,19790.0,,
50%,7.0,16.0,1332.0,1524.0,0.0,0.0,,12889.0,,,12889.0,,623.0,,,19977.0,,
75%,10.0,23.0,1742.0,1924.0,0.0,0.0,,14057.0,,,14057.0,,1020.0,,,20378.0,,


In [12]:
NaN_col = []

for i in range(len(train.columns)):
    if(sum(train[train.columns[i]].notna())<1000000):
        NaN_col.append(train.columns[i])
        
print(NaN_col)

['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Delay']


In [13]:
for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)

print('Done')

Done


- 질적 변수들을 수치화합니다

In [14]:
qual_col = []

for i in range(len(train.dtypes)):
    if (train.dtypes[i] == 'object'):
        qual_col.append(train.columns[i])
        
print(qual_col)
qual_col.remove('Delay')
print(qual_col)

['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number', 'Delay']
['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']


In [15]:
for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
            
    test[i] = le.transform(test[i])
    
print('Done.')

Done.


- 레이블이 없는 데이터들을 제거합니다

In [16]:
train = train.dropna()

column_number = {}
for i,column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [19]:
train_x = train.drop(columns = ['Delay','Delay_num'])
train_y = train['Delay_num']

# Classification Model Fit

In [20]:
clf = RandomForestClassifier()
clf.fit(train_x,train_y)

# Inference

In [22]:
y_pred = clf.predict_proba(test)

# Submit

In [23]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('baseline_submission.csv', index=True)

# XGBoost

In [33]:
from xgboost import XGBClassifier

model = XGBClassifier(booster='gbtree', max_depth=8, n_estimators=200, objective='binary:logistic' ,eval_metric='logloss')
model.fit(train_x, train_y)

In [34]:
y_pred = model.predict_proba(test)

submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('xgb_submission.csv', index=True)

In [35]:
from sklearn.metrics import log_loss

log_loss(train_y,model.predict(train_x))

1.5976709801260067