In [1]:
import random
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from catboost import CatBoostClassifier, Pool
from collections import defaultdict


def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(42) # Fixed seed (42) for Python's random, NumPy's global RNG and PYTHONHASHSEED

### Data load

In [3]:
# Load the version-9 preprocessed train/test frames and the submission template.
# NOTE(review): the next cell reloads `train`, `test` and `sample_submission`
# from the preprocess_10 files, so everything read here is overwritten on a
# top-to-bottom run - confirm whether this cell is still needed.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('./data/train_preprocess_9.parquet',
                         './data/test_preprocess_9.parquet')
)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)



In [2]:
# Load the version-10 preprocessed train/test frames.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('./data/train_preprocess_10.parquet',
                         './data/test_preprocess_10.parquet')
)

# The submission template: its first CSV column becomes the index.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)



### Distance 및 모든 값 object 처리

In [3]:
# Step 1: force Distance to int in both frames.
# NOTE(review): presumably this normalises the stored representation before the
# object conversion below - confirm against the parquet dtypes.
distance_as_int = {'Distance': int}
train = train.astype(distance_as_int)
test = test.astype(distance_as_int)

print("distance Done.")

# Step 2: same integer cast for the carrier identifier.
carrier_as_int = {'Carrier_ID(DOT)': int}
train = train.astype(carrier_as_int)
test = test.astype(carrier_as_int)

# Step 3: store the listed feature columns as generic Python objects in both
# frames, using one shared column -> dtype mapping.
object_dtypes = dict.fromkeys(
    ['EDT', 'EAT', 'Distance', 'Origin_Airport_ID',
     'Destination_Airport_ID', 'Carrier_ID(DOT)', 'Day'],
    object,
)
train = train.astype(object_dtypes)
test = test.astype(object_dtypes)

print("CID Done.")

distance Done.
CID Done.


In [4]:
# Print the column dtypes and non-null counts of the training frame after the casts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 976567 entries, 1 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ID                      976567 non-null  object
 1   Origin_Airport_ID       976567 non-null  object
 2   Destination_Airport_ID  976567 non-null  object
 3   Distance                976567 non-null  object
 4   Carrier_ID(DOT)         976567 non-null  object
 5   Tail_Number             976567 non-null  object
 6   Delay                   249021 non-null  object
 7   Day                     976567 non-null  object
 8   EDT                     976567 non-null  object
 9   EAT                     976567 non-null  object
dtypes: object(10)
memory usage: 82.0+ MB


In [5]:
# Keep only the training rows that have no missing value in any column.
train = train.dropna()

# Map every submission column name to its position; that position is later
# used as the numeric class label of the target.
column_number = {
    name: position
    for position, name in enumerate(sample_submission.columns)
}
    
def to_number(x, dic):
    """Look up ``x`` in ``dic`` and return the stored value.

    Args:
        x: Key to look up.
        dic: Container that is indexed with ``x``.

    Returns:
        The value stored under ``x``.

    Raises:
        KeyError: If ``dic`` is a dict without an entry for ``x``.
    """
    mapped = dic[x]
    return mapped

# Encode the target: each 'Delay' label becomes its position among the
# sample-submission columns.
train.loc[:, 'Delay_num'] = train['Delay'].apply(
    lambda label: to_number(label, column_number)
)

# Features exclude the row ID and both representations of the target.
non_feature_columns = ['ID', 'Delay', 'Delay_num']
train_x = train.drop(columns=non_feature_columns)
train_y = train['Delay_num']

# The test frame only loses its ID column.
test_x = test.drop(columns=['ID'])

print('Training Prepared.')

Training Prepared.


In [6]:
# Class weights for the binary target: each class is weighted by the share of
# the *other* class, so the rarer class gets the larger weight.
#
# value_counts() orders its result by frequency, not by label. The weight list
# handed to CatBoost is positional by class label, so the counts are re-ordered
# by label first; otherwise the weights would silently flip whenever label 1
# happened to be the more frequent class.
label_counts = train_y.value_counts().sort_index()
if list(label_counts.index) != [0, 1]:
    raise ValueError(
        "class weights expect exactly the labels 0 and 1, got: "
        + str(list(label_counts.index))
    )

# counts[k] is now the number of rows with label k (plain Python ints).
counts = label_counts.tolist()
class_weight = [counts[1]/sum(counts), counts[0]/sum(counts)]
print("weight :", class_weight)

weight : [0.1766316896968529, 0.8233683103031472]


In [7]:
# Positional indices 0..7 are declared categorical.
# NOTE(review): train_x is expected to hold exactly 8 feature columns (the 10
# columns listed by train.info() minus ID and Delay) - confirm if the
# preprocessing changes.
cat_features = list(range(8))

model = CatBoostClassifier(
    cat_features=cat_features,
    class_weights=class_weight,
    random_seed=42,
    verbose=0,
)
model.fit(train_x, train_y)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


<catboost.core.CatBoostClassifier at 0x2a0e38e16c0>

### 예측

In [8]:
# Predict class probabilities for every test row.
# NOTE(review): the probability columns presumably follow the numeric label
# order (i.e. the sample_submission column order) - confirm via model.classes_.
y_pred = model.predict_proba(test_x)

### 제출

In [9]:
# Wrap the predicted probabilities in a frame that reuses the sample
# submission's column names and row index.
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

# to_csv does not create missing parent directories; make sure the output
# folder exists so the write cannot fail after training has already finished.
os.makedirs('./submission', exist_ok=True)
submission.to_csv('./submission/submission_catboost_1st.csv', index=True)