In [1]:
# Mount Google Drive at /content/drive so files stored on the drive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted drive that holds the input CSV files (train.csv / test.csv are read from here).
colab_path = '/content/drive/MyDrive/ColabNotebooks/airplane'

In [4]:
import pandas as pd
import numpy as np
import random
import os
import gc
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [5]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Integer seed; it is also written to the PYTHONHASHSEED environment
            variable as a string.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [6]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current working directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name of the output; the file is written to
            ``./<save_name>.parquet``.
    """
    # The frame is never bound to a name, so it becomes collectable as soon as
    # the write finishes; gc.collect() then reclaims the memory immediately.
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')

In [7]:
# Convert both competition splits from CSV (on the drive) to local Parquet files.
for split_name in ('train', 'test'):
    csv_to_parquet(os.path.join(colab_path, split_name + '.csv'), split_name)

train Done.
test Done.


In [23]:
# Load the train/test splits from the Parquet files in the working directory.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
# NOTE(review): unlike the train/test CSVs (read from colab_path), the sample submission is
# read relative to the current working directory — confirm the file is actually placed there.
# Its first column is used as the index.
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [41]:
# Inspect the column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              890985 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         890921 non-null   object 
 13  Distance                  10

In [42]:
# Count the missing values in each column of the training frame.
train.isnull().sum()


ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                109015
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [24]:
# Impute every feature that has missing values (the 'Delay' label is excluded)
# with the most frequent value observed in the training data.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# Modes are taken from the training frame only, so the test frame is filled
# with training statistics rather than its own.
train_modes = {name: train[name].mode()[0] for name in NaN_col}

for frame in (train, test):
    for name, fill_value in train_modes.items():
        # A column missing from a frame is skipped instead of raising.
        if name in frame.columns:
            frame[name] = frame[name].fillna(fill_value)
print('Done.')

Done.


In [25]:
# Integer-encode the qualitative (categorical) columns.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    # The encoder is fitted on the training values only; the training column is
    # replaced by its integer codes.
    le = LabelEncoder()
    train[i] = le.fit_transform(train[i])

    # Values that occur only in the test column are appended after the fitted
    # classes (in sorted order), so they receive codes beyond the training range.
    # A single set difference replaces the per-label membership scan and the
    # repeated array re-allocation of appending one label at a time.
    unseen = np.setdiff1d(np.unique(test[i]), le.classes_)
    if unseen.size:
        le.classes_ = np.append(le.classes_, unseen)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [26]:
# Remove the row identifier from both frames. 'Delay' is kept in train_x because
# it is still needed to separate labelled from unlabelled rows.
id_columns = ['ID']
train_x = train.drop(columns=id_columns)
test_x = test.drop(columns=id_columns)

In [27]:
# 레이블이 있는 데이터와 없는 데이터로 분리
labeled_data = train_x[train_x['Delay'].notna()]
unlabeled_data = train_x[train_x['Delay'].isna()]

In [28]:
# 레이블이 있는 데이터로 모델 학습
X_train, X_test, y_train, y_test = train_test_split(
    labeled_data.drop(columns=['Delay']),
    labeled_data['Delay'],
    test_size=0.3,
    random_state=42
)
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [29]:
# 레이블이 없는 데이터 예측
unlabeled_data['Delay'] = model.predict(unlabeled_data.drop(columns=['Delay']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['Delay'] = model.predict(unlabeled_data.drop(columns=['Delay']))


In [30]:
# Stack the truly labelled rows on top of the pseudo-labelled rows (row-wise concat).
merged_train = pd.concat([labeled_data, unlabeled_data])


In [34]:
# Encode the string labels as integers: Not_Delayed -> 0, Delayed -> 1.
# Values that are not keys of the mapping (e.g. 0/1 from a previous run of this
# cell) pass through unchanged, so the cell is safe to re-run. The explicit cast
# guarantees an integer label column instead of relying on pandas inferring a
# numeric dtype after the substitution; any unexpected label fails loudly here.
delay_codes = {'Not_Delayed': 0, 'Delayed': 1}
merged_train['Delay'] = (
    merged_train['Delay']
    .map(lambda label: delay_codes.get(label, label))
    .astype('int64')
)

In [36]:
# Count the missing values per column of the merged training frame.
merged_train.isnull().sum()


Month                       0
Day_of_Month                0
Estimated_Departure_Time    0
Estimated_Arrival_Time      0
Cancelled                   0
Diverted                    0
Origin_Airport              0
Origin_Airport_ID           0
Origin_State                0
Destination_Airport         0
Destination_Airport_ID      0
Destination_State           0
Distance                    0
Airline                     0
Carrier_Code(IATA)          0
Carrier_ID(DOT)             0
Tail_Number                 0
Delay                       0
dtype: int64

In [37]:
# Final model inputs: features and target from the merged training frame, and
# the test features without the row identifier.
target_column = 'Delay'
train_y = merged_train[target_column]
train_x = merged_train.drop(columns=[target_column])
test_x = test.drop(columns=['ID'])

In [43]:
# Gradient-boosted tree classifier used for the final predictions.
# n_jobs / random_state are the scikit-learn-API parameter names; they replace
# the legacy `nthread` / `seed` aliases and keep the same values.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,          # each tree is grown on 80% of the rows
    colsample_bytree=0.8,   # each tree is grown on 80% of the columns
    objective='binary:logistic',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27
)

In [None]:
# 모델 학습
model.fit(train_x, train_y)

In [40]:
# Predict class probabilities (not hard labels) for the test features.
y_pred = model.predict_proba(test_x)


In [41]:
# Wrap the predicted probabilities in a frame that reuses the sample submission's
# column names and index.
# NOTE(review): assumes sample_submission's columns are ordered like the encoded
# classes (0 = Not_Delayed, 1 = Delayed) and that its rows align with test — confirm.
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)


In [42]:
# Write the submission to the working directory; index=True writes the index as the first column.
submission.to_csv('XGB2_submission.csv', index=True)