In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Drive folder the notebook reads train.csv, test.csv and sample_submission.csv from
colab_path = '/content/drive/MyDrive/ColabNotebooks/airplane'

In [4]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt


In [7]:
def seed_everything(seed):
    """Seed every random source this notebook relies on.

    Sets the PYTHONHASHSEED environment variable and reseeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to all of the above.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [8]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file to ``./<save_name>.parquet`` in the working directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (without extension) of the parquet file to write.
    """
    # The frame is only a temporary, so it is released as soon as the write
    # finishes; the explicit collection then reclaims the memory right away.
    pd.read_csv(csv_path).to_parquet(f'./{save_name}.parquet')
    gc.collect()
    print(save_name, 'Done.')

In [9]:
# Convert both competition CSVs to parquet files in the working directory
for split_name in ('train', 'test'):
    csv_to_parquet(colab_path + f'/{split_name}.csv', split_name)

train Done.
test Done.


In [111]:
# Load the parquet copies written above, plus the submission template
# (its first column becomes the index).
train, test = (pd.read_parquet(f'./{split_name}.parquet') for split_name in ('train', 'test'))
sample_submission = pd.read_csv(colab_path + '/sample_submission.csv', index_col=0)

## **결측값 처리**

In [112]:
# Remove the columns that carry no useful information
unused_cols = ['Cancelled', 'Diverted']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

In [113]:
# Impute missing values in every feature column that has them (the Delay label
# is deliberately left alone) with that column's most frequent value.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)

    # The test-side mode must be computed inside the guard: looking the column
    # up before checking that it exists would raise KeyError and make the
    # guard pointless.
    if col in test.columns:
        # NOTE(review): the test frame is imputed with its own mode rather
        # than the training mode - kept as in the original; confirm intent.
        mode1 = test[col].mode()[0]
        test[col] = test[col].fillna(mode1)
print('Done.')

Done.


In [114]:
# Turn the categorical columns into integer codes with a LabelEncoder that is
# fitted on the training frame.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])

    # Categories that appear only in the test frame are appended after the
    # fitted classes so that transform() does not reject them. A set lookup
    # and a single append replace the per-label array scan and per-label
    # reallocation; the resulting classes_ order is unchanged (sorted order of
    # np.unique).
    known = set(le.classes_)
    unseen = [label for label in np.unique(test[i]) if label not in known]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[i] = le.transform(test[i])
print('Done.')

Done.


In [170]:
# Split the training frame into rows that have a Delay label and rows that do not.
# Both results get new columns assigned later in the notebook, so take explicit
# copies: assigning into a slice of `train` triggers SettingWithCopyWarning.
labeled_data = train.dropna(subset=['Delay']).copy()
unlabeled_data = train[train['Delay'].isnull()].copy()

In [171]:
# Encode the label as 0 (Not_Delayed) / 1 (Delayed).
# assign() builds a new frame instead of writing into what may be a slice of
# `train`, and the explicit cast avoids relying on replace() to infer an
# integer dtype for the result. Re-running the cell is harmless: values that
# are already 0/1 pass through replace() unchanged.
labeled_data = labeled_data.assign(
    Delay=labeled_data['Delay'].replace({'Not_Delayed': 0, 'Delayed': 1}).astype('int64')
)


In [172]:
# Fit the initial model on the rows that already carry a label;
# every column except the identifier and the label is used as a feature.
X_train = labeled_data.drop(['ID', 'Delay'], axis=1)
y_train = labeled_data['Delay']
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [None]:
# Self-training (pseudo-labelling): let the current model label the rows that
# have no Delay value, keep only the confident predictions, add them to the
# training pool and refit - until nothing is left or nothing is confident.
threshold = 0.8  # minimum predicted class probability for a pseudo-label to be accepted

# Training pool. It grows by the accepted pseudo-labels every round so that
# labels gained in earlier rounds are not lost; labeled_data itself is untouched.
labeled_and_predicted_data = labeled_data

while len(unlabeled_data) > 0:
    # Predict the rows that still have no label
    X_unlabeled = unlabeled_data.drop(columns=['ID', 'Delay'])
    y_prob = model.predict_proba(X_unlabeled)
    # predict_proba columns follow model.classes_, so this is the most probable class per row
    y_pred = model.classes_[np.argmax(y_prob, axis=1)]

    confident = y_prob.max(axis=1) >= threshold
    if not confident.any():
        # No prediction reached the threshold. Refitting on an unchanged pool
        # is not guaranteed to ever shrink unlabeled_data, so stop here
        # instead of looping forever.
        break

    # Split into "newly labelled" and "still unlabeled" from the same mask.
    # Filtering unlabeled_data down to the NaN rows first and selecting the
    # non-NaN rows afterwards would always leave the labelled set empty.
    unlabeled_data_labeled = unlabeled_data[confident].assign(Delay=y_pred[confident])
    unlabeled_data = unlabeled_data[~confident]

    # Merge the new pseudo-labels into the training pool
    labeled_and_predicted_data = pd.concat(
        [unlabeled_data_labeled, labeled_and_predicted_data], ignore_index=True
    )

    # Refit on the enlarged pool
    model = RandomForestClassifier()
    X_train = labeled_and_predicted_data.drop(columns=['ID', 'Delay'])
    y_train = labeled_and_predicted_data['Delay']
    model.fit(X_train, y_train)

In [137]:
# Feature matrix for the test set: every column except the row identifier
test_x = test.drop('ID', axis=1)

In [143]:
# Per-class probabilities for the test rows, taken from whatever `model` currently holds
y_pred = model.predict_proba(test_x)

In [144]:
# Wrap the predicted probabilities in a frame that reuses the submission
# template's row index and column names.
# NOTE(review): assumes the template's column order matches the order of the
# predict_proba columns - confirm against sample_submission.csv.
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)

In [145]:
# Write the submission file; the index is written as the first column (to_csv default)
submission.to_csv('22_submission.csv')
