In [2]:
# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
colab_path = '/content/drive/MyDrive/ColabNotebooks/airplane'

In [45]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [7]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [8]:
csv_to_parquet(colab_path + '/train.csv', 'train')
csv_to_parquet(colab_path + '/test.csv', 'test')

train Done.
test Done.


In [51]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv(colab_path + '/sample_submission.csv', index_col = 0)

## **결측값 처리**

In [52]:
train = train.drop(['Cancelled','Diverted'], axis=1) #의미없는 열 제거
test = test.drop(['Cancelled','Diverted'], axis=1) #의미없는 열 제거

In [53]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    mode1 = test[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode1)
print('Done.')

Done.


In [54]:
#질적 변수들을 라벨인코더를 활용해 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [94]:
# 레이블이 있는 데이터와 레이블이 없는 데이터로 분리
labeled_data = train.dropna(subset=['Delay'])
unlabeled_data = train[train['Delay'].isnull()]

In [96]:
# Self-Training 반복 횟수
num_iterations = 5

In [69]:
# 초기 모델 학습
model = RandomForestClassifier()
X_train = labeled_data.drop(columns=['ID','Delay'])
y_train = labeled_data['Delay']
model.fit(X_train, y_train)

In [168]:
new_labels_series = pd.Series(new_labels)
new_labels_series.notnull()

AttributeError: ignored

In [None]:
for i in range(num_iterations):
    # 레이블이 없는 데이터 예측
    X_unlabeled = unlabeled_data.drop(columns=['ID','Delay'])
    y_pred = model.predict(X_unlabeled)
    y_prob = model.predict_proba(X_unlabeled)

    # 모델이 확신을 가지고 있는 새로운 레이블 추가
    threshold = 0.8
    new_labels = []
    for j in range(len(y_prob)):
        max_prob = np.max(y_prob[j])
        if max_prob >= threshold:
            new_labels.append(y_pred[j])
        else:
            new_labels.append(np.nan)
            new_labels_series = pd.Series(new_labels)
    new_labeled_data = unlabeled_data.loc[pd.notna(new_labels_series)].copy()


In [None]:
X_labeled = new_labeled_data.drop(columns=['ID','Delay'])
y_labeled = new_labeled_data['Delay']
model.fit(X_labeled, y_labeled)


In [None]:
# 초기 모델 학습
model = RandomForestClassifier()
X_train = labeled_data.drop('Delay', axis=1)
y_train = labeled_data['Delay']
model.fit(X_train, y_train)

In [42]:
# 데이터 전처리 - 결측값 처리
train['Delay'] = train['Delay'].replace({'Not_Delayed': 0, 'Delayed': 1})
train_x = train.drop(columns=['ID','Delay'])
test_x = test.drop(columns=['ID'])
y = train['Delay']
train_x.isnull().sum()


Month                       0
Day_of_Month                0
Estimated_Departure_Time    0
Estimated_Arrival_Time      0
Origin_Airport              0
Origin_Airport_ID           0
Origin_State                0
Destination_Airport         0
Destination_Airport_ID      0
Destination_State           0
Distance                    0
Airline                     0
Carrier_Code(IATA)          0
Carrier_ID(DOT)             0
Tail_Number                 0
dtype: int64

In [29]:
test_x.isnull().sum()

Month                       0
Day_of_Month                0
Estimated_Departure_Time    0
Estimated_Arrival_Time      0
Cancelled                   0
Diverted                    0
Origin_Airport              0
Origin_Airport_ID           0
Origin_State                0
Destination_Airport         0
Destination_Airport_ID      0
Destination_State           0
Distance                    0
Airline                     0
Carrier_Code(IATA)          0
Carrier_ID(DOT)             0
Tail_Number                 0
dtype: int64

In [14]:
train_x['Delay'].value_counts()

0    955000
1     45000
Name: Delay, dtype: int64

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_x, y, test_size=0.2, random_state=42)

In [46]:
# Normalize numerical features
scaler = StandardScaler()
train_x = scaler.fit_transform(X_train)
val_x = scaler.transform(X_test)
test_x = scaler.transform(test_x)


In [32]:
import xgboost as xgb

# 모델 학습
model = xgb.XGBClassifier()
model.fit(X_train, y_train)


In [33]:
# 예측
preds = model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 정확도 평가
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
roc_auc = roc_auc_score(y_test, preds)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"ROC AUC: {roc_auc}")

Accuracy: 0.9545333333333333
Precision: 0.0
Recall: 0.0
F1-score: 0.0
ROC AUC: 0.49999476189312164


In [35]:
# 모델 예측
test_preds = model.predict_proba(test_x)

In [36]:
submission = pd.DataFrame(data=test_preds, columns=sample_submission.columns, index=sample_submission.index)

In [37]:
submission.to_csv('21_submission.csv', index=True)