# Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import SVC

# Data Load

In [2]:
train = pd.read_parquet('./data/train.parquet')
test = pd.read_parquet('./data/test.parquet')
sample_submission = pd.read_csv('./data/sample_submission.csv', index_col = 0)

# Data Pre-Processing

In [3]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [4]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [5]:
#레이블이 없는 데이터들을 제거합니다
train_unlabeled = train[train['Delay'].isna()]
train_labeled = train.dropna()

In [6]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train_labeled.loc[:, 'Delay_num'] = train_labeled['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_labeled.loc[:, 'Delay_num'] = train_labeled['Delay'].apply(lambda x: to_number(x, column_number))


In [7]:
X_labeled = train_labeled.drop(columns=['ID', 'Delay', 'Delay_num'])
X_unlabeled = train_unlabeled.drop(columns=['ID', 'Delay'])
y_labeled = train_labeled['Delay_num']
X_test = test.drop(columns=['ID'])

# Modeling

In [8]:
# 두 개의 뷰(view) 생성
X_view1 = X_labeled.iloc[:, :8]
X_view2 = X_labeled.iloc[:, 8:]

In [9]:
# Co-training 알고리즘 적용
n_iter = 10  # Co-training 반복 횟수
clf1 = DecisionTreeClassifier(random_state=123)
clf2 = DecisionTreeClassifier(random_state=123)

In [10]:
for i in range(n_iter):
    # 각각의 뷰에서 모델 학습
    clf1.fit(X_view1, y_labeled)
    clf2.fit(X_view2, y_labeled)
    print('fit Done')

    # 각 모델의 예측 결과를 사용하여 레이블 예측
    y_unlabeled_pred1 = clf1.predict(X_unlabeled.iloc[:, :8])
    y_unlabeled_pred2 = clf2.predict(X_unlabeled.iloc[:, 8:])
    print('predict Done')

    # 예측 결과가 일치하는 데이터를 레이블이 지정된 데이터로 추가
    idx_agree = np.where(y_unlabeled_pred1 == y_unlabeled_pred2)[0]
    print(f'{len(idx_agree)} / {len(y_unlabeled_pred1)}')
    if len(idx_agree) > 0:
        X_labeled = np.concatenate([X_labeled, X_unlabeled.iloc[idx_agree]], axis=0)
        y_labeled = np.concatenate([y_labeled, y_unlabeled_pred1[idx_agree]], axis=0)

        X_view1 = np.concatenate([X_view1, X_unlabeled.iloc[idx_agree, :8]], axis=0)
        X_view2 = np.concatenate([X_view2, X_unlabeled.iloc[idx_agree, 8:]], axis=0)

    # 레이블이 지정된 데이터셋에서 모델 성능 평가
    # clf1.fit(X_labeled.iloc[:, :8], y_labeled)
    # y_pred1 = clf1.predict(X.iloc[:, :8])
    # acc1 = accuracy_score(y, y_pred1)

    # clf2.fit(X_labeled[:, 5:], y_labeled)
    # y_pred2 = clf2.predict(X[:, 5:])
    # acc2 = accuracy_score(y, y_pred2)

    # print(f"Iteration {i+1}: View 1 Accuracy = {acc1:.2f}, View 2 Accuracy = {acc2:.2f}")

# 최종 모델 학습
# clf1.fit(X_labeled[:, :5], y_labeled)
# y_pred = clf1.predict(X[:, :5])
# acc = accuracy_score(y, y_pred)
# print(f"Final Accuracy = {acc:.2f}")

fit Done
predict Done
513897 / 744999
fit Done




predict Done
643142 / 744999
fit Done




predict Done
684228 / 744999
fit Done




predict Done
699814 / 744999
fit Done




predict Done
708156 / 744999
fit Done




predict Done
712915 / 744999
fit Done




predict Done
716257 / 744999
fit Done




predict Done
718533 / 744999
fit Done




predict Done
720361 / 744999
fit Done




predict Done
721992 / 744999


In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf = RandomForestClassifier()
clf.fit(X_labeled, y_labeled)

In [16]:
y_pred = clf.predict_proba(test.drop(columns=['ID']))
y_pred



array([[0.9 , 0.1 ],
       [0.79, 0.21],
       [0.89, 0.11],
       ...,
       [0.95, 0.05],
       [0.98, 0.02],
       [0.96, 0.04]])

In [17]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('co_training_submission.csv', index=True)