In [1]:
# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
colab_path = '/content/drive/MyDrive/ColabNotebooks/airplane'

In [3]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [5]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [6]:
csv_to_parquet(colab_path + '/train.csv', 'train')
csv_to_parquet(colab_path + '/test.csv', 'test')

train Done.
test Done.


In [7]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv(colab_path + '/sample_submission.csv', index_col = 0)

## **결측값 처리**

In [8]:
train = train.drop(['Cancelled','Diverted'], axis=1) #의미없는 열 제거
test = test.drop(['Cancelled','Diverted'], axis=1) #의미없는 열 제거

In [9]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    mode1 = test[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode1)
print('Done.')

Done.


In [10]:
#질적 변수들을 라벨인코더를 활용해 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [28]:
# 레이블이 있는 데이터와 레이블이 없는 데이터로 분리
labeled_data = train.dropna(subset=['Delay'])
unlabeled_data = train[train['Delay'].isnull()]

In [29]:
labeled_data['Delay']=labeled_data['Delay'].replace({'Not_Delayed': 0, 'Delayed': 1})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['Delay']=labeled_data['Delay'].replace({'Not_Delayed': 0, 'Delayed': 1})


In [30]:
# 초기 모델 학습
model = RandomForestClassifier()
X_train = labeled_data.drop(columns=['ID', 'Delay'])
y_train = labeled_data['Delay']
model.fit(X_train, y_train)

In [31]:
    # 레이블이 없는 데이터 예측
    X_unlabeled = unlabeled_data.drop(columns=['ID','Delay'])
    y_pred = model.predict(X_unlabeled)
    y_prob = model.predict_proba(X_unlabeled)

    # 모델이 확신을 가지고 있는 새로운 레이블 추가
    threshold = 0.95
    new_labels = []
    for j in range(len(y_prob)):
        max_prob = np.max(y_prob[j])
        if max_prob >= threshold:
            new_labels.append(y_pred[j])
        else:
            new_labels.append(np.nan)
            """
    # 중지하는 조건 추가
    if len(new_labels) == 0 or all(pd.isnull(new_labels)):
        break
        """
    # add new labels to unlabeled_data
    unlabeled_data['Delay'] = new_labels
    unlabeled_data_labeled = unlabeled_data[unlabeled_data['Delay'].notnull()]
    unlabeled_data = unlabeled_data[unlabeled_data['Delay'].isnull()]
    """
    # 중지하는 조건 추가
    if len(unlabeled_data_labeled) == 0:
        break
        """
    # nan이 아닌 데이터와 라벨링된 데이터 병합
    labeled_data = pd.concat([labeled_data, unlabeled_data_labeled], ignore_index=True)
   
    # 초기 모델 학습
    model = RandomForestClassifier()
    X_train = labeled_data.drop(columns=['ID', 'Delay'])
    y_train = labeled_data['Delay']
    model.fit(X_train, y_train)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['Delay'] = new_labels


In [26]:
labeled_data['Delay'].value_counts()

0.0    628645
1.0     45010
Name: Delay, dtype: int64

In [33]:
labeled_and_predicted_data

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_369029,8,9,1930.0,2244.0,119,11618,28,333,14893,4,2500.0,26,8,19977.0,2538,1.0
1,TRAIN_393850,6,11,1435.0,2310.0,157,12173,9,195,12892,4,2556.0,16,6,19690.0,1977,1.0
2,TRAIN_405352,12,31,2056.0,2354.0,228,13303,4,290,14307,38,1209.0,3,0,19805.0,2079,0.0
3,TRAIN_408461,5,31,2030.0,2350.0,159,12191,42,127,11697,7,957.0,23,8,19393.0,4292,0.0
4,TRAIN_416707,5,11,2211.0,2343.0,72,11042,33,193,12889,26,1824.0,14,4,20436.0,3797,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255009,TRAIN_999962,10,11,600.0,2003.0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,0.0
255010,TRAIN_999963,5,2,1759.0,1926.0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,1.0
255011,TRAIN_999969,10,10,940.0,1056.0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,1.0
255012,TRAIN_999985,8,8,1914.0,2039.0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,0.0


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 초기 레이블이 있는 데이터셋과 레이블이 없는 데이터셋 로드
labeled_data = pd.read_csv('labeled_data.csv')
unlabeled_data = pd.read_csv('unlabeled_data.csv')

# 초기 레이블링 데이터로 모델 학습
X_train = labeled_data.drop(columns=['label'])
y_train = labeled_data['label']
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Self-Training 시작
num_iterations = 3
for i in range(num_iterations):
    # 레이블이 없는 데이터 중 20%만 추가 레이블링
    sample_size = int(len(unlabeled_data) * 0.2)
    sample_data = unlabeled_data.sample(n=sample_size, random_state=42)
    X_sample = sample_data.drop(columns=['label'])
    y_sample = model.predict(X_sample)
    sample_data['label'] = y_sample

    # 추가된 레이블링 데이터와 기존 레이블링 데이터를 병합하여 모델 재학습
    labeled_data = pd.concat([labeled_data, sample_data], ignore_index=True)
    X_train = labeled_data.drop(columns=['label'])
    y_train = labeled_data['label']
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    
    # 추가된 레이블링 데이터는 unlabeled_data에서 제거
    unlabeled_data = unlabeled_data[~unlabeled_data.index.isin(sample_data.index)]


In [32]:
test_x = test.drop(columns=['ID'])

In [33]:
y_pred = model.predict_proba(test_x)

In [34]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [35]:
submission.to_csv('24_submission.csv', index=True)
