# Import

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import math
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# Data Load

In [2]:
train = pd.read_csv('train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('test.csv').drop(columns=['SAMPLE_ID'])

# Data Pre-processing

In [3]:
# datetime 컬럼 처리
train['ATA'] = pd.to_datetime(train['ATA'])
test['ATA'] = pd.to_datetime(test['ATA'])

In [4]:
# datetime을 여러 파생 변수로 변환
for df in [train, test]:
    df['year'] = df['ATA'].dt.year
    df['month'] = df['ATA'].dt.month
    df['day'] = df['ATA'].dt.day
    df['hour'] = df['ATA'].dt.hour
    df['minute'] = df['ATA'].dt.minute
    df['weekday'] = df['ATA'].dt.weekday

In [5]:
# Categorical 컬럼 인코딩
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']


for feature in tqdm(categorical_features, desc="Encoding features"):
    encoder = LabelEncoder()
    train[feature] = encoder.fit_transform(train[feature])
    for label in np.unique(test[feature]):
        if label not in encoder.classes_:
            encoder.classes_ = np.append(encoder.classes_, label)
    test[feature] = encoder.transform(test[feature])

Encoding features: 100%|██████████| 6/6 [00:09<00:00,  1.59s/it]


In [6]:
# 결측치 처리
train.fillna(train.mean(), inplace=True)
test.fillna(train.mean(), inplace=True)

  train.fillna(train.mean(), inplace=True)
  test.fillna(train.mean(), inplace=True)


# Data Generation

In [7]:
# ATA_LT SIN-COS
train['COS_ATA_LT'] = np.cos(2*np.pi*(train.ATA_LT/24))
train['SIN_ATA_LT'] = np.sin(2*np.pi*(train.ATA_LT/24))

test['COS_ATA_LT'] = np.cos(2*np.pi*(test.ATA_LT/24))
test['SIN_ATA_LT'] = np.sin(2*np.pi*(test.ATA_LT/24))

In [8]:
# round_hour
train['rounded_hour'] = (train['ATA'].dt.hour + (train['ATA'].dt.minute // 30)).apply(lambda x: 0 if x == 24 else x)
test['rounded_hour'] = (test['ATA'].dt.hour + (test['ATA'].dt.minute // 30)).apply(lambda x: 0 if x == 24 else x)

In [9]:
# Date SIN-COS
# sin, cos 변환 함수 정의
def encode_cyclic_feature(data, column, max_val):
    data[column + '_sin'] = np.sin(2 * np.pi * data[column] / max_val)
    data[column + '_cos'] = np.cos(2 * np.pi * data[column] / max_val)
    return data

# 각 피처에 대해 sin, cos 변환 수행
train = encode_cyclic_feature(train, 'month', 12)
train = encode_cyclic_feature(train, 'day', 31)
train = encode_cyclic_feature(train, 'weekday', 7)
train = encode_cyclic_feature(train, 'rounded_hour', 24)
test = encode_cyclic_feature(test, 'month', 12)
test = encode_cyclic_feature(test, 'day', 31)
test = encode_cyclic_feature(test, 'weekday', 7)
test = encode_cyclic_feature(test, 'rounded_hour', 24)

train.drop(['ATA'],axis=1,inplace=True)
test.drop(['ATA'],axis=1,inplace=True)

In [10]:
# Ship info category

ship_info = ['SHIP_TYPE_CATEGORY','BREADTH','BUILT','DEADWEIGHT','DEPTH','DRAUGHT','GT','LENGTH','FLAG']

train_data = train[ship_info]
test_data = test[ship_info]

scaler = MinMaxScaler()
train_data_scale = scaler.fit_transform(train_data)
test_data_scale = scaler.transform(test_data)

#model = KMeans()
#visualizer = KElbowVisualizer(model, k=(1,10))
#visualizer.fit(train_data_scale)

In [11]:
k = 4

# 그룹 수, random_state 설정
model = KMeans(n_clusters = k, random_state = 10)

# 정규화된 데이터에 학습
model.fit(train_data_scale)

# 클러스터링 결과 각 데이터가 몇 번째 그룹에 속하는지 저장
train['ship_cluster'] = model.fit_predict(train_data_scale)
test['ship_cluster'] = model.predict(test_data_scale)

# Info

In [12]:
trian_target = train['CI_HOUR']
train.drop(['CI_HOUR'],axis=1,inplace=True)
train['CI_HOUR'] = trian_target

# CSV

In [14]:
train.to_csv('train_merge_new_fillna.csv',index=False)
test.to_csv('test_merge_new_fillna.csv',index=False)