In [3]:
import copy
import os
import random
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings("ignore")

In [4]:
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [5]:
# Hyper-parameters read by the training, validation and fold-loop cells.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # train on 90 days of history
    PREDICT_SIZE=21,       # forecast 21 days
    EPOCHS=10,
    LEARNING_RATE=3e-4,
    BATCH_SIZE=2048,
    SEED=41,
)

In [4]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also seed every GPU in a multi-GPU setup
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm per run and can pick a
    # different one each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed all RNGs with the configured seed

In [5]:
# Read the preprocessed training table, drop the '제품' column and replace
# missing values with 0.
train1 = (
    pd.read_csv('preprocess_train_data.csv')
    .drop(columns=['제품'])
    .fillna(0)
)
train1

Unnamed: 0,ID,대분류,중분류,소분류,브랜드,개당판매금액,언급량,판매량
0,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,13500.0,0.84131,0
1,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,13500.0,0.91383,0
2,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,13500.0,1.45053,0
3,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,13500.0,2.42239,0
4,0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,13500.0,1.87119,0
...,...,...,...,...,...,...,...,...
7293505,15889,B002-C001-0002,B002-C002-0004,B002-C003-0020,B002-03799,49800.0,5.51203,0
7293506,15889,B002-C001-0002,B002-C002-0004,B002-C003-0020,B002-03799,49800.0,3.52480,0
7293507,15889,B002-C001-0002,B002-C002-0004,B002-C003-0020,B002-03799,49800.0,4.03249,0
7293508,15889,B002-C001-0002,B002-C002-0004,B002-C003-0020,B002-03799,49800.0,5.88917,0


In [6]:
# Record each ID's minimum and maximum '판매량' while the column still holds its
# unscaled values; the fold loop later uses these to undo the per-ID scaling.
groups = train1.groupby('ID')

scale_min_dict, scale_max_dict = {}, {}

for item_id, item_rows in groups:
    item_sales = item_rows['판매량']
    scale_min_dict[item_id] = item_sales.min()
    scale_max_dict[item_id] = item_sales.max()

In [7]:
from sklearn.preprocessing import MinMaxScaler

# One scaler object is re-fitted on every call below; after this cell it holds
# the fit of the '개당판매금액' column.
scaler = MinMaxScaler()

def scale_series(s):
    """Min-max scale one Series on its own and keep its original index."""
    as_column = s.values.reshape(-1, 1)
    scaled = scaler.fit_transform(as_column)
    return pd.Series(scaled.flatten(), index=s.index)

# '판매량' is scaled separately within each ID.
sales_by_id = train1.groupby('ID')['판매량']
train1['판매량'] = sales_by_id.transform(scale_series)
# '개당판매금액' is scaled once over the whole column.
train1['개당판매금액'] = scaler.fit_transform(train1['개당판매금액'].values.reshape(-1,1))

In [8]:
# Integer-encode the categorical columns in place.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

# The same encoder is re-fitted per column, so after the loop it only holds the
# classes of the last column.
for column_name in categorical_columns:
    column_values = train1[column_name]
    train1[column_name] = label_encoder.fit(column_values).transform(column_values)

In [9]:
from torch.utils.data import WeightedRandomSampler

def weightedsampler(train_data):
    """Build a sampler that draws every category with equal total probability.

    The category of a sample is read from ``train_data[:, 0, 1]`` (first time
    step, feature column 1) and cast to int. Each sample is weighted by the
    inverse of its category's frequency.

    Args:
        train_data: 3-D NumPy array indexed as ``[sample, time step, feature]``.

    Returns:
        A ``WeightedRandomSampler`` that draws ``len(train_data)`` samples per epoch.
    """
    train_category = train_data[:, 0, 1].astype(int)

    # ``category_index`` maps each sample to its slot in the sorted unique
    # categories, so weights are looked up by position, not by raw category value.
    # Raw values only work as indices when the codes are exactly 0..k-1.
    _, category_index, category_count = np.unique(
        train_category, return_inverse=True, return_counts=True
    )
    samples_weight = torch.from_numpy(1.0 / category_count[category_index]).double()

    return WeightedRandomSampler(samples_weight, len(samples_weight))

In [10]:
class CustomDataset(Dataset):
    """Dataset over an input array ``X`` and an optional target array ``Y``.

    Items come back as ``(input, target)`` tensor pairs, or as the input tensor
    alone when ``Y`` is ``None``.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [10]:
class moving_avg(torch.nn.Module):
    """Moving average along dim 1 of a 3-D tensor, with edge replication.

    The first and last slices along dim 1 are each repeated
    ``(kernel_size - 1) // 2`` times before average pooling.
    """

    def __init__(self, kernel_size, stride):
        super().__init__()
        self.kernel_size = kernel_size
        self.avg = torch.nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        pad = (self.kernel_size - 1) // 2
        head = x[:, :1, :].repeat(1, pad, 1)
        tail = x[:, -1:, :].repeat(1, pad, 1)
        padded = torch.cat([head, x, tail], dim=1)
        # AvgPool1d pools over the last dim, so move dim 1 there and back.
        pooled = self.avg(padded.permute(0, 2, 1))
        return pooled.permute(0, 2, 1)

class series_decomp(torch.nn.Module):
    """Split a series into its moving average and the remainder."""

    def __init__(self, kernel_size):
        super().__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(moving_mean, x - moving_mean)``."""
        smoothed = self.moving_avg(x)
        return smoothed, x - smoothed
        
class LTSF_DLinear(torch.nn.Module):
    """Linear forecaster over a moving-average / remainder decomposition.

    The input is split by ``series_decomp`` into a moving-average part and a
    remainder. Each part is mapped from ``window_size`` steps to
    ``forcast_size`` steps by a linear layer, and the two outputs are summed.

    Args:
        window_size: Number of input time steps.
        forcast_size: Number of predicted time steps.
        kernel_size: Moving-average kernel size used by the decomposition.
        individual: If true, build one pair of linear layers per channel;
            otherwise one pair is shared by all channels.
        feature_size: Number of per-channel layer pairs built when
            ``individual`` is true (not used otherwise).
    """

    def __init__(self, window_size, forcast_size, kernel_size, individual, feature_size):
        super(LTSF_DLinear, self).__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.decompsition = series_decomp(kernel_size)
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear_Seasonal = torch.nn.ModuleList()
            self.Linear_Trend = torch.nn.ModuleList()
            for _ in range(self.channels):
                self.Linear_Trend.append(torch.nn.Linear(self.window_size, self.forcast_size))
                self.Linear_Seasonal.append(torch.nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear_Trend = torch.nn.Linear(self.window_size, self.forcast_size)
            self.Linear_Seasonal = torch.nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        """Forecast the last feature column.

        Args:
            x: Tensor indexed as ``[batch, time step, feature]``; feature
                column 0 (the ID) is dropped before modelling.

        Returns:
            Tensor of shape ``[batch, forcast_size]`` holding the forecast of
            the last feature column (sales volume).
        """
        x = x[:, :, 1:]  # exclude the ID column from the model input
        trend_init, seasonal_init = self.decompsition(x)
        # Put time last so the linear layers act along the time axis.
        trend_init, seasonal_init = trend_init.permute(0, 2, 1), seasonal_init.permute(0, 2, 1)
        if self.individual:
            trend_output = trend_init.new_zeros(
                trend_init.size(0), trend_init.size(1), self.forcast_size
            )
            seasonal_output = seasonal_init.new_zeros(
                seasonal_init.size(0), seasonal_init.size(1), self.forcast_size
            )
            for idx in range(self.channels):
                trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
                seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])
        else:
            trend_output = self.Linear_Trend(trend_init)
            seasonal_output = self.Linear_Seasonal(seasonal_init)
        x = seasonal_output + trend_output
        x = x.permute(0, 2, 1)  # [batch_size, forcast_size, channels]
        # Indexing the last channel already removes the channel axis. A further
        # squeeze(1) would also drop the forecast axis whenever forcast_size == 1.
        return x[:, :, -1]

In [None]:
class AsymmetricMSELoss(nn.Module):
    """Squared error with different weights for over- and under-prediction.

    Elements where ``predictions >= targets`` are weighted by ``alpha``; all
    other elements are weighted by ``1 - alpha``. The result is the mean over
    all elements.
    """

    def __init__(self, alpha = 0.1):
        super().__init__()
        self.alpha = alpha

    def forward(self, predictions, targets):
        error = predictions - targets
        squared_error = error ** 2
        over_penalty = self.alpha * squared_error
        under_penalty = (1 - self.alpha) * squared_error
        weighted = torch.where(error >= 0, over_penalty, under_penalty)
        return weighted.mean()

In [None]:
from collections import defaultdict

class ASAM:
    """Two-step (ascent, then descent) wrapper around an optimizer.

    Presumably an implementation of ASAM (adaptive sharpness-aware
    minimization) — confirm against the reference the code was taken from.

    Intended call order per batch, as used by ``train``:
    backward -> ``ascent_step`` -> backward again -> ``descent_step``.

    Args:
        optimizer: Optimizer whose ``step``/``zero_grad`` are called here.
        model: Module whose ``named_parameters`` are perturbed.
        rho: Scale of the perturbation applied in ``ascent_step``.
        eta: Constant added to ``|weight|`` when scaling weight gradients.
    """

    def __init__(self, optimizer, model, rho=0.5, eta=0.01):
        self.optimizer = optimizer
        self.model = model
        self.rho = rho
        self.eta = eta
        # Per-parameter scratch storage; key "eps" holds one buffer shaped like the parameter.
        self.state = defaultdict(dict)

    @torch.no_grad()
    def ascent_step(self):
        """Perturb parameters along their (rescaled) gradients, then clear the gradients.

        Mutates ``p.grad`` and ``p`` in place for every parameter that has a gradient.
        """
        wgrads = []
        for n, p in self.model.named_parameters():
            if p.grad is None:
                continue
            # Lazily create the per-parameter buffer on first use.
            t_w = self.state[p].get("eps")
            if t_w is None:
                t_w = torch.clone(p).detach()
                self.state[p]["eps"] = t_w
            if 'weight' in n:
                # Buffer becomes |w| + eta and scales the gradient (first time).
                t_w[...] = p[...]
                t_w.abs_().add_(self.eta)
                p.grad.mul_(t_w)
            wgrads.append(torch.norm(p.grad, p=2))
        # Global L2 norm over all collected gradient norms; the tiny constant avoids division by zero.
        wgrad_norm = torch.norm(torch.stack(wgrads), p=2) + 1.e-16
        for n, p in self.model.named_parameters():
            if p.grad is None:
                continue
            t_w = self.state[p].get("eps")
            if 'weight' in n:
                # Second scaling by |w| + eta; must happen before the buffer is overwritten below.
                p.grad.mul_(t_w)
            # The same buffer is now reused to hold the perturbation itself.
            eps = t_w
            eps[...] = p.grad[...]
            eps.mul_(self.rho / wgrad_norm)
            p.add_(eps)
        self.optimizer.zero_grad()

    @torch.no_grad()
    def descent_step(self):
        """Remove the stored perturbation, take the optimizer step, and clear the gradients."""
        for n, p in self.model.named_parameters():
            if p.grad is None:
                continue
            # Undo the perturbation added by ascent_step before the optimizer updates p.
            p.sub_(self.state[p]["eps"])
        self.optimizer.step()
        self.optimizer.zero_grad()

In [None]:
from joblib import Parallel, delayed

def compute_for_cat(cat_df, ids):
    """Weighted relative error of one ID within a category array.

    Args:
        cat_df: 2-D array whose columns are read as 0 = ID, 2 = true value,
            3 = predicted value.
        ids: The ID whose rows are selected.

    Returns:
        Sum over the selected rows of ``|true - pred| / max(true, pred)``,
        each weighted by that row's share of the ID's total true value.
    """
    rows = cat_df[cat_df[:, 0] == ids]
    actual = rows[:, 2]
    forecast = rows[:, 3]

    # The small constants keep both divisions defined when the values are all zero.
    actual_total = np.sum(actual) + 1e-10
    abs_error = np.abs(actual - forecast)
    larger_value = np.maximum(actual, forecast) + 1e-10
    weighted_error = (abs_error / larger_value) * (actual / actual_total)
    return np.sum(weighted_error)

def compute_PSFA(sub_df, n_categories=5):
    """Compute the PSFA score averaged over categories.

    For every category ``c`` in ``range(n_categories)`` the per-ID weighted
    relative error (the same formula as ``compute_for_cat``) is averaged over
    the IDs of that category, and ``1 - average`` is the category score. The
    result is the mean of the category scores.

    Per-ID sums are computed with grouped sums (``np.bincount``) instead of
    re-filtering the whole category array once per ID.

    Args:
        sub_df: 3-D array indexed as ``[window, step, column]`` with columns
            0 = ID, 1 = category, 2 = true value, 3 = predicted value.
        n_categories: Number of category codes ``0..n_categories-1`` to score.

    Returns:
        Mean category score. Categories with no rows are skipped instead of
        contributing NaN; NaN is returned only when no category has any rows.
    """
    category_scores = []

    for cat in range(n_categories):
        # A 2-D boolean mask on the 3-D array flattens the matches to rows of 4 columns.
        cat_df = sub_df[sub_df[:, :, 1] == cat]
        if len(cat_df) == 0:
            continue

        _, id_index = np.unique(cat_df[:, 0], return_inverse=True)
        true_Y, pred_Y = cat_df[:, 2], cat_df[:, 3]

        relative_error = np.abs(true_Y - pred_Y) / (np.maximum(true_Y, pred_Y) + 1e-10)
        # Per ID: sum(relative_error * true) / (sum(true) + eps).
        weighted_error = np.bincount(id_index, weights=relative_error * true_Y)
        true_total = np.bincount(id_index, weights=true_Y) + 1e-10
        per_id_error = weighted_error / true_total

        category_scores.append(1 - per_id_error.mean())

    if not category_scores:
        return np.float64(np.nan)
    return np.mean(category_scores)

In [None]:
def train(model, minimizer, train_loader, val_loader, device, scheduler):
    """Train ``model`` with the two-step minimizer and keep the best epoch.

    Each batch runs a forward/backward pass followed by
    ``minimizer.ascent_step()``, then a second forward/backward pass followed
    by ``minimizer.descent_step()``. After every epoch the model is validated
    and a snapshot is kept whenever the PSFA score improves.

    Args:
        model: Module to train; it is wrapped in ``nn.DataParallel``.
        minimizer: Object exposing ``ascent_step()`` and ``descent_step()``.
        train_loader: Iterable of ``(X, Y)`` training batches.
        val_loader: Iterable of ``(X, Y)`` validation batches.
        device: Device the model and the batches are moved to.
        scheduler: Optional LR scheduler stepped once per epoch, or ``None``.

    Returns:
        ``(best_model, best_score)``: a copy of the wrapped model from the
        epoch with the highest PSFA, and that PSFA. If no epoch scored above
        0, the final model is returned with a score of 0.
    """
    # Use the first two GPUs when present, a single GPU when only one exists,
    # and let DataParallel act as a plain wrapper on CPU-only machines.
    device_ids = list(range(min(2, torch.cuda.device_count()))) or None
    model = nn.DataParallel(
        model, device_ids=device_ids, output_device=0 if device_ids else None
    )
    model.to(device)

    criterion = AsymmetricMSELoss().to(device)

    best_score = 0
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(train_loader):
            X, Y = X.to(device), Y.to(device)

            # Ascent Step
            output = model(X)
            loss = criterion(output, Y)
            loss.backward()
            minimizer.ascent_step()

            # Descent Step
            loss_2 = criterion(model(X), Y)
            loss_2.backward()
            minimizer.descent_step()

            train_loss.append(loss.item())

        val_loss, psfa = validation(model, val_loader, criterion, device)

        if scheduler is not None:
            scheduler.step()

        model_save = ''
        if psfa > best_score:
            best_score = psfa
            # Snapshot the weights: keeping a reference to ``model`` would
            # silently track later epochs and always return the last one.
            best_model = copy.deepcopy(model)
            model_save = 'Model Saved'

        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] PSFA : [{psfa:.5f}] {model_save}')

    if best_model is None:
        # No epoch beat the initial score; hand back the trained model rather than None.
        best_model = model
    return best_model, best_score

def validation(model, val_loader, criterion, device):
    """Evaluate ``model`` on the validation set.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        val_loader: Iterable of ``(X, Y)`` validation batches.
        criterion: Loss applied to ``(output, Y)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean validation loss, PSFA score)``.
    """
    model.eval()
    val_loss = []
    sub_df_list = []

    with torch.no_grad():
        for X, Y in tqdm(val_loader):
            X = X.to(device)
            Y = Y.to(device)

            output = model(X)
            loss = criterion(output, Y)

            val_loss.append(loss.item())

            # Take as many time steps of the first two feature columns
            # (ID and category) as the target has steps, so the arrays line
            # up for any forecast length instead of a hard-coded 21.
            horizon = Y.shape[1]
            X_ids = X[:, :horizon, :2].detach().cpu().numpy()
            true_Y = Y.detach().cpu().numpy().reshape(Y.shape[0], horizon, 1)
            pred_Y = output.detach().cpu().numpy().reshape(output.shape[0], output.shape[1], 1)

            # Column layout expected by compute_PSFA: [ID, category, true, pred].
            sub_df_list.append(np.concatenate([X_ids, true_Y, pred_Y], axis=2))

    sub_df = np.concatenate(sub_df_list, axis=0)
    PSFA = compute_PSFA(sub_df)

    return np.mean(val_loss) , PSFA

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect the predictions.

    Args:
        model: Trained module; it is switched to eval mode before predicting.
        test_loader: Iterable of input batches ``X`` (no targets).
        device: Device the batches are moved to.

    Returns:
        NumPy array with one row of predictions per input sample, in loader order.
    """
    # Do not rely on the caller having left the model in eval mode.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(test_loader):
            X = X.to(device)
            output = model(X)
            predictions.extend(output.cpu().numpy())

    return np.array(predictions)

In [None]:
# Load the pre-built training inputs, training targets and test inputs from disk.
# NOTE(review): later cells read [:, 0, 0] as the ID and [:, 0, 1] as the category,
# so these are presumably [window, time step, feature] arrays — confirm against the
# script that produced the .npy files.
train1_input = np.load('./data/new_data/train1_input_mean_stds.npy')
train1_target = np.load('./data/new_data/train1_target_mean_stds.npy')
test_input = np.load('./data/new_data/test_input_mean_stds.npy')

In [None]:
# ID of every training window, read from feature column 0 of its first time step.
train_input_ids = train1_input[:, 0, 0].astype(int)

from sklearn.model_selection import StratifiedKFold

# 10 shuffled folds with a fixed seed; the fold loop stratifies on the IDs above.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=CFG['SEED'])

In [None]:
# Test windows have no targets and are served in their stored order.
test_dataset = CustomDataset(X=test_input, Y=None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [None]:
# Create both output folders up front so a missing directory cannot discard a
# fully trained fold at save time.
# NOTE(review): the submission folder is an absolute, machine-specific path while
# the checkpoint folder is relative — confirm whether both should live under one
# configurable directory.
for out_dir in ('./data/ensemble_submit', '/home/hwlee/dacon/LGAI/ensemble_submit'):
    os.makedirs(out_dir, exist_ok=True)

# Train one model per fold, predict the test set with it, and write the fold's
# checkpoint and submission file.
for fold_num, (train_idx, valid_idx) in enumerate(skf.split(train1_input, train_input_ids)):
    print(f'fold: {fold_num}', '='*50)
    
    train_input, train_target = train1_input[train_idx], train1_target[train_idx]
    val_input, val_target = train1_input[valid_idx], train1_target[valid_idx]

    # Training batches are drawn by the category-balanced sampler, so shuffle stays off.
    train_dataset = CustomDataset(train_input, train_target)
    train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=16, pin_memory=True, sampler=weightedsampler(train_input))

    val_dataset = CustomDataset(val_input, val_target)
    val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=16, pin_memory=True)

    model = LTSF_DLinear(window_size=CFG['TRAIN_WINDOW_SIZE'], forcast_size=CFG["PREDICT_SIZE"], kernel_size=25,individual=False, feature_size=1)

    optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
    minimizer = ASAM(optimizer, model)

    scheduler = None

    print("Start Training")
    infer_model, fold_score = train(model, minimizer, train_loader, val_loader, device, scheduler)

    pred = inference(infer_model, test_loader, device)

    # Undo the per-ID min-max scaling of the predictions.
    # NOTE(review): the prediction's row index is used as the ID key, which assumes
    # test rows are ordered by ID starting at 0 — confirm against how test_input was built.
    for idx in range(len(pred)):
        pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

    torch.save(infer_model.state_dict(), f'./data/ensemble_submit/linear_idsplit_{fold_num}_{fold_score}.pth')

    pred = np.round(pred, 0).astype(int)
    
    submit = pd.read_csv('./sample_submission.csv')
    submit.iloc[:,1:] = pred
    submit.to_csv(f'/home/hwlee/dacon/LGAI/ensemble_submit/linear_idsplit_{fold_num}_{fold_score}.csv', index=False)
    
    print("Done")

    # Release this fold's large objects before the next fold starts.
    del train_input, train_target, val_input, val_target, train_dataset, train_loader, val_dataset, val_loader, model, optimizer, scheduler, infer_model, pred