In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This only works inside Google Colab, where the `google.colab` package exists.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [3]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [4]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), exports PYTHONHASHSEED, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # undoes the determinism requested above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

In [None]:
class CustomDataset(Dataset):
    """Index-based dataset over inputs `X` and optional targets `Y`.

    Items are converted with `torch.Tensor(...)` on access. When `Y` is None
    (prediction time) only the input tensor is returned; otherwise an
    `(input, target)` pair is returned.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        # The number of samples is defined by the inputs.
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [None]:
def train(model, optimizer, train_loader, device, epochs=8):
    """Train `model` with MSE loss and print the mean training loss per epoch.

    Args:
        model: Module to optimise; it is moved to `device` in place.
        optimizer: Optimizer already bound to `model`'s parameters.
        train_loader: Iterable yielding `(X, Y)` batches.
        device: Device on which the batches and the model are placed.
        epochs: Number of passes over `train_loader`. Defaults to 8, the value
            that used to be hard-coded.

    Returns:
        The same `model` instance after the last epoch (no validation or
        best-checkpoint selection is performed).
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)
    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = []
        for X, Y in tqdm(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, Y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : []')
    return model

In [None]:
def validation(model, val_loader, criterion, device):
    """Return the mean of `criterion` over all batches of `val_loader`.

    The model is put in evaluation mode and gradients are disabled while the
    batches are scored.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs, targets in tqdm(iter(val_loader)):
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_losses.append(criterion(model(inputs), targets).item())
    return np.mean(batch_losses)

In [None]:
def inference(model, test_loader, device):
    """Run `model` over `test_loader` and collect the predictions.

    Args:
        model: Trained module; it is moved to `device` and switched to
            evaluation mode (and left in that mode on return).
        test_loader: Iterable yielding input batches only (no targets).
        device: Device on which the forward passes are executed.

    Returns:
        A NumPy array with one prediction row per input sample, in loader order.
    """
    model.to(device)
    # Without eval() layers such as Dropout stay in training mode and make the
    # predictions random.
    model.eval()
    predictions = []

    with torch.no_grad():
        for X in tqdm(test_loader):
            X = X.to(device)
            output = model(X).cpu().numpy()
            predictions.extend(output)

    return np.array(predictions)

In [None]:
def improved_fill_zeros_with_median_and_selective_interpolation_v6(data):
    """Replace zero entries of a 1-D array by linear interpolation.

    * Zeros between two non-zero values are interpolated linearly between them.
    * Leading zeros ramp from the median of the non-zero values towards the
      first non-zero value.
    * Trailing zeros ramp from the last non-zero value towards that median.

    If every entry is zero, the input object itself is returned unchanged;
    otherwise a filled copy is returned and `data` is not modified.
    """
    observed = [v for v in data if v != 0]
    if not observed:
        return data

    fill_level = np.median(observed)
    result = data.copy()
    positions = np.where(data != 0)[0]
    first, last = positions[0], positions[-1]
    total = len(data)

    if first != 0:
        # Ramp median -> first observation, dropping the end point because the
        # observation itself already sits at index `first`.
        result[:first] = np.linspace(fill_level, data[first], first + 1)[:-1]
    if last != total - 1:
        # NOTE(review): this also drops the END point, so the first trailing
        # value repeats data[last] and the median is never reached - confirm
        # whether `[1:]` was intended.
        result[last + 1:] = np.linspace(data[last], fill_level, total - last)[:-1]

    # Interior gaps: straight line between each pair of neighbouring observations.
    for left, right in zip(positions[:-1], positions[1:]):
        result[left + 1:right] = np.linspace(data[left], data[right], right - left + 1)[1:-1]

    return result

def add_noise_to_interpolated_values_v2(original_data, processed_data, std_dev=0.5):
    """Add zero-mean Gaussian noise to the filled-in positions only.

    Positions where `original_data` is zero receive noise with standard
    deviation `std_dev`; all other positions of `processed_data` are returned
    unchanged.
    """
    gaussian = np.random.normal(0, std_dev, len(original_data))
    was_zero = original_data == 0
    return processed_data + np.where(was_zero, gaussian, 0)

def add_bootstrap_noise_to_interpolated_values(original_data, processed_data):
    """Perturb the filled-in positions with bootstrap noise.

    The noise is drawn with replacement from the non-zero values of
    `original_data`, centred on their median, and added only where
    `original_data` is zero. If there are no non-zero values,
    `processed_data` is returned as is.
    """
    observed = [v for v in original_data if v != 0]
    if not observed:
        return processed_data
    resampled = np.random.choice(observed, size=len(original_data), replace=True)
    centred = resampled - np.median(observed)
    return np.where(original_data == 0, processed_data + centred, processed_data)


In [None]:
# Seeds for the repeated BiLSTM runs below; one submission CSV is written per seed.
seed_list=[43,44,93779555]
seed_list

In [None]:

# BiLSTM forecasting: one model per shopping mall, repeated once per seed.
# Configuration, helpers and the model class are defined once, before the
# loops, instead of being re-created on every (seed, shop) iteration.
CFG = {
    'TRAIN_WINDOW_SIZE':28,
    'PREDICT_SIZE':21,
    'EPOCHS':8,
    'LEARNING_RATE':0.001,
    'BATCH_SIZE':1024,
    'SEED':None,  # set for each seed inside the loop below
}

# Number of leading non-date columns; everything after them is the daily series.
N_META_COLS = 7


def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    """Cut every row's series into sliding (input, target) windows.

    Args:
        data: DataFrame whose first N_META_COLS columns are metadata and whose
            remaining columns are the numeric series.
        train_size: Length of the input part of each window.
        predict_size: Length of the target part of each window.

    Returns:
        `(input_data, target_data)` with shapes
        `(rows * windows_per_row, train_size, 1)` and
        `(rows * windows_per_row, predict_size)`.

    Raises:
        ValueError: If the series is shorter than `train_size + predict_size`.
    """
    window_size = train_size + predict_size
    sales = data.iloc[:, N_META_COLS:].to_numpy(dtype=float)
    num_rows, num_days = sales.shape

    # The window count must come from the series length. Using
    # len(data.columns) also counted the metadata columns, which allocated rows
    # in the np.empty buffers that were never written and were then trained on.
    windows_per_row = num_days - window_size + 1
    if windows_per_row <= 0:
        raise ValueError(
            f'series length {num_days} is shorter than the window size {window_size}'
        )

    input_data = np.empty((num_rows * windows_per_row, train_size, 1))
    target_data = np.empty((num_rows * windows_per_row, predict_size))

    for i in tqdm(range(num_rows)):
        base = i * windows_per_row
        for j in range(windows_per_row):
            window = sales[i, j : j + window_size]
            input_data[base + j] = window[:train_size].reshape(-1, 1)
            target_data[base + j] = window[train_size:]

    return input_data, target_data


def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    """Return the last `train_size` values of every row, shaped (rows, train_size, 1)."""
    tail = data.iloc[:, -train_size:].to_numpy(dtype=float)
    return np.ascontiguousarray(tail).reshape(len(data), train_size, 1)


class BaseModel(nn.Module):
    """Bidirectional LSTM whose last time step feeds a two-layer MLP head.

    The head output is passed through ReLU, so predictions are non-negative.
    """

    def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
        super(BaseModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, output_size)
        )

        self.actv = nn.ReLU()

    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)
        lstm_out, hidden = self.lstm(x, hidden)
        # Only the representation at the final time step is used.
        last_output = lstm_out[:, -1, :]
        output = self.actv(self.fc(last_output))
        return output.squeeze(1)

    def init_hidden(self, batch_size, device):
        # Two directions x one layer -> leading dimension of 2.
        return (torch.zeros(2, batch_size, self.hidden_size, device=device),
                torch.zeros(2, batch_size, self.hidden_size, device=device))


def train(model, optimizer, train_loader, device):
    """Train `model` with MSE loss for CFG['EPOCHS'] epochs and return it."""
    model.to(device)
    criterion = nn.MSELoss().to(device)

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for X, Y in tqdm(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, Y)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : []')
    return model


# The raw table does not change between shops or seeds, so read it only once.
train_raw = pd.read_csv('./data2/train.csv')

for seed in seed_list:
    CFG['SEED'] = seed

    submit = pd.read_csv('./data2/sample_submission.csv')
    seed_everything(CFG['SEED'])
    for shop in tqdm(range(1,11)):
        shop = str(shop).zfill(2)
        train_data = train_raw.loc[train_raw['쇼핑몰'] == f'S001-000{shop}']
        ids = train_data['ID']
        date_columns = train_data.columns[N_META_COLS:]

        # Replace each row's series with the per-product total within this shop.
        grouped_by_product = train_data.groupby('제품')[date_columns].sum().reset_index()

        merged_df = pd.merge(train_data.drop(columns=date_columns), grouped_by_product, on='제품', how='left')
        train_data = merged_df.copy()

        # Fill zero entries of every row by interpolation.
        for i in range(len(train_data)):
            value = list(train_data.iloc[i][N_META_COLS:])
            interpolated_sales_data = improved_fill_zeros_with_median_and_selective_interpolation_v6(np.array(value))
            train_data.iloc[i, N_META_COLS:] = interpolated_sales_data

        # Row-wise min-max scaling; the min/max are kept to undo it after prediction.
        numeric_cols = train_data.columns[N_META_COLS:]
        min_values = train_data[numeric_cols].min(axis=1)
        max_values = train_data[numeric_cols].max(axis=1)
        ranges = max_values - min_values
        ranges[ranges == 0] = 1  # avoid division by zero for constant rows
        train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
        scale_min_dict = min_values.to_dict()
        scale_max_dict = max_values.to_dict()

        train_input, train_target = make_train_data(train_data)
        test_input = make_predict_data(train_data)

        train_dataset = CustomDataset(train_input, train_target)
        train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

        model = BaseModel()
        optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
        infer_model = train(model, optimizer, train_loader, device)

        test_dataset = CustomDataset(test_input, None)
        test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
        # train() leaves the model in training mode; switch Dropout off before
        # predicting so the forecasts are deterministic.
        infer_model.eval()
        pred = inference(infer_model, test_loader, device)

        # Undo the row-wise min-max scaling.
        for idx in range(len(pred)):
            pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
        pred = np.round(pred, 0).astype(int)
        submit.loc[submit['ID'].isin(ids.values), '2023-04-25':] = pred
    submit.to_csv(f'biLSbyshopping_28_{seed}_.csv',index=False)

# LTSF-Linear

In [None]:
def improved_fill_zeros_with_median_and_selective_interpolation_v6(data):
    """Replace zero entries of a 1-D array by linear interpolation.

    * Zeros between two non-zero values are interpolated linearly between them.
    * Leading zeros ramp from the median of the non-zero values towards the
      first non-zero value.
    * Trailing zeros ramp from the last non-zero value towards that median.

    If every entry is zero, the input object itself is returned unchanged;
    otherwise a filled copy is returned and `data` is not modified.
    """
    observed = [v for v in data if v != 0]
    if not observed:
        return data

    fill_level = np.median(observed)
    result = data.copy()
    positions = np.where(data != 0)[0]
    first, last = positions[0], positions[-1]
    total = len(data)

    if first != 0:
        # Ramp median -> first observation, dropping the end point because the
        # observation itself already sits at index `first`.
        result[:first] = np.linspace(fill_level, data[first], first + 1)[:-1]
    if last != total - 1:
        # NOTE(review): this also drops the END point, so the first trailing
        # value repeats data[last] and the median is never reached - confirm
        # whether `[1:]` was intended.
        result[last + 1:] = np.linspace(data[last], fill_level, total - last)[:-1]
    # Interior gaps: straight line between each pair of neighbouring observations.
    for left, right in zip(positions[:-1], positions[1:]):
        result[left + 1:right] = np.linspace(data[left], data[right], right - left + 1)[1:-1]
    return result

def add_noise_to_interpolated_values_v2(original_data, processed_data, std_dev=0.5):
    """Add zero-mean Gaussian noise to the filled-in positions only.

    Positions where `original_data` is zero receive noise with standard
    deviation `std_dev`; all other positions of `processed_data` are returned
    unchanged.
    """
    gaussian = np.random.normal(0, std_dev, len(original_data))
    was_zero = original_data == 0
    return processed_data + np.where(was_zero, gaussian, 0)

def add_bootstrap_noise_to_interpolated_values(original_data, processed_data):
    """Perturb the filled-in positions with bootstrap noise.

    The noise is drawn with replacement from the non-zero values of
    `original_data`, centred on their median, and added only where
    `original_data` is zero. If there are no non-zero values,
    `processed_data` is returned as is.
    """
    observed = [v for v in original_data if v != 0]
    if not observed:
        return processed_data
    resampled = np.random.choice(observed, size=len(original_data), replace=True)
    centred = resampled - np.median(observed)
    return np.where(original_data == 0, processed_data + centred, processed_data)


In [None]:
# Load the training table and the submission template for the LTSF-Linear stage.
# NOTE(review): the BiLSTM cell reads these files from './data2/'; confirm that
# the bare relative paths used here point at the same data.
train_data = pd.read_csv('train.csv')
submit = pd.read_csv('sample_submission.csv')
# Columns from position 7 onward are treated as the per-date series.
date_columns = train_data.columns[7:]

# Sum the series of all rows that share the same product ('제품').
grouped_by_product = train_data.groupby('제품')[date_columns].sum().reset_index()

# Replace each row's own series with its product-level total (left join on product).
merged_df = pd.merge(train_data.drop(columns=date_columns), grouped_by_product, on='제품', how='left')
train_data = merged_df.copy()

# Fill the zero entries of every row by interpolation, one row at a time.
for i in range(len(train_data)):
    value = list(train_data.iloc[i][7:])
    interpolated_sales_data = improved_fill_zeros_with_median_and_selective_interpolation_v6(np.array(value))
    train_data.iloc[i, 7:] = interpolated_sales_data
# Snapshot consumed by the later cells (reshape_data and the blending step).
train_df = train_data.copy()

In [None]:
class LTSF_Linear(torch.nn.Module):
    """Linear map from a `window_size`-long history to `forecast_size` steps.

    Inputs are shaped (batch, window_size, channels) and outputs
    (batch, forecast_size, channels). With `individual=True` every channel has
    its own linear layer; otherwise one layer is shared by all channels.
    """

    def __init__(self, window_size, forecast_size, individual, feature_size):
        super().__init__()
        self.window_size = window_size
        self.forecast_size = forecast_size
        self.individual = individual
        self.channels = feature_size
        if not self.individual:
            self.Linear = torch.nn.Linear(self.window_size, self.forecast_size)
        else:
            # One independent projection per channel.
            self.Linear = torch.nn.ModuleList(
                [torch.nn.Linear(self.window_size, self.forecast_size) for _ in range(self.channels)]
            )

    def forward(self, x):
        if not self.individual:
            # Move time to the last axis so the shared layer acts along it,
            # then restore the (batch, time, channel) layout.
            return self.Linear(x.permute(0, 2, 1)).permute(0, 2, 1)

        output = torch.zeros([x.size(0), self.forecast_size, x.size(2)], dtype=x.dtype).to(x.device)
        for channel in range(self.channels):
            output[:, :, channel] = self.Linear[channel](x[:, :, channel])
        return output

class Data(Dataset):
    """Minimal paired dataset returning `(X[idx], Y[idx])` without conversion.

    The dataset length is taken from `Y`.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __getitem__(self, idx):
        sample, label = self.X[idx], self.Y[idx]
        return sample, label

    def __len__(self):
        return len(self.Y)


In [None]:
def reshape_data(df):
    """Collect every row's values from position 7 onward into a float array.

    Returns an array with one row per DataFrame row.
    """
    return np.array([row[7:].values.astype(float) for _, row in df.iterrows()])

def time_slide_df(data, window_size, forecast_size):
    """Build sliding-window samples from a 1-D series.

    Each sample pairs `window_size` consecutive values, shaped
    (window_size, 1), with the `forecast_size` values that follow them.

    Returns:
        `(X, Y)` as float32 arrays, one entry per window start.
    """
    last_start = len(data) - window_size - forecast_size + 1
    starts = range(0, last_start)
    inputs = [data[s:s + window_size].reshape(window_size, 1) for s in starts]
    targets = [data[s + window_size:s + window_size + forecast_size] for s in starts]
    return np.array(inputs, dtype='float32'), np.array(targets, dtype='float32')

def create_dataloader(data, window_size, forecast_size, batch_size):
    """Wrap the sliding windows of `data` in a shuffling DataLoader."""
    windows, targets = time_slide_df(data, window_size, forecast_size)
    return DataLoader(Data(windows, targets), batch_size=batch_size, shuffle=True)

# One float row per row of train_df, holding that row's series values.
time_series_data = reshape_data(train_df)

In [None]:
def time_slide_df(data, window_size, forecast_size):
    """Build sliding-window samples from a 1-D series.

    Each sample pairs `window_size` consecutive values, shaped
    (window_size, 1), with the `forecast_size` values that follow them.

    Returns:
        `(X, Y)` as float32 arrays, one entry per window start.
    """
    last_start = len(data) - window_size - forecast_size + 1
    starts = range(0, last_start)
    inputs = [data[s:s + window_size].reshape(window_size, 1) for s in starts]
    targets = [data[s + window_size:s + window_size + forecast_size] for s in starts]
    return np.array(inputs, dtype='float32'), np.array(targets, dtype='float32')

def create_dataloader(data, window_size, forecast_size, batch_size):
    """Wrap the sliding windows of `data` in a shuffling DataLoader."""
    windows, targets = time_slide_df(data, window_size, forecast_size)
    return DataLoader(Data(windows, targets), batch_size=batch_size, shuffle=True)

# LTSF-Linear stage: fit one small linear model per ID on that ID's own
# standardised series, then forecast from its last window.
time_series_data = reshape_data(train_df)

window_size = 28   # Number of past days fed to the model
forecast_size = 21 # Predicting the next 21 days
batch_size = 4096
epoch_count = 777 #777
lr = 0.001
min_delta = 0.001  # minimum loss decrease that counts as an improvement
patience = 10      # number of non-improving checks tolerated before stopping

future_predictions_by_id = {}
loss_history = {}

for id_val, series in tqdm(zip(train_df["ID"], time_series_data), total=len(train_df["ID"])):
    # Standardise each series on its own; constant series are left untouched.
    mean_ = np.mean(series)
    std_ = np.std(series)
    if std_ == 0:
        standardized_data = series
    else:
        standardized_data = (series - mean_) / std_
    individual_loss_history = []

    train_dl = create_dataloader(standardized_data, window_size, forecast_size, batch_size)
    best_loss = float('inf')
    no_improvement_count = 0

    DLinear_model = LTSF_Linear(window_size=window_size, forecast_size=forecast_size, individual=False, feature_size=1)
    DLinear_model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)

    for epoch in range(1, epoch_count + 1):
        loss_list = []
        DLinear_model.train()
        # Distinct batch names: the outer loop variable must not be overwritten.
        for batch_x, batch_y in train_dl:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            output = DLinear_model(batch_x)
            # The model returns (batch, forecast, 1); give the target the same shape.
            loss = criterion(output, batch_y.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # Early stopping is evaluated only on every 10th epoch.
        if((epoch % 10) == 0):
            avg_loss = np.mean(loss_list)
            individual_loss_history.append(avg_loss)
            if avg_loss + min_delta < best_loss:
                best_loss = avg_loss
                no_improvement_count = 0
            else:
                no_improvement_count += 1
                if no_improvement_count >= patience:
                    break

    loss_history[id_val] = individual_loss_history

    # Forecast from the most recent window; no gradients are needed here.
    last_window_data = torch.tensor(standardized_data[-window_size:]).unsqueeze(0).unsqueeze(-1).float().to(device)
    DLinear_model.eval()
    with torch.no_grad():
        future_prediction = DLinear_model(last_window_data).squeeze().cpu().numpy()
    if std_ != 0:
        # Undo the standardisation applied above.
        future_prediction = future_prediction * std_ + mean_
    future_predictions_by_id[id_val] = future_prediction

In [None]:
# Copy the per-ID forecasts into the submission template.
for id_val, predictions in future_predictions_by_id.items():
    rounded_predictions = np.round(predictions).astype(np.float64)
    submit.loc[submit['ID'] == id_val, '2023-04-25':'2023-05-15'] = rounded_predictions
# Negative forecasts are floored at zero (clip replaces the deprecated
# element-wise applymap lambda and leaves NaN untouched, as before).
submit.iloc[:, 1:] = submit.iloc[:, 1:].clip(lower=0)
submit = submit.fillna(0)
# NOTE(review): `seed` is not set in this stage; it is whatever value the
# earlier BiLSTM seed loop left behind - confirm this file name is intended.
submit.to_csv(f'l_{window_size}_{seed}_.csv',index=False)

In [16]:
# For shops 6, 7 and 9, replace the BiLSTM forecast by the rounded average of
# the BiLSTM forecast and the values currently held in `submit`.
# NOTE(review): `submit` is presumably the LTSF-Linear submission built in the
# preceding cell, and train_df's index is assumed to match the row order of
# both submission frames - confirm.
lstm_output_list = ['biLSbyshopping_28_43_.csv', 'biLSbyshopping_28_44_.csv', 'biLSbyshopping_28_93779555_.csv']
for lstm_out in lstm_output_list:
    lstm_sub = pd.read_csv(f'./{lstm_out}')
    # The per-row median frame that used to be built here was never used and
    # cost 21 full-frame medians per file, so it has been removed.
    for i in [6,7,9]:
        df = train_df[train_df['쇼핑몰'] == f'S001-0000{i}']
        for j in list(df.index):
            row = int(j)
            other_values = submit.loc[row][1:].values.astype(np.float64)
            lstm_values = lstm_sub.loc[row][1:].values
            lstm_sub.iloc[row, 1:] = np.round((other_values + lstm_values).astype(np.float64) / 2)
    filename = lstm_out.split('.')[0]
    lstm_sub.to_csv(f'./{filename}avg.csv', index=False)

In [17]:
import pandas as pd
import numpy as np

# Weighted ensemble of the three blended submissions, followed by replacing
# every forecast day with the (truncated) row mean over all forecast days.
csv_paths = ["./biLSbyshopping_28_44_avg.csv","./biLSbyshopping_28_93779555_avg.csv", "./biLSbyshopping_28_43_avg.csv"]

csv_weights = [0.3334, 0.3333, 0.3333]

csv_dfs = [pd.read_csv(path).set_index('ID') for path in csv_paths]

columns_21_days = csv_dfs[0].columns

# Weighted sum of the frames, rounded to whole units.
ensemble_values = sum(csv_df * weight for csv_df, weight in zip(csv_dfs, csv_weights))

ensemble_df = pd.DataFrame(index=csv_dfs[0].index)
ensemble_df['ID'] = csv_dfs[0].index
ensemble_df[columns_21_days] = np.round(ensemble_values.values).astype(int)
ensemble_df = ensemble_df.reset_index(drop=True)

# The row mean does not depend on the column being written, so compute it once
# instead of once per column. astype(int) truncates the fractional part.
row_mean = ensemble_df[columns_21_days].mean(axis=1).astype(int)
ensemble_avg_df = ensemble_df.copy()
for col in columns_21_days:
    ensemble_avg_df[col] = row_mean


ensemble_avg_df.reset_index(drop=True).to_csv("./REAL_FINAL_SUBMIT.csv", index=False)