In [1]:
import random
import os
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [3]:
train = pd.read_csv('../../DATA/train.csv')
test  = pd.read_csv('../../DATA/test.csv')

In [4]:
def get_fix_makecols(data) :
    
    new_column_names = {
        'corporation': 'corp',
        'location': 'loc',
        'supply(kg)': 'supply',
        'price(원/kg)': 'price',
    }
    data = data.rename(columns=new_column_names)
    data['timestamp']  = pd.to_datetime(data['timestamp'])
    data['newitem'] = data['item'].str.cat([data['corp'], data['loc']], sep=' ')
    data['year'] = data['timestamp'].dt.year
    data['month'] = data['timestamp'].dt.month
    data['day'] = data['timestamp'].dt.day
    data['weekday'] = data['timestamp'].dt.weekday
    data['weekofyear'] = data['timestamp'].dt.isocalendar().week
    
    return data 

train= get_fix_makecols(train)
test = get_fix_makecols(test)

In [5]:
# 미리 정규화 하고 평균 분산은 따로 저장하기 
mean_by_item_s = train.groupby('newitem')['supply'].transform('mean')
std_by_item_s = train.groupby('newitem')['supply'].transform('std')

train['mean_by_item_s'] = mean_by_item_s
train['std_by_item_s'] = std_by_item_s
train['supply'] = (train['supply']-train['mean_by_item_s']) /train['std_by_item_s']

mean_by_item = train.groupby('newitem')['price'].transform('mean')
std_by_item = train.groupby('newitem')['price'].transform('std')

train['mean_by_item'] = mean_by_item
train['std_by_item'] = std_by_item
train['price'] = (train['price']-train['mean_by_item']) /train['std_by_item']


mean_std_by_item = train[['mean_by_item', 'std_by_item','newitem']].drop_duplicates()
mean_std_by_item_s = train[['mean_by_item_s', 'std_by_item_s','newitem']].drop_duplicates()
train.drop(columns=['mean_by_item','std_by_item','mean_by_item_s','std_by_item_s'], inplace=True)
cols = train.columns[-6:]  
train = train[cols.tolist() + train.columns[:-6].tolist()]  # 열 순서 변경

In [6]:
#인코딩을 해주고 열 제거 및 추가 
def remake_train(train ) :
    
    le = LabelEncoder()
    for col in ['item', 'corp', 'loc']:
        train[col] = le.fit_transform(train[col])
    train['newitem']  = train.index // 1523 
    
    train_supply = train[['ID', 'timestamp', 'supply']]
    train = train.drop(columns='supply')

    train = train.drop(columns= ['ID', 'timestamp'])
    train['means'] = 0 
    train['stds'] = 0 
    return train , train_supply

train , train_supply = remake_train(train )
        

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [7]:
#이전에 살필 날짜만큼의 이동평균과 분산을 각각 아이디마다 구하고 이를 기존
newtrain = train.copy() 
def make_rolling_data(data , windowsize) :
    means= data.loc[:, 'price'].rolling(window= windowsize).mean().values.flatten()
    stds = data.loc[:, 'price'].rolling(window= windowsize).std().values.flatten()
    
    data.loc[: ,'means' ] = means 
    data.loc[:, 'stds'] = stds 
    return data 

for i in range(39) :
    newdata = newtrain[newtrain['newitem']==i]
    newtrain.loc[i*1523: (i+1)*1523 , : ]  = make_rolling_data(newdata, 64 )

newtrain['supply'] = train_supply['supply']
cols = newtrain.columns.tolist()  # 현재 열의 순서를 리스트로 변환
cols.remove('price')  # 'price'를 리스트에서 제거
cols.append('price')  # 'price'를 리스트의 마지막에 추가
newtrain = newtrain[cols]  # 변경한 열 순서로 데이터프레임 재정렬


newtrain['year'] = newtrain['year']-2019 
train = newtrain.fillna(0)

    

         nan]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  newtrain.loc[i*1523: (i+1)*1523 , : ]  = make_rolling_data(newdata, 64 )
  newtrain.loc[i*1523: (i+1)*1523 , : ]  = make_rolling_data(newdata, 64 )


In [8]:
train

Unnamed: 0,newitem,year,month,day,weekday,weekofyear,item,corp,loc,means,stds,supply,price
0,0.0,0.0,1.0,1.0,1.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
1,0.0,0.0,1.0,2.0,2.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
2,0.0,0.0,1.0,3.0,3.0,1,4.0,0.0,0.0,0.000000,0.000000,2.982703,-0.288716
3,0.0,0.0,1.0,4.0,4.0,1,4.0,0.0,0.0,0.000000,0.000000,0.930152,-0.398636
4,0.0,0.0,1.0,5.0,5.0,1,4.0,0.0,0.0,0.000000,0.000000,1.354026,-0.452909
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,38.0,4.0,2.0,27.0,0.0,9,3.0,5.0,0.0,0.530967,0.749880,1.121287,0.721362
59393,38.0,4.0,2.0,28.0,1.0,9,3.0,5.0,0.0,0.527995,0.747845,0.992953,0.939204
59394,38.0,4.0,3.0,1.0,2.0,9,3.0,5.0,0.0,0.528806,0.748432,0.828637,1.087890
59395,38.0,4.0,3.0,2.0,3.0,9,3.0,5.0,0.0,0.525024,0.745832,1.225691,0.911542


In [9]:
train

Unnamed: 0,newitem,year,month,day,weekday,weekofyear,item,corp,loc,means,stds,supply,price
0,0.0,0.0,1.0,1.0,1.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
1,0.0,0.0,1.0,2.0,2.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
2,0.0,0.0,1.0,3.0,3.0,1,4.0,0.0,0.0,0.000000,0.000000,2.982703,-0.288716
3,0.0,0.0,1.0,4.0,4.0,1,4.0,0.0,0.0,0.000000,0.000000,0.930152,-0.398636
4,0.0,0.0,1.0,5.0,5.0,1,4.0,0.0,0.0,0.000000,0.000000,1.354026,-0.452909
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,38.0,4.0,2.0,27.0,0.0,9,3.0,5.0,0.0,0.530967,0.749880,1.121287,0.721362
59393,38.0,4.0,2.0,28.0,1.0,9,3.0,5.0,0.0,0.527995,0.747845,0.992953,0.939204
59394,38.0,4.0,3.0,1.0,2.0,9,3.0,5.0,0.0,0.528806,0.748432,0.828637,1.087890
59395,38.0,4.0,3.0,2.0,3.0,9,3.0,5.0,0.0,0.525024,0.745832,1.225691,0.911542


# 새로운 Train 데이터 완성 

In [10]:
# #가격 시계열 price_data 생성하기 

# def before_timeseries_data(train ):
  
#     # 날짜 범위 생성
#     start_date = datetime(2019, 1, 1).date()
#     end_date =datetime(2023, 3, 3).date()
#     dates = pd.date_range(start_date, end_date, freq='D')

#     # 데이터프레임 생성
#     price_data = pd.DataFrame(columns=['newitem' ,'item', 'corp', 'loc' ]+ dates.strftime('%Y-%m-%d').tolist())
#     # price_data['item'] = unique_values
#     price_data['item'] = train['item']
#     price_data['corp'] = train['corp']
#     price_data['loc'] = train['loc']
#     price_data['newitem'] = train['newitem']
    
#     # print(len(price_data)) 39 
#     for i in range(39) :
#         price_data.iloc[i,4:] = train['price'][i*1523 : (i+1)*1523 ]
#     return price_data

# price_data = before_timeseries_data(train, traincols)

NameError: name 'traincols' is not defined

In [11]:
lookback , forcast=64,28
fullwindow = forcast+lookback
fortrain =[]
fortest = []

for i in range(0, 39) :
    train_newitem = train.iloc[i*1523:(i+1)*1523 , :]
    test_newitem = train.iloc[i*1523:(i+1)*1523 , -1:]
    # print(train_newitem.tail(3)     
    for j in range(0,len(train_newitem)-fullwindow+1):
        x = train_newitem[j: lookback+j ] 
        y = test_newitem[j+lookback :j+lookback+ forcast ]
        fortrain.append(x)
        fortest.append(y)
fortrain, fortest = np.array(fortrain, dtype='float32'), np.array(fortest, dtype='float32')
        
        

In [None]:
fortest[0]

array([[ 0.58721155],
       [ 0.54839593],
       [ 0.385233  ],
       [ 0.77957207],
       [-0.8822854 ],
       [ 0.566258  ],
       [ 0.45874223],
       [ 0.383859  ],
       [ 0.3378299 ],
       [ 0.34504342],
       [ 0.33405137],
       [-0.8822854 ],
       [ 0.33027288],
       [ 0.27599972],
       [ 0.35740945],
       [ 0.47832176],
       [ 0.755527  ],
       [ 0.9753676 ],
       [-0.8822854 ],
       [ 0.926247  ],
       [ 0.30038828],
       [ 0.37149298],
       [ 1.2525729 ],
       [-0.8822854 ],
       [ 0.757588  ],
       [-0.8822854 ],
       [ 2.1175082 ],
       [ 0.4491242 ]], dtype=float32)

In [12]:
fortest = np.squeeze(fortest , axis = 2)


In [None]:
fortrain.shape, fortest.shape

((55848, 64, 13), (55848, 28))

In [13]:
# Define the window size, forecast size, and batch size
window_size = 64  
forecast_size = 28
batch_size = 2048
epoch_count = 777
lr = 0.001
min_delta = 0.0015
patience = 10

# Model Train


In [14]:
#Model 정의 및 사용할 class 
class LTSF_NLinear(torch.nn.Module):
    def __init__(self, window_size, forecast_size, individual, feature_size):
        super(LTSF_NLinear, self).__init__()
        self.window_size = window_size
        self.forecast_size = forecast_size
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forecast_size))
        else:
            self.Linear = torch.nn.Linear(self.window_size, self.forecast_size)

    def forward(self, x):
        seq_last = x[:,-1:,:].detach()
        x = x - seq_last
        if self.individual:
            output = torch.zeros([x.size(0), self.forecast_size, x.size(2)],dtype=x.dtype).to(x.device)
            for i in range(self.channels):
                output[:,:,i] = self.Linear[i](x[:,:,i])
            x = output
        else:
            x = self.Linear(x.permute(0,2,1)).permute(0,2,1)
        x = x + seq_last
        return x

class Data(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
    

In [15]:

def create_dataloader(fortrain, fortest ,batchsize):
    print(f"{fortrain.shape} , {fortest.shape}")
    ds = Data(fortrain, fortest)
    return DataLoader(ds, batch_size=batchsize , shuffle= True)

In [None]:
train

Unnamed: 0,newitem,year,month,day,weekday,weekofyear,item,corp,loc,means,stds,supply,price
0,0.0,0.0,1.0,1.0,1.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
1,0.0,0.0,1.0,2.0,2.0,1,4.0,0.0,0.0,0.000000,0.000000,-0.511206,-0.882285
2,0.0,0.0,1.0,3.0,3.0,1,4.0,0.0,0.0,0.000000,0.000000,2.982703,-0.288716
3,0.0,0.0,1.0,4.0,4.0,1,4.0,0.0,0.0,0.000000,0.000000,0.930152,-0.398636
4,0.0,0.0,1.0,5.0,5.0,1,4.0,0.0,0.0,0.000000,0.000000,1.354026,-0.452909
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,38.0,4.0,2.0,27.0,0.0,9,3.0,5.0,0.0,0.530967,0.749880,1.121287,0.721362
59393,38.0,4.0,2.0,28.0,1.0,9,3.0,5.0,0.0,0.527995,0.747845,0.992953,0.939204
59394,38.0,4.0,3.0,1.0,2.0,9,3.0,5.0,0.0,0.528806,0.748432,0.828637,1.087890
59395,38.0,4.0,3.0,2.0,3.0,9,3.0,5.0,0.0,0.525024,0.745832,1.225691,0.911542


In [None]:
item0_train = train.iloc[0: 1523 , : ]
fortesting = item0_train.iloc[ -64:, : ]
fortestingarr = np.array(fortesting, dtype='float32')

In [16]:
future_predicts_by_id = {}
loss_history = {}
for idx in range(39) :
    print(f"{idx}번 아이템 학습 시작")
    individual_loss_history=[]
    train_d =train.iloc[idx*1523:(idx+1)*1523, :]
    forpredict =np.array( train_d.iloc[-window_size: , :],dtype='float32')
    if idx ==0 :
        
        train_dl = create_dataloader(fortrain[idx*1432:(idx+1)*1432] , fortest[idx*1432:(idx+1)*1432] , 2048 )
        print(train_dl)
    best_loss = float('inf')
    no_improvement_count = 0
    DLinear_model = LTSF_NLinear(window_size=64, forecast_size=28, individual=False, feature_size=12)
    DLinear_model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
    for epoch in range(1, epoch_count+1) :
        loss_list=[]
        DLinear_model.train()
        for batch_idx, (data, target) in enumerate(train_dl):
            if batch_idx ==0 and epoch ==1:
                print(data.shape, len(target))
            data, target = data.to(device), target.to(device)  # Move the data to the GPU if available
            optimizer.zero_grad()
            output = DLinear_model(data)
            loss = criterion(output, target.unsqueeze(-1))
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())
        if((epoch % 10) == 0):
                avg_loss = np.mean(loss_list)
                # print(f"Id {idx}, Epoch {epoch}: Loss = {avg_loss}, {id_val[:2]}")
                individual_loss_history.append(avg_loss)
                if avg_loss + min_delta < best_loss:
                    best_loss = avg_loss
                    no_improvement_count = 0
                else:
                    no_improvement_count += 1
                    if no_improvement_count >= patience:
                        print(f"Early stopping at epoch {epoch} for ID {id_val} {idx} {avg_loss}")
                        weight = 0
                        weight_dict = { 
                                'TG' : 0.256,
                                'BC' : 0.23 ,
                                'RD' :0.205,
                                'CR' : 0.179,
                                'CB' : 0.128 
                                    }

                        # weight = weight_dict[id_val[:2]] 
    loss_history[idx] = individual_loss_history
    print(train_d[-window_size: ].shape )
    last_window_data = torch.tensor(forpredict).unsqueeze(0).unsqueeze(-1).float().to(device)
    print(train_d[-window_size: ].shape )
    future_prediction = DLinear_model(last_window_data)
    future_predicts_by_id[idx] = future_prediction 
    

0번 아이템 학습 시작
(1432, 64, 13) , (1432, 28)
<torch.utils.data.dataloader.DataLoader object at 0x7ff4d1f28520>
torch.Size([1432, 64, 13]) 1432


  return F.mse_loss(input, target, reduction=self.reduction)


(64, 13)
(64, 13)


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 4 is not equal to len(dims) = 3

In [None]:

future_predictions_by_id = {}
loss_history = {}
# Iterate through the data by ID
for idx, (id_val, data) in tqdm(enumerate(zip(price_df["item"], time_series_data)), total=len(price_df["item"])):
    # mean_ = np.mean(data)
    # std_ = np.std(data)
    # if std_ == 0:
    #     standardized_data = data
    # else:
    #     standardized_data = (data - mean_) / std_

    individual_loss_history = []
    # Create DataLoader
    # print(mean_, std_)
    train_dl = create_dataloader(data, window_size, forecast_size, batch_size)
    # print(train_dl)
    best_loss = float('inf')
    no_improvement_count = 0

    # # Training the model
    # DLinear_model = LTSF_NLinear(window_size=window_size, forecast_size=28, individual=False, feature_size=1)
    # DLinear_model.to(device)
    # criterion = nn.MSELoss()
    # optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
    # for epoch in range(1, epoch_count + 1):
    #     loss_list = []
    #     DLinear_model.train()
    #     for batch_idx, (data, target) in enumerate(train_dl):
    #         data, target = data.to(device), target.to(device)  # Move the data to the GPU if available
    #         optimizer.zero_grad()
    #         output = DLinear_model(data)
    #         loss = criterion(output, target.unsqueeze(-1))
    #         loss.backward()
    #         optimizer.step()
    #         loss_list.append(loss.item())
    #     if((epoch % 10) == 0):
    #         avg_loss = np.mean(loss_list)
    #         #print(f"Id {idx}, Epoch {epoch}: Loss = {avg_loss}")
    #         individual_loss_history.append(avg_loss)
    #         if avg_loss + min_delta < best_loss:
    #             best_loss = avg_loss
    #             no_improvement_count = 0
    #         else:
    #             no_improvement_count += 1
    #             if no_improvement_count >= patience:
    #                 #print(f"Early stopping at epoch {epoch} for ID {id_val}")
    #                 break


    # loss_history[id_val] = individual_loss_history

    # last_window_data = torch.tensor(standardized_data[-window_size:]).unsqueeze(0).unsqueeze(-1).float().to(device)
    # future_prediction = DLinear_model(last_window_data)

    # if std_ == 0:
    #     future_prediction = future_prediction.squeeze().detach().cpu().numpy()
    # else:
    #     future_prediction = future_prediction.squeeze().detach().cpu().numpy() * std_ + mean_

    # # Store the prediction
    # future_predictions_by_id[id_val] = future_prediction

# Future predictions for each ID from 2023-04-05 to 2023-04-25
# future_predictions_by_id

  0%|          | 0/39 [00:00<?, ?it/s]

2568.509520682863 2910.244395516674
(1432, 64, 1) (1432, 28)
3655.7866053841103 2600.043618964224
(1432, 64, 1) (1432, 28)
3317.2534471437953 3040.442394695632
(1432, 64, 1) (1432, 28)
3241.6565988181223 2392.0935345983767
(1432, 64, 1) (1432, 28)
4107.093237032173 3520.839012849921
(1432, 64, 1) (1432, 28)
3469.5725541694023 2619.3426060636084
(1432, 64, 1) (1432, 28)
2813.63624425476 2761.234041694644
(1432, 64, 1) (1432, 28)
3298.8417596848326 2602.0677477592976
(1432, 64, 1) (1432, 28)
2226.198949441891 2524.610869259018
(1432, 64, 1) (1432, 28)
3075.871306631648 2414.7798400992524
(1432, 64, 1) (1432, 28)
622.1923834537098 1036.7549544653434
(1432, 64, 1) (1432, 28)
173.2140512147078 482.8254009881138
(1432, 64, 1) (1432, 28)
278.5134602757715 567.8918800633913
(1432, 64, 1) (1432, 28)
422.2783978988838 667.7002277932821
(1432, 64, 1) (1432, 28)
5.896257386736704 91.62926532713632
(1432, 64, 1) (1432, 28)
391.18187787261985 628.7809374088395
(1432, 64, 1) (1432, 28)
18.55613919894

In [None]:
submit = pd.read_csv('./../../DATA/sample_submission.csv')
submit

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [None]:
findata = []
for id_val, predictions in future_predictions_by_id.items():
    rounded_predictions = np.round(predictions).astype(np.float64)
    findata.extend(rounded_predictions)

submit['answer'] = findata
submit.loc[submit['answer'] <= 0, 'answer'] = 0
submit.loc[submit.index % 7 == 1, 'answer'] = 0

In [None]:
submit

Unnamed: 0,ID,answer
0,TG_A_J_20230304,2743.0
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,2391.0
3,TG_A_J_20230307,2835.0
4,TG_A_J_20230308,2607.0
...,...,...
1087,RD_F_J_20230327,515.0
1088,RD_F_J_20230328,528.0
1089,RD_F_J_20230329,529.0
1090,RD_F_J_20230330,519.0


In [None]:
submit.to_csv('../../DATA/SUBMIT/Nlinear3.csv', index=False)

In [None]:
submit2= pd.read_csv('./../../DATA/SUBMIT/Dlinear1.csv')
submit2

Unnamed: 0,ID,answer
0,TG_A_J_20230304,2750.0
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,2548.0
3,TG_A_J_20230307,2915.0
4,TG_A_J_20230308,2700.0
...,...,...
1087,RD_F_J_20230327,363.0
1088,RD_F_J_20230328,378.0
1089,RD_F_J_20230329,403.0
1090,RD_F_J_20230330,362.0


In [None]:
sub2 = submit2
sub2['answer'] = sub2['answer']+submit['answer']

In [None]:
sub2['answer'] = sub2['answer']/2

In [None]:
sub2.to_csv('../../DATA/SUBMIT/D+Llinear.csv', index=False)

In [None]:
subb = pd.read_csv('../../DATA/SUBMIT/Nlinear2.csv')
subb.head(30)

Unnamed: 0,ID,answer
0,TG_A_J_20230304,2734.0
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,2491.0
3,TG_A_J_20230307,2870.0
4,TG_A_J_20230308,2593.0
5,TG_A_J_20230309,3121.0
6,TG_A_J_20230310,2818.0
7,TG_A_J_20230311,2602.0
8,TG_A_J_20230312,0.0
9,TG_A_J_20230313,2305.0


In [None]:
subb.to_csv('../../DATA/SUBMIT/Nlinear2_weekavg.csv', index=False)