In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
# 1달치 데이터 가져옴
item_list = glob.glob('../rawdata/2014-01-01*')
for idx, p in enumerate(item_list):
    df = pd.read_csv(p, encoding = 'cp949')
    if idx == 0:
        all_df = df
    else:
        all_df = pd.concat([all_df, df])

In [3]:
all_df.shape

(83457, 8)

In [7]:
item_info = pd.read_csv('../rawdata/item_info.csv')
item_dict =item_info.set_index('pum_id').to_dict()['pum_name']
all_df['pum_name'] =all_df['pum_id'].map(item_dict)

In [8]:
# 총 73종류
all_df.pum_name.value_counts()

pum_name
회화용구     2531
아동화      2338
즉석식품     2022
방향제      1933
스낵과자     1875
         ... 
구강세정제      75
김          75
청바지        36
탄산음료       18
기초화장품       2
Name: count, Length: 73, dtype: int64

In [9]:
# 구조 : good_name 을 활용해 pum_name 을 분류

all_df.head()

Unnamed: 0,collect_day,good_id,pum_id,pum_name,good_name,sales_price,discount_price,benifit
0,2014-01-01,3740.0,E061100,습기제거제,알뜰상품_제습제_4입,4650.0,,
1,2014-01-01,6847.0,L012040,치약,비손_예스미백치약_120G,770.0,,
2,2014-01-01,156664.0,L012080,화장지,해피트리_3겹천연펄프화장지_35M*12R(S),5800.0,,
3,2014-01-01,269074.0,L012180,,리스테린 쿨 민트/후레쉬버스트/내추럴 시트러스/티쓰앤껌 750mlx2개 초특가,7800.0,,
4,2014-01-01,356548.0,C021010,아동화,Allie Ballet Flat,57140.0,,


In [10]:
idx2pum = {idx: pum_id for idx, pum_id in enumerate(all_df['pum_id'].unique())}
pum2idx = {v : k for k, v in idx2pum.items()}

In [11]:
all_df['pid'] = all_df['pum_id'].map(pum2idx)

In [12]:
rawdata = all_df[['good_name','pid']]

In [13]:
rawdata.head()

Unnamed: 0,good_name,pid
0,알뜰상품_제습제_4입,0
1,비손_예스미백치약_120G,1
2,해피트리_3겹천연펄프화장지_35M*12R(S),2
3,리스테린 쿨 민트/후레쉬버스트/내추럴 시트러스/티쓰앤껌 750mlx2개 초특가,3
4,Allie Ballet Flat,4


# 기본 언어모델을 활용해 분류 baseline 을 생성하기

In [16]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import torch.optim as optim

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\jinseulpark\AppData\Local\Temp\pip-req-build-2ji_aal1'


Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to c:\users\jinseulpark\appdata\local\temp\pip-req-build-2ji_aal1
  Resolved https://github.com/huggingface/transformers to commit ac974199c850cc392ba94ceeb4cb8801efc3311a
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml): started
  Building wheel for transformers (pyproject.toml): finished with status 'done'
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-none-any.whl size=8271141 sha256=d873ec51a

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
name = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=len(idx2pum))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
class mydataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        token = tokenizer(self.X[idx], padding = 'max_length', max_length = 96, truncation = True, return_tensors = 'pt') 
        label = torch.tensor(self.y[idx])
        return {'feature': token, 'label': label}

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test =train_test_split(rawdata['good_name'].values, rawdata['pid'].values, test_size = 0.3, random_state = 1205)

In [22]:
X_train_ds = mydataset(X_train, y_train)
X_test_ds = mydataset(X_test, y_test)

In [29]:
train_dl = DataLoader(X_train_ds, batch_size = 16)
test_dl = DataLoader(X_test_ds, batch_size = 16)

In [30]:
optimizer = optim.AdamW(model.parameters(), lr = 1e-5)
loss_fn = nn.CrossEntropyLoss()
next(iter(train_dl))

{'feature': {'input_ids': tensor([[[  101,  8987, 37114,  ...,     0,     0,     0]],
 
         [[  101, 39744, 11490,  ...,     0,     0,     0]],
 
         [[  101,   164,  8892,  ...,     0,     0,     0]],
 
         ...,
 
         [[  101,  9786, 17342,  ...,     0,     0,     0]],
 
         [[  101, 17957,   106,  ...,     0,     0,     0]],
 
         [[  101,  9344, 45725,  ...,     0,     0,     0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         ...,
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         ...,
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]])},
 'label': tensor([74,  4, 26, 57, 58, 17, 20, 

In [39]:
EPOCH = 10
for e in range(EPOCH):
    for batch in train_dl:
        # forward
        input_batch = {k : v for k, v in batch['feature'].items() if k !='token_type_ids'}
        output = model(**input_batch)
        loss = loss_fn(output, batch['label'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

ValueError: too many values to unpack (expected 2)

In [38]:
{k : v for k, v in batch['feature'].items() if k !='token_type_ids'}

{'input_ids': tensor([[[  101,  8987, 37114,  ...,     0,     0,     0]],
 
         [[  101, 39744, 11490,  ...,     0,     0,     0]],
 
         [[  101,   164,  8892,  ...,     0,     0,     0]],
 
         ...,
 
         [[  101,  9786, 17342,  ...,     0,     0,     0]],
 
         [[  101, 17957,   106,  ...,     0,     0,     0]],
 
         [[  101,  9344, 45725,  ...,     0,     0,     0]]]),
 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         ...,
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]])}