In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), exports ``PYTHONHASHSEED`` and switches
    cuDNN to repeatable behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Stored as a string because environment variables only hold text.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every CUDA device instead of only the current one,
    # and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may choose different kernels from run to run, so it
    # is turned off to keep results repeatable.
    torch.backends.cudnn.benchmark = False

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [2]:
# Read 'buying_concat.xlsx' into a DataFrame; the trailing expression shows
# its (rows, columns) shape.
buying_concat = pd.read_excel('buying_concat.xlsx')
buying_concat.shape

(14767, 9)

In [3]:
# Read 'review_drop_dup.xlsx'; its '리뷰' column is the text that gets
# classified further down. The trailing expression shows the shape.
test_ft = pd.read_excel('review_drop_dup.xlsx')
test_ft.shape

(44356, 7)

In [4]:
# Read 'zigzag_clothes.xlsx' into a DataFrame and show its shape.
train_ft = pd.read_excel('zigzag_clothes.xlsx')
train_ft.shape

(7193, 24)

In [5]:
# Pull the '리뷰' column out of train_ft as a NumPy array and show its shape.
# NOTE(review): train_arr is not read again in the cells visible here.
train_arr = train_ft['리뷰'].to_numpy()
train_arr.shape

(7193,)

In [6]:
from transformers import AutoTokenizer, AutoModel

In [7]:
# Checkpoint identifier handed to AutoModel / AutoTokenizer.from_pretrained.
model_name = 'kykim/bert-kor-base'

In [8]:
# Load the pretrained encoder named by model_name.
# NOTE(review): the functions visible in this notebook either take a model
# argument or build their own, so this module-level `model` looks unused.
model = AutoModel.from_pretrained(model_name)

  return self.fget.__get__(instance, owner)()


In [9]:
# Tokenizer matching model_name; Review_classification reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 클래스 정리

In [10]:
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review text per item.

    Every item is a dict with key ``"x"`` (the tokenizer output with each
    field converted to a tensor) and, when labels were supplied, key ``"y"``
    (the label as a float32 tensor).
    """

    def __init__(self, tokenizer, x, y=None):
        """Store the tokenizer, the texts and the optional labels.

        Args:
            tokenizer: Callable invoked as
                ``tokenizer(text, padding="max_length", truncation=True)``;
                it must return a mutable mapping of field name to values.
            x: Indexable collection of review texts.
            y: Optional indexable collection of labels aligned with ``x``.
        """
        self.tokenizer = tokenizer
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of review texts."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the tokenized text (and label, if any) at position ``idx``."""
        item = {}
        item["x"] = self.get_tokenizer(self.x[idx])
        if self.y is not None:
            # torch.Tensor(n) with an integer n allocates an uninitialised
            # tensor of n elements instead of holding the value n, so the
            # label is converted by value; array labels still become float32.
            item["y"] = torch.as_tensor(self.y[idx], dtype=torch.float32)
        return item

    def get_tokenizer(self, text):
        """Tokenize ``text`` and convert every returned field to a tensor.

        The tokenizer's own return object is updated in place and returned,
        so its mapping type is preserved for the caller.
        """
        x = self.tokenizer(text, padding="max_length", truncation=True)
        for k, v in x.items():
            x[k] = torch.tensor(v)
        return x

In [11]:
# Model for deciding which keywords a review contains.
class Net(torch.nn.Module):
    """Pretrained encoder followed by a linear layer with 7 outputs."""

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and attach the output layer."""
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.pre_model = encoder
        self.fc_out = torch.nn.Linear(encoder.config.hidden_size, 7)

    def forward(self, x):
        """Return 7 raw logits per sample; ``x`` is unpacked into the encoder."""
        outputs = self.pre_model(**x)
        # outputs[0]: hidden output at every position -> batch, seq, features
        # outputs[1]: hidden output for the CLS token -> batch, features
        pooled = outputs[1]
        return self.fc_out(pooled)

In [12]:
# Model that predicts positive / negative sentiment for a single keyword.
class Net_emotion(torch.nn.Module):
    """Pretrained encoder followed by a linear layer with a single output."""

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and attach the 1-unit head."""
        super().__init__()
        self.pre_model = AutoModel.from_pretrained(model_name)
        hidden_dim = self.pre_model.config.hidden_size
        self.fc_out = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one raw logit per sample; ``x`` is unpacked into the encoder."""
        encoded = self.pre_model(**x)
        # encoded[0]: hidden output at every position -> batch, seq, features
        # encoded[1]: hidden output for the CLS token -> batch, features
        cls_features = encoded[1]
        logit = self.fc_out(cls_features)
        return logit

In [13]:
# Runs prediction with a model whose weights have already been loaded.
@torch.no_grad()
def test_loop(dataloader, model, device):
    """Run ``model`` over ``dataloader`` and return sigmoid probabilities.

    Shows a tqdm progress bar over the batches.

    Args:
        dataloader: Iterable of batches; ``batch["x"]`` must support ``.to``.
        model: Callable module producing logits; switched to eval mode here.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The per-batch probabilities joined along the first axis as an ndarray.
    """
    model.eval()  # evaluation mode
    to_prob = torch.nn.Sigmoid()
    chunks = []
    for batch in tqdm(dataloader):
        logits = model(batch["x"].to(device))
        # logits -> probabilities, then move to the CPU and convert to ndarray
        chunks.append(to_prob(logits).to("cpu").numpy())

    return np.concatenate(chunks)

In [14]:
# 모델의 가중치를 받아온 뒤 예측을 진행
@torch.no_grad()
def test_loop_emotion(dataloader, model, device):
    pred_list = []
    act_func = torch.nn.Sigmoid()
    model.eval() # 평가 모드
    for batch in dataloader:
        pred = model( batch["x"].to(device) )
        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    return pred

# 키워드 존재 여부 판별 & 키워드별 긍/부정 예측 모델 

In [15]:
def Review_classification(review):
    """Label every review with a 0 / 1 / -1 flag for each of seven aspects.

    Stage 1 runs the 7-output keyword model (``Net`` with ``model_0.pth``)
    and marks an aspect with 1 when its sigmoid probability exceeds 0.5.
    Stage 2 runs, per aspect, the matching ``Net_emotion`` checkpoint on the
    reviews marked in stage 1 and replaces the mark with 1 (probability above
    0.5) or -1 (otherwise).

    Reads the notebook-level names ``tokenizer``, ``model_name`` and
    ``device``.

    Args:
        review: ``pd.Series`` of review texts, or any sequence (e.g. a list)
            that ``pd.Series`` accepts.

    Returns:
        DataFrame holding the reviews (index reset) followed by the columns
        '색감', '핏', '재질', '퀄리티', '제품상태', '가격', '두께'. Each value
        is 0 (aspect not detected), 1 or -1 (presumably positive / negative;
        confirm against the labels the checkpoints were trained on).
    """
    # (output column, sentiment checkpoint). The order has to follow the
    # 7 outputs of the keyword model.
    aspects = (
        ('색감', 'color.pth'),
        ('핏', 'fit.pth'),
        ('재질', 'texture.pth'),
        ('퀄리티', 'quality.pth'),
        ('제품상태', 'status.pth'),
        ('가격', 'price.pth'),
        ('두께', 'thickness.pth'),
    )
    columns = [name for name, _ in aspects]

    # A Series is used directly; anything else (a list, for instance) is
    # wrapped in a Series first.
    if isinstance(review, pd.Series):
        test_arr = review.to_numpy()
    else:
        test_arr = pd.Series(review).to_numpy()
    review_df = pd.DataFrame(review).reset_index(drop=True)

    # Nothing to classify: return an empty, correctly-shaped frame rather
    # than failing when the batch outputs are concatenated.
    if len(test_arr) == 0:
        empty = pd.DataFrame(np.empty((0, len(columns)), dtype=int), columns=columns)
        return pd.concat([review_df, empty], axis=1)

    # Stage 1: which aspects does each review mention?
    keyword_dl = torch.utils.data.DataLoader(
        ReviewDataset(tokenizer, test_arr), batch_size=8, shuffle=False
    )
    keyword_model = Net(model_name).to(device)
    # map_location lets checkpoints saved on a GPU load on a CPU-only host.
    keyword_model.load_state_dict(torch.load("model_0.pth", map_location=device))
    pred = (test_loop(keyword_dl, keyword_model, device) > 0.5).astype(int)
    del keyword_model  # release the encoder before the next one is built

    # Stage 2: each checkpoint is loaded once and applied, in batches, to
    # the reviews that mention its aspect, instead of being reloaded for
    # every single review.
    emotion_model = Net_emotion(model_name).to(device)
    for col, (_, ckpt_path) in enumerate(tqdm(aspects)):
        rows = np.flatnonzero(pred[:, col] == 1)
        if rows.size == 0:
            continue
        emotion_model.load_state_dict(torch.load(ckpt_path, map_location=device))
        aspect_dl = torch.utils.data.DataLoader(
            ReviewDataset(tokenizer, test_arr[rows]), batch_size=8, shuffle=False
        )
        prob = test_loop_emotion(aspect_dl, emotion_model, device)
        # The model emits one probability per review: keep 1 above 0.5,
        # otherwise flip the mark to -1.
        pred[rows, col] = np.where(prob[:, 0] > 0.5, 1, -1)

    pred = pd.DataFrame(pred, columns=columns)
    return pd.concat([review_df, pred], axis=1)

In [16]:
# Classify every review in test_ft['리뷰']; the result holds the review text
# plus the seven label columns. The trailing expression displays the frame.
pred = Review_classification(test_ft['리뷰'])
pred

  0%|          | 0/5545 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


  0%|          | 0/44356 [00:00<?, ?it/s]

Unnamed: 0,리뷰,색감,핏,재질,퀄리티,제품상태,가격,두께
0,스트라이프 나시는 처음 사봤는데 살짝 포인트로 입기 너무 좋고 슬앤 특유의 모던한 ...,0,-1,1,0,0,0,1
1,처음 입어보는 민소매탑이라서 너무 튀지 않고 어디든 어울리는 무난템 골랐어용 줄무늬...,0,0,0,0,0,1,-1
2,이거 두께감이 꽤 있어요 그만큼 탄탄해서 여름엔 좀 덥지만 맘에 듭니다!! 부유방 ...,0,1,0,1,0,0,1
3,나시와 셋뚜세뚜로 구입했어요 셔츠는 옷걸이에 걸어서 겉에 비닐 씌워서 보내주셨는데 ...,1,1,0,1,0,0,0
4,너무 맘에들어요! 길이가 좀 짧긴 한데 고개 숙여도 넥 부분 안쳐지고 딱 붙어있어서...,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
44351,키 168에 60인데 커요 박스형 찾으시면 괜찮을듯\n저는 살짝 루즈핏에 여리여리 ...,0,-1,0,0,0,0,0
44352,사이즈가 확실히 커서 좋아요\n얇아서 초여름까지입고 실내활동만할때는 한여름에 가디건...,0,1,0,0,0,0,0
44353,색상 변경 문의드렸는데 친절하게 응대해주셔서 바꾼 색상으로 잘 받을 수 있었어요!!...,1,0,0,0,0,0,0
44354,퀄리티는 괜찮은데 모델 착샷보다 좀 많이 오버핏이고 색깔이 화면이랑은 좀 달라용 ㅠ,-1,-1,0,1,0,0,0


In [19]:
# Remove the 'Unnamed: 0' column (presumably an index column saved with the
# spreadsheet). Rebinding with errors='ignore' keeps this cell safe to
# re-run: the in-place drop raised KeyError once the column was gone.
test_ft = test_ft.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [22]:
# Put the original review table and the predictions side by side (joined on
# the row index) and show the resulting shape.
# NOTE(review): at this point pred still carries its own '리뷰' column, so the
# combined frame has two columns with that name - confirm this is intended.
review_44356 = pd.concat([test_ft,pred],axis = 1).copy()
review_44356.shape

(44356, 14)

In [24]:
# Keep only the seven label columns, dropping the review text from pred.
pred = pred.loc[:,['색감','핏','재질','퀄리티','제품상태','가격','두께']].copy()

In [26]:
# Attach the label columns to buying_concat, joined on the row index.
# NOTE(review): the outputs above show 14767 rows in buying_concat but 44356
# predictions, so rows without a partner are filled with NaN - confirm that
# these two tables really line up row by row.
Demo_clothes = pd.concat([buying_concat,pred],axis = 1).copy()

In [23]:
# Write the combined review / prediction table to 'review_44356.xlsx',
# leaving the row index out of the file.
review_44356.to_excel('review_44356.xlsx', index=False)