In [17]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

device = 'cuda' if torch.cuda.is_available() else 'cpu'

SEED=42

In [18]:
# 데이터 불러오기 

train_ft = pd.read_excel('zigzag_clothes.xlsx')

train_ft['리뷰'] = train_ft['리뷰'].str.replace('\n','. ')
# 당장 분석에 필요없는 컬럼 제거 

drop_columns = ['중분류', '브랜드', '상품명', '가격', '리뷰수', '상품평점', '닉네임', '별점', '날짜', '키',
       '몸무게', '상의사이즈', '선택옵션', '사이즈평가', '퀄리티평가', '색감평가']

train_ft.drop(drop_columns,axis = 1,inplace = True)

train_ft.shape
train_ft.rename(columns={'가격.1':'가격'}, inplace=True)

In [19]:
train_ft.isnull().sum().sum()

0

In [20]:
from transformers import AutoTokenizer, AutoModel

model_name="lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [21]:
from sklearn.model_selection import train_test_split

data = train_ft['리뷰']
target=train_ft[['색감', '핏', '재질', '퀄리티', '제품상태', '가격', '두께']]

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.2, shuffle=True)

In [22]:
mapping = {-1: 0, 0: 1, 1: 2}

target_train = target_train.replace(mapping)
target_test = target_test.replace(mapping)

In [23]:
train_arr = data_train.to_numpy()
test_arr = target_train.to_numpy()

In [24]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, x, y=None):
        self.tokenizer = tokenizer
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        item = {}
        item["x"] = self.get_tokenizer(self.x[idx])
        if self.y is not None:
            item["y"] = torch.tensor(self.y[idx])
        return item
    def get_tokenizer(self, text):
        x = self.tokenizer(text, padding="max_length", truncation=True)
        for k, v in x.items():
            x[k] = torch.tensor(v)
        return x

dt = ReviewDataset(tokenizer, train_arr, test_arr)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': {'input_ids': tensor([[  101,  9801, 30873,  ...,     0,     0,     0],
         [  101, 10003, 47869,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 'y': tensor([[2, 2, 1, 1, 1, 1, 1],
         [1, 2, 1, 1, 1, 1, 1]])}

In [15]:
class Net(torch.nn.Module):
    def __init__(self, model_name, num_features=7):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.fc_out = torch.nn.Linear(self.model.config.hidden_size, 3)
    

    def forward(self, x):
        output = self.model(**x)
        output = self.fc_out(output[0])   #1d cnn하던가 ,gru?, (2,512,3)
        # 정답 쉐이프와 맞게 레이어 추가
        return torch.sum(output,dim=2)    # 회귀로


In [16]:
model = Net(model_name)
pred=model(batch["x"])
pred,batch['y'],pred.shape


(tensor([[-0.4920, -0.4917, -0.4111,  ..., -0.4676, -0.4563, -0.4434],
         [-0.3054, -0.3157, -0.2826,  ..., -0.2970, -0.2443, -0.2635]],
        grad_fn=<SumBackward1>),
 tensor([[2, 1, 2, 1, 1, 1, 1],
         [1, 0, 1, 1, 1, 1, 0]]),
 torch.Size([2, 512]))

In [25]:
class Net(torch.nn.Module):
    def __init__(self, model_name, num_features=7):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.fc_out = torch.nn.Linear(self.model.config.hidden_size, 3)
        self.fc_layers = torch.nn.ModuleList([self.fc_out for _ in range(num_features)]) # 배치 7 3

    def forward(self, x):
        outputs = []
        for fc in self.fc_layers:
            output = self.model(**x)
            output = fc(output[0][:, 0])
            outputs.append(output.unsqueeze(1)) 
        return torch.cat(outputs, dim=1) 


In [26]:
model = Net(model_name)
pred=model(batch["x"])
pred,batch['y'],pred.shape


(tensor([[[0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966],
          [0.0974, 0.2472, 0.4966]],
 
         [[0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211],
          [0.1234, 0.2891, 0.5211]]], grad_fn=<CatBackward0>),
 tensor([[2, 2, 1, 1, 1, 1, 1],
         [1, 2, 1, 1, 1, 1, 1]]),
 torch.Size([2, 7, 3]))

In [27]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train() # 학습 모드
    for batch in tqdm(dataloader):
        pred = model( batch["x"].to(device) )
        loss = loss_fn(pred.reshape(-1,3), batch['y'].flatten().to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

In [28]:
# loss_fn = torch.nn.CrossEntropyLoss()
# dt = ReviewDataset(tokenizer, train_arr, test_arr)
# dataloader = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# train_loop(dataloader, model, loss_fn, optimizer, device)

In [29]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    act_func = torch.nn.Softmax(dim=2)
    model.eval() # 평가 모드
    for batch in tqdm(dataloader):
        pred = model( batch["x"].to(device) )
        if batch.get("y") is not None:
            loss = loss_fn(pred.reshape(-1,3), batch['y'].flatten().to(device))
            epoch_loss += loss.item()

        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    
    return epoch_loss, pred

In [30]:
# loss_fn = torch.nn.CrossEntropyLoss()
# dt = ReviewDataset(tokenizer, train_arr, test_arr)
# dataloader = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# test_loop(dataloader, model, loss_fn, device)

In [31]:
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import accuracy_score
n_splits = 5

cv = StratifiedKFold(n_splits, shuffle=True, random_state=SEED)
cv = KFold(n_splits, shuffle=True, random_state=SEED)

batch_size = 2 # 배치 사이즈
loss_fn = torch.nn.CrossEntropyLoss() # 손실 객체
epochs = 100 # 최대 가능한 에폭수

In [32]:
# dt = ReviewDataset(tokenizer, train_arr, test_arr)
# dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
# batch = next(iter(dl))
# batch

In [33]:
from sklearn.metrics import f1_score

is_holdout = True
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []

for i, (tri, vai) in enumerate(cv.split(data_train,target_train)):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(tokenizer,train_arr[tri], test_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(tokenizer,train_arr[vai], test_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    best_score = 0  # 현재 최고 점수
    patience = 0  # 조기 종료 조건을 주기 위한 변수
    for epoch in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        print(pred)
        print(pred.shape)
        print(test_arr[vai])
        print(test_arr[vai].shape)
        
        pred = np.argmax(pred, axis=2)  # 다중 클래스 분류 문제로 변환
        #pred = pred.flatten()  # 2차원 배열을 1차원 배열로 변환
        #test_labels = test_arr[vai].flatten()  # 다중 클래스 분류 문제에 맞게 타겟 데이터를 평탄화

        print(f'변형후 :{pred},{pred.shape},{test_arr[vai].shape}')
        score = f1_score(pred, test_arr[vai], average='macro')  # F1 점수 계산
        
        print(f"Epoch {epoch}: Train Loss: {train_loss}, Valid Loss: {valid_loss}, F1 Score: {score}")

        if score > best_score:
            best_score = score  # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"model_{i}.pth")  # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 F1 점수: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break


  0%|          | 0/2302 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import numpy as np

# 샘플 개수, 문장 길이, 클래스 개수
num_samples = 5
sentence_length = 7
num_classes = 3

# 샘플을 담을 빈 배열 생성
data = np.empty((num_samples, sentence_length, num_classes))

# 각 샘플에 대해 클래스 할당
for i in range(num_samples):
    for j in range(sentence_length):
        # 각 샘플에 따라 클래스 할당
        data[i, j] = i + 1

print(data)
print(data.shape)


In [None]:
from sklearn.metrics import f1_score

is_holdout = True
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []

for i, (tri, vai) in enumerate(cv.split(data_train,target_train)):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(tokenizer,train_arr[tri], test_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(tokenizer,train_arr[vai], test_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    best_score = 0  # 현재 최고 점수
    patience = 0  # 조기 종료 조건을 주기 위한 변수
    for epoch in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        print(pred)
        print(pred.shape)
        print(test_arr[vai])
        print(test_arr[vai].shape)
        
        pred = np.argmax(pred, axis=2)  # 다중 클래스 분류 문제로 변환
        scores = []
        for j in range(pred.shape[1]):
            score = f1_score(pred[:, j], test_arr[vai][:, j], average='macro')
            scores.append(score)
        mean_f1_score = np.mean(scores)
        
        print(f"Epoch {epoch}: Train Loss: {train_loss}, Valid Loss: {valid_loss}, Mean F1 Score: {mean_f1_score}")

        if mean_f1_score > best_score:
            best_score = mean_f1_score  # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"model_{i}.pth")  # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 평균 F1 점수: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break


In [34]:
from sklearn.metrics import f1_score

is_holdout = True
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []

for i, (tri, vai) in enumerate(cv.split(data_train,target_train)):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(tokenizer,train_arr[tri], test_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(tokenizer,train_arr[vai], test_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    best_score = 0  # 현재 최고 점수
    patience = 0  # 조기 종료 조건을 주기 위한 변수
    for epoch in range(epochs):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        print(pred)
        print(pred.shape)
        print(test_arr[vai])
        print(test_arr[vai].shape)
        
        pred = np.argmax(pred, axis=2)  
        scores = []
        for j in range(pred.shape[1]):
            score = f1_score(test_arr[vai][:, j],pred[:, j], average='macro')
            print('======')
            print(f'{score}')
            print('======')
            scores.append(score)
        
        print(scores)
        print(scores.shape)
        mean_f1_score = np.mean(scores)

        print(mean_f1_score)
        print(mean_f1_score.shape)
        
        print(f"Epoch {epoch}: Train Loss: {train_loss}, Valid Loss: {valid_loss}, Mean F1 Score: {mean_f1_score}")
        
        if mean_f1_score > best_score:
            best_score = mean_f1_score  # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"model_{i}.pth")  # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 평균 F1 점수: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break


  0%|          | 0/2302 [00:00<?, ?it/s]

  0%|          | 0/576 [00:00<?, ?it/s]

[[[0.09783443 0.69039667 0.21176893]
  [0.09783443 0.69039667 0.21176893]
  [0.09783443 0.69039667 0.21176893]
  ...
  [0.09783443 0.69039667 0.21176893]
  [0.09783443 0.69039667 0.21176893]
  [0.09783443 0.69039667 0.21176893]]

 [[0.14635247 0.7001803  0.15346727]
  [0.14635247 0.7001803  0.15346727]
  [0.14635247 0.7001803  0.15346727]
  ...
  [0.14635247 0.7001803  0.15346727]
  [0.14635247 0.7001803  0.15346727]
  [0.14635247 0.7001803  0.15346727]]

 [[0.1742544  0.7274243  0.09832121]
  [0.1742544  0.7274243  0.09832121]
  [0.1742544  0.7274243  0.09832121]
  ...
  [0.1742544  0.7274243  0.09832121]
  [0.1742544  0.7274243  0.09832121]
  [0.1742544  0.7274243  0.09832121]]

 ...

 [[0.1832612  0.6759857  0.14075315]
  [0.1832612  0.6759857  0.14075315]
  [0.1832612  0.6759857  0.14075315]
  ...
  [0.1832612  0.6759857  0.14075315]
  [0.1832612  0.6759857  0.14075315]
  [0.1832612  0.6759857  0.14075315]]

 [[0.09601477 0.6497445  0.25424078]
  [0.09601477 0.6497445  0.25424078]


AttributeError: 'list' object has no attribute 'shape'