In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel
from transformers import BertTokenizer
import torch.nn as nn
from transformers import AutoTokenizer
from transformers import AutoModel
#from kobert_tokenizer import KoBERTTokenizer

In [2]:
data = pd.read_csv("/content/data.csv")

In [3]:
data["description"] = data["description"].fillna("")

In [4]:
# data 전처리
import re

def clean_text(text):
    if not isinstance(text, str) or text.strip() == "":
        return "[PAD]"

    # 1. 이모지 및 특수기호 제거 (한방에 처리)
    text = re.sub(r'[^\uAC00-\uD7A3a-zA-Z0-9\s]', ' ', text)

    # 2. 반복 조사 줄이기 (ㅋㅋㅋ → ㅋ)
    text = re.sub(r'([ㅋㅎㅜㅠ])\1{1,}', r'\1', text)

    # 3. 숫자 제거 (선택적)
    text = re.sub(r'\d+', '', text)

    # 4. 불용어 제거
    stopwords = ['것', '수', '있', '하', '더', '들', '는', '은', '이', '가']
    text = ' '.join([word for word in text.split() if word not in stopwords])

    # 5. 공백 정리
    text = re.sub(r'\s+', ' ', text).strip()

    return text


data.description = data.description.apply(clean_text)
data.description


Unnamed: 0,description
0,번 착용했고 하자나 얼룩 없습니다
1,시착후 보관중입니당 pk 롱슬리브 스트라이프 블루
2,M사이즈 실착 회 하자 오염 없습니다 오후 시 이전 결제 택배 당일접수 에눌 교신 ...
3,스타일이 바뀌어 판매합니다
4,노스텔지아 가디건 구버전 제품입니다 실사는 번톡주세요
...,...
50475,국내 백화점구매 직장인인데 캐쥬얼 좋아해 사놓고 활용못해서 겹치는 스타일들 비움중 ...
50476,폴로 후드집업 판매합니다 구매할때 만원정도로 구매했습니다
50477,사용감 많지 않아요 스판기도 약간 있어서 편해요 정품입니다 실측 허리단면 총장 cm
50478,교신 에눌 할인 흥정 등 메세지엔 답변 안하고 있으며 또한 결제 진행 중 연락두절시...


In [5]:
import torch.nn as nn
from transformers import BertModel


class KoBERTRegressor(nn.Module):
    def __init__(self, bert_model=None):
        super(KoBERTRegressor, self).__init__()
        if bert_model is None:
            self.bert = BertModel.from_pretrained("monologg/kobert")
        else:
            self.bert = bert_model

        self.regressor = nn.Linear(768, 1)  # 고정값으로 명시
        # print("Regressor initialized with in_features=768")

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds
        )
        cls_output = outputs.last_hidden_state[:, 0, :]  # [B, 768]
        return self.regressor(cls_output).squeeze(-1)



In [6]:
# Load KoBERT tokenizer
#tokenizer = KoBertTokenizer.from_pretrained("skt/kobert-base-v1", do_lower_case=False)


tokenizer = BertTokenizer.from_pretrained("monologg/kobert")
bert_model = BertModel.from_pretrained("monologg/kobert")



class KoBERTDataset(Dataset):
    def __init__(self, descriptions, scores, tokenizer, max_len=64):
        self.descriptions = descriptions
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        desc = self.descriptions[idx]
        if not desc.strip():  # 빈 문자열이면
            desc = "[PAD]"  # 그냥 아무 토큰이나 넣어 (빈 리스트 방지용)

        encoded = self.tokenizer(
            desc,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        return input_ids, attention_mask, torch.tensor(self.scores[idx], dtype=torch.float)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [7]:
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

descriptions = data["description"].tolist()
scores = data["interest_score"].tolist()

X_train, X_val, y_train, y_val = train_test_split(descriptions, scores, test_size=0.2, random_state=42)

train_dataset = KoBERTDataset(X_train, y_train, tokenizer)
val_dataset = KoBERTDataset(X_val, y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = AutoModel.from_pretrained("skt/kobert-base-v1").to(device)
model = KoBERTRegressor(bert_model).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()


# 간단한 학습 루프 (3 epoch 예시)
for epoch in tqdm(range(3)):
    model.train()
    for input_ids, attention_mask, labels in tqdm(train_loader):

        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2524 [00:00<?, ?it/s]

Epoch 1, Loss: 0.0047


  0%|          | 0/2524 [00:00<?, ?it/s]

Epoch 2, Loss: 0.0040


  0%|          | 0/2524 [00:00<?, ?it/s]

Epoch 3, Loss: 0.0050
