In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

## Multi Model Test

In [2]:
df = pd.read_csv('./data/review_dataset_v6_sum_e.csv', index_col=0)
df.head()

Unnamed: 0,index,id,search_word,title,url,blog_name,date,content,content_len,content_hash_cnt,...,e_sum_758,e_sum_759,e_sum_760,e_sum_761,e_sum_762,e_sum_763,e_sum_764,e_sum_765,e_sum_766,e_sum_767
0,0,shuenmama.223027650182,홍대 회식 맛집,홍대 고기집 데이트 쟁반한상 삼겹살 맛집 회식 쟁반집8292,https://blog.naver.com/shuenmama/223027650182,shuen,20230225,홍대에서 데이트 하기로 한 주말\n진짜 간만에 홍대\n전에 항상 세우던 공영주차장이...,3094,10,...,0.226764,-0.341097,0.107354,0.395617,-0.184648,-0.187328,-0.100806,0.066111,0.315414,-0.584649
1,1,rosepink1974.223153722255,홍대 회식 맛집,홍대삼겹살 청년화로 1987 이베리코 연남동고기집 회식맛집,https://blog.naver.com/rosepink1974/223153722255,예쁜 달코미의 단맛 인생,20230712,청년화로1987\n서울 마포구 동교로 219 1층\n청년화로 1987\n홍대입구역 ...,2345,9,...,0.274786,-0.311202,-0.107873,0.219888,-0.324829,-0.479143,-0.085153,-0.008704,0.34288,-0.252228
2,2,mou25.223209216526,홍대 회식 맛집,"합정 맛집 홍대 회식장소로 딱, 느낌 있는 소고기 고깃집...",https://blog.naver.com/mou25/223209216526,생애 기록장,20230912,"매번 느끼는 거지만,\n회식장소 하나는\n기가 막히게 섭외하는 울 주임님.\n\n얼...",2904,12,...,0.297851,0.366678,0.195712,0.164166,0.123258,-0.229331,0.080041,0.002673,0.381098,-0.2088
3,3,lulu_l.223118434610,홍대 회식 맛집,"홍대 맛집 합정 갈비가 부드러운 소고기집 연막탄 회식, 데이트...",https://blog.naver.com/lulu_l/223118434610,안나의 일상공유,20230602,홍대 소고기 맛집\n연막탄\n\n남자친구가 맛있는 고깃집을 알고\n있다길해 합정 맛...,2810,0,...,0.297081,-0.099999,0.227369,-0.031295,0.096868,-0.391839,-0.211752,0.15641,-0.170814,-0.501321
4,4,ruston_.223161590597,홍대 회식 맛집,"홍대회식, 육즙 폭발하는 소고기 맛집 '일편등심 홍대본점'",https://blog.naver.com/ruston_/223161590597,로빈이 토끼란 사실을 알고있었나?,20230720,안녕하세요. LoLCake입니다.\n\n\n최근 이직을 준비하는 동료의 축하 파티를...,2299,2,...,0.405006,-0.211296,0.137026,0.264585,-0.045813,-0.435813,-0.275717,-0.078445,0.584033,-0.496703


In [3]:
df2 = pd.read_csv('./data/review_dataset_embedding_all_mean.csv', index_col=0)
df2.head()

Unnamed: 0,index,e_sum_0,e_sum_1,e_sum_2,e_sum_3,e_sum_4,e_sum_5,e_sum_6,e_sum_7,e_sum_8,...,e_sum_758,e_sum_759,e_sum_760,e_sum_761,e_sum_762,e_sum_763,e_sum_764,e_sum_765,e_sum_766,e_sum_767
0,shuenmama.223027650182,-0.269514,-0.33922,-0.330182,0.040266,0.184372,-0.227383,0.26528,0.116666,-0.206969,...,0.178983,-0.214031,-0.059638,0.167407,0.020884,-0.203415,-0.161502,0.018985,0.237754,-0.221653
1,rosepink1974.223153722255,-0.187719,-0.324055,-0.297384,0.109595,0.266376,-0.113839,0.198577,0.2759,0.041351,...,0.25471,-0.104253,-0.208398,0.202277,-0.376773,-0.346854,-0.018586,0.084422,0.438359,-0.133161
2,mou25.223209216526,0.084926,-0.473558,-0.147312,0.005009,0.088285,-0.097935,-0.042144,0.203255,-0.180874,...,0.370275,-0.025442,0.027081,0.189991,0.08675,-0.290567,-0.198043,0.072573,0.350379,-0.047624
3,lulu_l.223118434610,0.04992,-0.534174,-0.485959,0.119297,0.101789,-0.176171,-0.035862,0.220996,0.050428,...,0.311148,-0.040022,-0.04351,0.173019,-0.055459,-0.228064,-0.176389,0.105344,0.209823,-0.166638
4,ruston_.223161590597,-0.102667,-0.188301,-0.230945,-0.044723,0.262243,-0.216912,0.018869,0.162123,0.149256,...,0.318246,0.044601,-0.016627,0.203433,-0.263383,-0.387199,-0.078136,-0.004678,0.465537,-0.184052


In [4]:
categorical_col = ['adpost_yn', 'map_yn', 'video_yn', 'phone_yn']
numerical_col = ['content_len', 'content_hash_cnt', 'article_hash_cnt', 'like_cnt', 'emoticon_cnt', 
                 'total_post', 'link_cnt', 'image_cnt', 'repeat_word_cnt', 'noun_verb_ratio']
embedding_col = ['content']
label_col = 'label_f2'

In [5]:
df = df[df.noun_verb_ratio < np.inf].reset_index()

In [6]:
df = pd.merge(
    df.set_index('id'),
    df2.set_index('index'),
    left_index=True, right_index=True,
    how='left'
)
df.reset_index(inplace=True)

In [7]:
uid = df['id'].tolist()
X = df[categorical_col + numerical_col + embedding_col]
y = df[label_col].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=22)

In [9]:
def split_X(X):
    return (X[categorical_col + numerical_col].values, X[embedding_col].values)

In [10]:
X_train_feat, X_train_emb = split_X(X_train)
X_test_feat, X_test_emb = split_X(X_test)

In [11]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [12]:
class FakeReviewDataset(Dataset):
    def __init__(self, feature, embedding, label):
        super().__init__()
        self.feature = torch.FloatTensor(feature)
        self.embedding = embedding.tolist()
        self.label = torch.tensor(label)
        
    def __len__(self):
        return len(self.label)
    
    def __getitem__(self, idx):
        return {
            "feature": self.feature[idx],
            "embedding": self.embedding[idx],
            "label": self.label[idx]
        }

In [13]:
train_ds = FakeReviewDataset(X_train_feat, X_train_emb, y_train)
valid_ds = FakeReviewDataset(X_test_feat, X_test_emb, y_test)

In [14]:
class CustomLoader:
    def __init__(self, dataset, batch_size: int = 64):
        self.dataset = dataset
        self.batch_size = batch_size
        
    
    def get_batch(self, iteration):
        idx = range(iteration * self.batch_size, iteration * self.batch_size + self.batch_size)
        
        datasets = {
            'feature': [],
            'embedding': [],
            'label': []
        }
        
        for i in idx:
            try:
                pop = self.dataset[i]
                for key in datasets.keys():
                    datasets[key].append(pop[key])
            except:
                return {
                    "feature": torch.tensor([]),
                    "embedding": [],
                    "label": torch.tensor([])
                }
                
        datasets['feature'] = torch.vstack(datasets['feature'])
        datasets['embedding'] = sum(datasets['embedding'], [])
        datasets['label'] = torch.vstack(datasets['label'])
        
        return datasets

In [21]:
class FeatureClassifier(nn.Module):
    def __init__(
        self,
        input_size=14,
        num_layers=3,
        step=2
    ):
        super().__init__()
        
        for i in range(10):
            if (2 ** i) <= input_size <= (2 ** (i+1)):
                init_size = max(2**i, 2**(i+1))
                break
        
        layers = [
            nn.Linear(input_size, init_size),
            nn.BatchNorm1d(init_size),
        ]
        
        for i in range(num_layers):
            if i < num_layers:
                layers.append(nn.ReLU())
            
            layers.append(nn.Linear(init_size, init_size * (2**step)))
            
            init_size *= (2**step)
        
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)
    
    
    
class BertEmbedding(nn.Module):
    def __init__(self,
        model_path: str = 'jhgan/ko-sroberta-multitask',         
        device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ):
        super().__init__()
        
        self.model = SentenceTransformer(model_path)
        self.model.to(device)
        self.device = device
        
        self.linear = nn.Sequential(
            nn.Linear(768, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(.5),
#             nn.Linear(768, 512),
#             nn.ReLU(),
#             nn.Linear(512, 256)
        )
        
    
    def split_list(self, tokens: list, seq_len: int = 128, pad_idx=0):
        batch = []

        for i in range(int(len(tokens) // seq_len) + 1):
            token = tokens[i*seq_len:(i+1)*seq_len]
            if len(token) < 128:
                token += [pad_idx] * (128 - len(token))

            batch.append(token)

        return batch

    def make_batchs(
        self,
        tokens: dict,
        seq_len: int = 128,
        pad_idx: int = 0,
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ):
        return {
            k: torch.tensor(self.split_list(v), device=device)
            for k, v in tokens.items()
        }
    
    def forward(self, x: str | list):
        if isinstance(x, str):
            x = [x]
            
        embeddings = []
        for corpus in x:
            t = self.model.tokenizer(corpus)
            embedding = self.model(self.make_batchs(t, device=self.device))
            embeddings.append(embedding['sentence_embedding'].mean(axis=0))
            
        y = torch.vstack(embeddings)
        y = self.linear(y)
        
        return y
    
    
class FakeReviewClassifier(nn.Module):
    def __init__(
        self,
        input_size=14,
        num_feature_layers=3,
        feature_step=2
    ):
        super().__init__()
        
        self.feature = FeatureClassifier(
            input_size=input_size,
            num_layers=num_feature_layers,
            step=feature_step
        )
        
        self.embedding = BertEmbedding()
        
        feature_output_size = self.feature.model[-1].out_features
        embedding_output_size = self.embedding.linear[0].out_features
        
        self.generator = nn.Sequential(
            nn.Linear(feature_output_size + embedding_output_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 1),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 32),
#             nn.ReLU(),
#             nn.Linear(32, 8),
#             nn.ReLU(),
#             nn.Linear(8, 1),
        )
        
        self.activation = nn.Sigmoid()
        
    def forward(self, **kwargs):
        feature_emb = self.feature(kwargs['feature'])
        embedding_emb = self.embedding(kwargs['embedding'])
                
        con = torch.concat([feature_emb, embedding_emb], axis=1)
        y = self.generator(con)
        return self.activation(y)

In [22]:
model = FakeReviewClassifier(
    num_feature_layers=2,
    feature_step=1
)
crit = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [23]:
epochs = 30
early_stop = 30
batch_size = 8

In [24]:
train_loader = CustomLoader(train_ds, batch_size=batch_size)
valid_loader = CustomLoader(valid_ds, batch_size=batch_size)

In [25]:
model.to(device)

FakeReviewClassifier(
  (feature): FeatureClassifier(
    (model): Sequential(
      (0): Linear(in_features=14, out_features=16, bias=True)
      (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=16, out_features=32, bias=True)
      (4): ReLU()
      (5): Linear(in_features=32, out_features=64, bias=True)
    )
  )
  (embedding): BertEmbedding(
    (model): SentenceTransformer(
      (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
      (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    )
    (linear): Sequential(
      (0): Linear(in_features=768, out_features=512, bias=True)
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(

In [None]:
best_model = None
lowest_loss = np.inf
best_acc = 0

for epoch in range(epochs):
    train_loss = 0
    
    for t_i in tqdm(range(int(len(train_ds) // batch_size) + 1)):
        batch = train_loader.get_batch(t_i)
        for k, v in batch.items():
            if k != 'embedding':
                batch[k] = v.to(device)
            else:
                batch[k] = v

        if len(batch['feature']) == 0: continue
        y_hat_i = model(**batch)
        y_i = batch['label']

        loss = crit(y_hat_i.squeeze(), y_i.squeeze())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += float(loss)

    train_loss = train_loss / len(train_ds)
    
    
    with torch.no_grad():
        valid_loss = 0
        y_hat = []
        y_real = []
        
        for v_i in tqdm(range(int(len(valid_ds) // batch_size) + 1)):
            batch = valid_loader.get_batch(v_i)
            for k, v in batch.items():
                if k != 'embedding':
                    batch[k] = v.to(device)
                else:
                    batch[k] = v
            
            if len(batch['feature']) == 0: continue
            y_hat_i = model(**batch)
            y_i = batch['label']            
            
            y_hat += (y_hat_i.squeeze() > 0.5).cpu().numpy().astype(int).tolist()
            y_real += y_i.tolist()
            loss = crit(y_hat_i.squeeze(), y_i.squeeze())
        
            valid_loss += float(loss)
            
    valid_loss = valid_loss /len(valid_ds)
    valid_acc = accuracy_score(y_real, y_hat)
    
    if valid_acc >= best_acc:
        best_model = deepcopy(model.state_dict())
        best_acc = valid_acc
        best_epoch = epoch
        
    else:
        if (epoch - best_epoch) >= early_stop:
            print("Early Stop at Epoch {epoch}. Lowest Loss {lowest_loss}, Best Acc {best_acc} at {best_epoch} epoch".format(
                epoch=epoch,
                lowest_loss=round(lowest_loss, 5),
                best_acc=best_acc,
                best_epoch=best_epoch
            ))
            break

    print("Epoch {epoch}: train_loss={train_loss}, valid_loss={valid_loss}, valid_acc={valid_acc}, best_acc={best_acc}".format(
        epoch=epoch,
        train_loss=round(train_loss, 5),
        valid_loss=round(valid_loss, 5),
        valid_acc=accuracy_score(y_real, y_hat),
        best_acc=best_acc
    ))

model.load_state_dict(best_model)

  0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (689 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1000/1000 [11:41<00:00,  1.42it/s]
100%|██████████| 251/251 [00:58<00:00,  4.26it/s]


Epoch 0: train_loss=0.08354, valid_loss=0.08153, valid_acc=0.625, best_acc=0.625


100%|██████████| 1000/1000 [11:43<00:00,  1.42it/s]
100%|██████████| 251/251 [00:58<00:00,  4.27it/s]


Epoch 1: train_loss=0.08277, valid_loss=0.08069, valid_acc=0.663, best_acc=0.663


100%|██████████| 1000/1000 [11:43<00:00,  1.42it/s]
100%|██████████| 251/251 [00:58<00:00,  4.27it/s]


Epoch 2: train_loss=0.08196, valid_loss=0.08223, valid_acc=0.6225, best_acc=0.663


100%|██████████| 1000/1000 [11:43<00:00,  1.42it/s]
100%|██████████| 251/251 [00:58<00:00,  4.25it/s]


Epoch 3: train_loss=0.08172, valid_loss=0.08027, valid_acc=0.6635, best_acc=0.6635


100%|██████████| 1000/1000 [11:44<00:00,  1.42it/s]
100%|██████████| 251/251 [00:58<00:00,  4.26it/s]


Epoch 4: train_loss=0.0816, valid_loss=0.07997, valid_acc=0.6515, best_acc=0.6635


 35%|███▍      | 346/1000 [04:04<06:46,  1.61it/s]

In [28]:
torch.save(model, './model/multi_update_bert.pt')

## Embedding 

In [2]:
model_path = 'jhgan/ko-sroberta-multitask'

In [3]:
device = torch.device('cuda:0')

In [11]:
model = SentenceTransformer(model_path)

In [5]:
df = pd.read_csv('./data/review_dataset_v6.csv', index_col=0)
uid = df['id']
X = df.content.tolist()
y = df.label_f2.values

In [6]:
def split_list(tokens: list, seq_len: int = 128, pad_idx=0):
    batch = []
    
    for i in range(int(len(tokens) // seq_len) + 1):
        token = tokens[i*seq_len:(i+1)*seq_len]
        if len(token) < 128:
            token += [pad_idx] * (128 - len(token))
        
        batch.append(token)
        
    return batch
    

def make_batchs(
    tokens: dict,
    seq_len: int = 128,
    pad_idx: int = 0,
    to_tensor: str = 'pt',
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
):
    if to_tensor:
        return {
            k: torch.tensor(split_list(v), device=device)
            for k, v in tokens.items()
        }
    
    return {
        k: split_list(v)
        for k, v in tokens.items()
    }

In [13]:
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [21]:
embedding_result = {}

with torch.no_grad():
    for i, c in tqdm(zip(uid, X)):
        t = model.tokenizer(c)
        embedding = model.forward(make_batchs(t, device=device))
        
        embedding_result[i] = embedding['sentence_embedding'].mean(axis=0).cpu().tolist()

10001it [05:10, 32.26it/s]


In [26]:
df = pd.DataFrame(embedding_result).T
df.columns = [f"e_sum_{i}" for i in range(768)]

In [29]:
df.reset_index().to_csv('./data/review_dataset_embedding_all_mean.csv')