In [None]:
import warnings
warnings.filterwarnings('ignore')

import re
import os
import string
import datetime as dt
import shutil
import json
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text

# Select the compute device. `torch.cuda.is_available` has to be *called*: the bare
# function object is always truthy, which previously selected 'cuda' unconditionally.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import datasets

from collections import Counter

In [None]:
# Cosine Similarity
class Similarity(nn.Module):
    """
    Temperature-scaled cosine similarity.

    The cosine similarity of `x` and `y` is taken along their last dimension
    and the result is divided by `temp`.
    """

    def __init__(self, temp = 0.05):
        super(Similarity, self).__init__()
        self.cos = nn.CosineSimilarity(dim=-1)
        self.temp = temp

    def forward(self, x, y):
        # Similarity first, temperature scaling second.
        cosine = self.cos(x, y)
        return cosine / self.temp

# Backbone Model
class Custom_tf_encoder(nn.Module):
    """
    Token embedding followed by a stack of Transformer encoder layers.

    Args:
        used_word: number of rows of the embedding table.
        embed_dim: embedding width; also the `d_model` of every encoder layer.
        nhead: number of attention heads in each encoder layer.
        encoder_layers: number of encoder layers to stack.
        dropout: dropout rate handed to each encoder layer.
        max_len: currently unused.
    """

    def __init__(self, used_word, embed_dim, nhead, encoder_layers, dropout=0.1, max_len=500):
        super().__init__()
        self.embedding_layer = nn.Embedding(used_word, embed_dim)

        # Every layer shares the same hyper-parameters; the feed-forward block is
        # twice as wide as the embedding.
        layer_kwargs = dict(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=embed_dim * 2,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        stack = []
        for _ in range(encoder_layers):
            stack.append(nn.TransformerEncoderLayer(**layer_kwargs))
        self.encoders = nn.ModuleList(stack)

    def mk_padding_mask(self, text):
        """Return a boolean tensor that is True wherever `text` equals the <pad> id (0)."""
        pad_id = 0
        return torch.eq(text, pad_id)

    def forward(self, text):
        """
        Embed `text` and pass it through every encoder layer in order.

        Positions equal to 0 are passed as `src_key_padding_mask`. The output of
        the last layer is returned as-is; no pooling or projection is applied.
        Assumes `text` is laid out as (batch, seq_len), matching batch_first=True.
        """
        hidden = self.embedding_layer(text)
        pad_positions = self.mk_padding_mask(text).to(hidden.device)
        for encoder in self.encoders:
            hidden = encoder(hidden, src_key_padding_mask=pad_positions)
        return hidden

In [None]:
# SimCSE Contrastive Model
class transformer_cl(nn.Module):
    """
    Contrastive wrapper around a backbone encoder.

    The same batch is encoded twice and the two sets of embeddings are compared
    pairwise with the temperature-scaled `Similarity` module.
    """

    def __init__(self, backbone):
        super().__init__()
        self.backbone = backbone
        self.Similarity = Similarity()

    def forward(self, x):
        """
        Return the pairwise similarity between two encodings of `x`.

        Entry [i, j] compares sample i of the first pass with sample j of the
        second pass. Presumably the two passes differ only through dropout
        inside the backbone (SimCSE-style) - confirm against the backbone used.
        """
        # Two independent passes over identical input.
        first_pass = self.backbone(x)
        second_pass = self.backbone(x)

        # Keep only the last position along dim 1 of each encoding.
        view_a = first_pass[:, -1]
        view_b = second_pass[:, -1]

        # unsqueeze(1) vs unsqueeze(0) makes the similarity broadcast over all pairs.
        return self.Similarity(view_a.unsqueeze(1), view_b.unsqueeze(0))

In [None]:
# Pytorch Training
def cl_train(model, train_data, loss_fn, optimizer, device):
    """
    Run one training pass over `train_data` and return the summed batch loss.

    Args:
        model: module called on each batch; its output is fed to `loss_fn`.
        train_data: iterable of batch tensors.
        loss_fn: callable `(output, label) -> scalar loss tensor`.
        optimizer: optimizer stepped once per batch.
        device: device each batch and its labels are placed on.

    Returns:
        The sum of the per-batch losses as a tensor detached from the autograd
        graph, or the int 0 when `train_data` yields no batches.
    """
    cl_train_loss = 0
    model.train()
    for data in train_data:
        data = data.to(device)
        output = model(data)
        # Targets are 0..batch-1: row i of the output is expected to match column i.
        label = torch.arange(len(data), device=device)

        optimizer.zero_grad()
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()
        # Reuse the loss already computed (no second loss_fn call) and detach it so
        # the running total does not keep every batch's autograd graph alive.
        cl_train_loss += loss.detach()
    return cl_train_loss

def cl_valid(model, valid_data, loss_fn, device):
    """
    Evaluate `model` on `valid_data` and return the summed batch loss.

    Args:
        model: module called on each batch; its output is fed to `loss_fn`.
        valid_data: iterable of batch tensors.
        loss_fn: callable `(output, label) -> scalar loss tensor`.
        device: device each batch and its labels are placed on.

    Returns:
        The sum of the per-batch losses as a tensor without autograd history,
        or the int 0 when `valid_data` yields no batches.
    """
    cl_valid_loss = 0
    model.eval()
    # No gradients are needed for validation; disabling autograd avoids building
    # (and accumulating) a graph for every batch.
    with torch.no_grad():
        for data in valid_data:
            data = data.to(device)
            output = model(data)
            # Targets are 0..batch-1: row i of the output is expected to match column i.
            label = torch.arange(len(data), device=device)
            cl_valid_loss += loss_fn(output, label)

    return cl_valid_loss


def Training(epochs, model, train_batch, valid_batch, loss_fn, optimizer, save_path, device, patience=50):
    """
    Train for up to `epochs` epochs, checkpointing the best model and stopping early.

    Each epoch runs `cl_train` then `cl_valid`. Whenever the validation loss
    improves on the best value seen so far, the whole model object is written to
    `save_path` with `torch.save`. Training stops once the validation loss has
    failed to improve for `patience` consecutive epochs.

    Args:
        epochs: maximum number of epochs.
        model: model to train; also the object that gets saved.
        train_batch: iterable of training batches, passed to `cl_train`.
        valid_batch: iterable of validation batches, passed to `cl_valid`.
        loss_fn: loss callable forwarded to `cl_train` / `cl_valid`.
        optimizer: optimizer forwarded to `cl_train`.
        save_path: file path of the checkpoint; missing parent directories are created.
        device: device forwarded to `cl_train` / `cl_valid`.
        patience: number of consecutive non-improving epochs tolerated before stopping.
    """
    val_best_loss = float("inf")
    # Initialised up front so a non-improving first epoch (e.g. NaN loss) cannot
    # hit an unbound counter.
    es_patience = 0
    save_dir = os.path.dirname(save_path)

    for epoch in range(1, epochs+1):
        train_loss = cl_train(model, train_batch, loss_fn, optimizer, device)
        valid_loss = cl_valid(model, valid_batch, loss_fn, device)
        print(f"Train_loss: {train_loss}, Valid_loss: {valid_loss}")

        # Save only on improvement; otherwise count towards early stopping.
        if valid_loss < val_best_loss:
            # Remember the new best as a plain float so later epochs are compared
            # against it and no tensor (or its graph) is kept around.
            val_best_loss = float(valid_loss)
            es_patience = 0

            # dirname is '' for a bare file name; makedirs('') would raise.
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save(model, save_path)
        else:
            es_patience += 1
            if es_patience >= patience:
                print(f"Train Stopped at Epoch {epoch}")
                break

In [None]:
# Data Preprocessing
def custom_standardization(input_data):
    """
    Normalise raw text with TensorFlow string ops.

    The input is lower-cased, every literal '<br />' is replaced by a single
    space, and every character in `string.punctuation` is removed.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(input_data)
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, '')
    return cleaned

def get_text_label(raw_text):
    """
    Standardise `raw_text` and return it as a list of Python strings.

    The input goes through `custom_standardization`; each resulting element is
    decoded from UTF-8 bytes and every run of two or more spaces is collapsed
    into a single space.

    Returns:
        list[str]: one cleaned string per input element.
    """
    prep_text = custom_standardization(raw_text).numpy()
    # A single str.replace('  ', ' ') pass leaves leftovers for runs of three or
    # more spaces; the regex collapses a run of any length in one go.
    list_text = [re.sub(' {2,}', ' ', data.decode('utf-8')) for data in prep_text]
    return list_text

def get_length_percentage(dataset, limit):
    """
    Print the percentage of items in `dataset` whose length is at most `limit`.

    Length is measured with `len(item)`, so the unit is whatever the items are
    made of (the message says "words"; presumably items are token sequences -
    confirm against the caller).

    Args:
        dataset: sized iterable of sized items.
        limit: inclusive upper bound on item length.

    An empty `dataset` is reported as 0.0% instead of raising ZeroDivisionError.
    """
    total = len(dataset)
    count_under_limit = sum(1 for x in dataset if len(x) <= limit)
    answer = (count_under_limit/total)*100 if total else 0.0
    print(f'{round(answer, 2)}% contains under {limit} words')

In [None]:
# Load the "imdb" dataset through the `datasets` library and print a summary of the
# returned object (the recorded output lists its splits, features and row counts).
imdb = datasets.load_dataset("imdb")
print(imdb)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
