<a href="https://colab.research.google.com/github/just1nseo/YBIGTA/blob/main/language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so the project
# files stored on Drive can be reached through the normal filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
Coll

In [3]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import (Dataset, DatasetDict)
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer,
        AutoModel,
        DataCollatorWithPadding,
        Trainer,
        TrainingArguments,
        AutoConfig)
from transformers.modeling_outputs import SequenceClassifierOutput

In [None]:
# Path
# Project folder on the mounted Drive. After the chdir below, relative paths
# used by later cells resolve against this folder.
my_PATH = "/content/drive/MyDrive/ybigta/Project_1"
os.chdir(my_PATH)
# show file list in the current directory
!dir

In [None]:
#For loading data
def load_data(ch, t_size, v_size, random_state=None):
  """Read ``{ch}.csv`` and split it into train / validation / test datasets.

  Args:
    ch: CSV file name without the ``.csv`` extension (a relative name is
      resolved against the current working directory).
    t_size: ``test_size`` of the first split, i.e. the share of rows held out
      of the training set.
    v_size: ``test_size`` of the second split, i.e. the share of the held-out
      rows that ends up in the test set; the rest becomes the validation set.
    random_state: Optional seed forwarded to both ``train_test_split`` calls
      so the splits can be reproduced. ``None`` (default) leaves them unseeded.

  Returns:
    A ``DatasetDict`` with ``"train"``, ``"validation"`` and ``"test"`` splits,
    each holding only the ``Title`` and ``label`` columns.
  """
  # No index_col here: together with usecols, index_col=0 turns the first
  # *selected* column into the index, where dropna() never inspects it, so
  # rows with a missing value in that column would slip through.
  df = pd.read_csv(f"{ch}.csv", usecols=['Title', 'label']).dropna()

  train_df, holdout_df = train_test_split(
      df, test_size=t_size, random_state=random_state)
  val_df, test_df = train_test_split(
      holdout_df, test_size=v_size, random_state=random_state)

  # preserve_index=False keeps the shuffled pandas row index from being
  # carried into the datasets as an extra column.
  raw_ds = DatasetDict({
      "train": Dataset.from_pandas(train_df, preserve_index=False),
      "validation": Dataset.from_pandas(val_df, preserve_index=False),
      "test": Dataset.from_pandas(test_df, preserve_index=False)})

  return raw_ds

In [None]:
# Inputs for load_data: the CSV stem and the two split fractions.
ch, t_size, v_size = (
    "LabeledData_added",  # CSV file name without the ".csv" extension
    0.2,                  # test_size of the first (train vs. held-out) split
    0.5,                  # test_size of the second (validation vs. test) split
)

In [None]:
# Build the train / validation / test splits from the configured CSV,
# then display the resulting DatasetDict as the cell output.
raw_ds = load_data(ch=ch, t_size=t_size, v_size=v_size)
raw_ds

In [None]:
#getting the tokenizer
# `checkpoint` is the model id handed to from_pretrained; the same name is
# passed to LanguageModel later so tokenizer and model body match.
checkpoint = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the titles of every split
def tokenize_function(batch):
  """Encode a batch of ``Title`` strings, truncating each to at most 512 tokens."""
  titles = batch['Title']
  encoded = tokenizer(titles, truncation=True, max_length=512)
  return encoded

# batched=True hands tokenize_function several rows per call instead of one.
tokenized = raw_ds.map(tokenize_function, batched=True)
tokenized

In [None]:
# Return only the model inputs and the label, as torch tensors, when the
# tokenized splits are indexed.
torch_columns = ["input_ids", "attention_mask", "label"]
tokenized.set_format(type="torch", columns=torch_columns)

# Collator used by the DataLoaders; it is built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
class LanguageModel(nn.Module):
  """Pretrained transformer body with a BiLSTM -> max-pool -> dense classification head.

  Args:
    checkpoint: Model id or path passed to ``AutoModel.from_pretrained``.
    num_labels: Number of target classes, i.e. the width of the logits.
      NOTE(review): the default of 2 assumes a binary ``label`` column --
      confirm against the data and pass the real class count otherwise.
  """
  def __init__(self, checkpoint, num_labels=2):
    super().__init__()
    self.num_labels = num_labels

    # Load model with given checkpoint and get the body
    self.transformer = AutoModel.from_pretrained(
        checkpoint,
        config=AutoConfig.from_pretrained(
            checkpoint,
            output_attentions=True,
            output_hidden_states=True))

    # Take the feature width from the loaded config instead of hard-coding 768.
    hidden_size = self.transformer.config.hidden_size
    # batch_first=True: the body emits (batch, seq, hidden), while nn.LSTM
    # expects (seq, batch, hidden) unless told otherwise.
    self.bidir_LSTM = nn.LSTM(hidden_size, 50, bidirectional=True, batch_first=True)
    # Not used by forward(); kept so the attribute still exists.
    self.flatten = nn.Flatten()
    # 100 input features = 2 directions x 50 LSTM units.
    self.dense_50 = nn.Linear(100, 50)
    self.classifier = nn.Linear(50, num_labels)

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    """Run the body and the head.

    Args:
      input_ids: Token ids, forwarded to the transformer body.
      attention_mask: Optional mask (non-zero = real token), forwarded to the
        body and also used to keep padded positions out of the max-pool.
      labels: Optional class indices, one per sequence. When given, a
        cross-entropy loss is computed.

    Returns:
      ``SequenceClassifierOutput`` whose ``logits`` have one row per sequence
      and ``num_labels`` columns; ``loss`` is ``None`` when ``labels`` is not
      supplied. ``hidden_states`` / ``attentions`` are passed through from
      the body.
    """
    #Extract outputs from the body
    outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is used.
    lstm_out, _ = self.bidir_LSTM(outputs.last_hidden_state)
    if attention_mask is not None:
      # Push padded positions to the lowest representable value so they can
      # never be selected by the max over the sequence dimension.
      padding = attention_mask.unsqueeze(-1) == 0
      lstm_out = lstm_out.masked_fill(padding, torch.finfo(lstm_out.dtype).min)
    # torch.max over a dim yields (values, indices); keep the values only.
    pooled = lstm_out.max(dim=1).values
    hidden = F.relu(self.dense_50(pooled))
    logits = self.classifier(hidden)

    loss = None
    if labels is not None:
      # cross_entropy needs integer class indices; .long() also covers label
      # columns that were loaded as floats.
      loss = F.cross_entropy(logits, labels.view(-1).long())

    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions)

  

In [None]:
# Instantiate the custom model on top of the same checkpoint as the tokenizer.
model = LanguageModel(checkpoint)

In [None]:
from torch.utils.data import DataLoader

# Same batch size for training and evaluation.
BATCH_SIZE = 32

# `tokenized` is the tokenized DatasetDict; its splits are keyed
# "train" / "validation" / "test". Only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized["validation"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# torch's AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# is spelled out because torch defaults to 0.01 while the transformers class
# defaulted to 0.0, so the update rule stays the one this cell had before.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

num_epochs = 3
# Both spellings are bound so that either one resolves in later cells.
num_epoch = num_epochs
# One scheduler step is taken per optimizer step, i.e. per training batch.
num_training_steps = num_epochs * len(train_dataloader)
# get_scheduler requires the schedule name as its first argument.
# NOTE(review): "linear" (linear decay to zero) is assumed to be the intended
# schedule -- confirm.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

print(num_training_steps)

In [None]:
from datasets import load_metric


class CombinedMetric:
    """Several `load_metric` metrics behind a single add_batch / compute interface.

    `load_metric` loads one metric per call (it takes a single name, not a
    list), so each requested metric is loaded separately and fed the same
    predictions and references.
    """

    def __init__(self, names):
        # One underlying metric object per requested name, in the given order.
        self.metrics = [load_metric(name) for name in names]

    def add_batch(self, predictions=None, references=None):
        """Forward one batch of predictions / references to every metric."""
        for single in self.metrics:
            single.add_batch(predictions=predictions, references=references)

    def compute(self):
        """Compute every metric and merge the per-metric result dicts into one."""
        scores = {}
        for single in self.metrics:
            scores.update(single.compute())
        return scores


metric = CombinedMetric(["accuracy", "f1"])

In [None]:
# Use the GPU whenever CUDA is available; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('device:', device)

In [None]:
from tqdm.auto import tqdm

# The model has to live on the same device the batches are moved to below.
model.to(device)

# `num_epoch` is the name bound by the optimizer cell.
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))

for epoch in range(num_epoch):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- evaluation pass on the validation loader ----
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # DataCollatorWithPadding hands the targets back under "labels"
        # (it renames the dataset's "label" column), which is also the
        # keyword the model's forward() expects.
        metric.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar_eval.update(1)

    # Metrics accumulated over this epoch's evaluation batches.
    print(metric.compute())

