In [2]:
!pip install transformers datasets kagglehub torchinfo scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
from torchinfo import summary
from torch.utils.data import DataLoader, Dataset
import transformers
import pandas as pd
import kagglehub
from kagglehub import load_dataset, KaggleDatasetAdapter

In [4]:
# dataset 다운로드

path = kagglehub.dataset_download("devicharith/language-translation-englishfrench")
print("path", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/devicharith/language-translation-englishfrench?dataset_version_number=2...


100%|██████████| 3.51M/3.51M [00:00<00:00, 133MB/s]

Extracting files...
path /root/.cache/kagglehub/datasets/devicharith/language-translation-englishfrench/versions/2





In [5]:
# 데이터 확인하기

# UnicodeDecodeError로 encoding 지정해줌 : ISO-8859-1, latin1 등
df = pd.read_csv(path + '/eng_-french.csv', keep_default_na=False, encoding='ISO-8859-1')

# head : 첫 5행 / columns : column 명 배열과 데이터 타입
print("columns", df.columns)
df.head()

columns Index(['English words/sentences', 'French words/sentences'], dtype='object')


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Coursâ¯!
2,Run!,Courezâ¯!
3,Who?,Qui ?
4,Wow!,Ãa alorsâ¯!


In [6]:
df.columns = ['en', 'fr'] # column 명이 길어서 변경
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   en      175621 non-null  object
 1   fr      175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [7]:
# dataset 전처리

df.dropna() # 결측치 제거
df['en'] = df['en'].str.strip() # 공백 제거
df['fr'] = df['fr'].str.strip()

df['input_text'] = df['en']
df['target_text'] = df['fr']

print(df[['input_text', 'target_text']].head())

  input_text    target_text
0        Hi.         Salut!
1       Run!      Coursâ¯!
2       Run!     Courezâ¯!
3       Who?          Qui ?
4       Wow!  Ãa alorsâ¯!


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 모델, tokenzier 로드
model_name = "google-t5/t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5TokenizerFast(name_or_path='google-t5/t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>',

In [9]:
# 모델 구조 확인

# device 정의
device = torch.device("cuda")

batch_size = 16
max_len = 64

model = model.to(device)
encoder_input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, max_len), dtype=torch.int64).to(model.device)
decoder_input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, max_len), dtype=torch.int64).to(model.device)

print(summary(model, input_data={'input_ids': encoder_input_ids, 'decoder_input_ids': decoder_input_ids}))

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Layer (type:depth-idx)                                  Output Shape              Param #
T5ForConditionalGeneration                              [16, 64, 512]             --
├─T5Stack: 1-1                                          [16, 64, 512]             35,330,816
├─T5Stack: 1-2                                          --                        (recursive)
│    └─Embedding: 2-1                                   [16, 64, 512]             16,449,536
├─T5Stack: 1-3                                          --                        (recursive)
│    └─Dropout: 2-2                                     [16, 64, 512]             --
│    └─ModuleList: 2-3                                  --                        --
│    │    └─T5Block: 3-1                                [16, 64, 512]             3,147,008
│    │    └─T5Block: 3-2                                [16, 64, 512]             3,146,752
│    │    └─T5Block: 3-3                                [16, 64, 512]             3,146,752
│    

In [10]:
# dataset class 정의

class TranslationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_col="input_text", target_col="target_text", max_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_col = source_col
        self.target_col = target_col
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source = self.data.iloc[index][self.source_col]
        target = self.data.iloc[index][self.target_col]

        source_enc = self.tokenizer(
            source,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        target_enc = self.tokenizer(
            target,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": source_enc["input_ids"].squeeze(),
            "attention_mask": source_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze()
        }

# 예시
dataset = TranslationDataset(df, tokenizer)
print(dataset[0])

{'input_ids': tensor([2018,    5,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0

In [12]:
from transformers import DataCollatorForSeq2Seq

collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [13]:
# 데이터 분할 및 DataLoader 정의

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

# train/test 분할 -> train : test = 9 : 1
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset 객체 생성
train_dataset = TranslationDataset(train_df, tokenizer, max_len=128)
test_dataset = TranslationDataset(test_df, tokenizer, max_len=128)

# DataLoader 정의
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=32)

In [14]:
from torch.optim import Adam

def train_model(model, dataloader, optimizer, device, tokenizer, epochs=3):
    model.train()
    train_losses = []
    train_accuracies = []

    for epoch in range(epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        losses = total_loss / len(dataloader)
        train_losses.append(losses)

        train_acc = accuracy(model, dataloader, tokenizer, device)
        train_accuracies.append(train_acc)

        print(f"Epoch {epoch+1} | Train Loss: {losses:.4f} | Accuracy: {train_acc:.4f}")

    return train_losses, train_accuracies

In [15]:
def accuracy(model, dataloader, tokenizer, device):
    model.eval()
    total, match = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            targets = tokenizer.batch_decode(labels, skip_special_tokens=True)

            for pred, tgt in zip(predictions, targets):
                if pred.strip() == tgt.strip():
                    match += 1
                total += 1

    return match / total if total > 0 else 0

In [16]:
import matplotlib.pyplot as plt

def plot_train_metrics(train_losses, train_accuracies):
    epochs = list(range(1, len(train_losses) + 1))

    plt.figure(figsize=(12, 4))

    # Loss plot
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, marker='o', label='Train Loss')
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.legend()

    # Accuracy plot
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, marker='x', label='Train Accuracy')
    plt.title('Training Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()

In [17]:
optimizer = Adam(model.parameters(), lr=5e-5)

# 학습 실행
train_losses, train_accuracies = train_model(model, train_loader, optimizer, device, tokenizer=tokenizer, epochs=3)

# 시각화
plot_train_metrics(train_losses, train_accuracies)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


KeyboardInterrupt: 