## GPT2를 이용한 LaTex 생성

# 1. 모델 로딩 및 데이터 초기화

In [1]:
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, AutoModelForCausalLM, pipeline, \
                         Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset


In [2]:
MODEL = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(MODEL)  # load up a standard gpt2 model

tokenizer.pad_token = tokenizer.eos_token  # set the pad token to avoid a warning


In [3]:
data = pd.read_csv('../data/english_to_latex.csv')

print(data.shape)

data.head(2)

(50, 2)


Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"


In [11]:
data.head(10)

Unnamed: 0,English,LaTeX
0,integral from a to b of x squared,"\int_{a}^{b} x^2 \,dx"
1,integral from negative 1 to 1 of x squared,"\int_{-1}^{1} x^2 \,dx"
2,integral from negative 1 to infinity of x cubed,"\int_{-1}^{\inf} x^3 \,dx"
3,integral from 0 to infinity of x squared,"\int_{0}^{\inf} x^2 \,dx"
4,integral from 0 to infinity of y squared,"\int_{0}^{\inf} y^2 \,dy"
5,integral from 1 to 2 of x over 2,"\int_{1}^{2} \frac{x}{2} \,dx"
6,f of x equals x squared,f(x) = x^2
7,h of x equals x squared,h(x) = x^2
8,g of x equals x squared,g(x) = x^2
9,g of x equals x to the eighth power,g(x) = x^8


# 2. 프롬프트 생성 및 데이터 전처리

In [13]:
# 단일 프롬프트 추가
CONVERSION_PROMPT = 'Convert English to LaTeX\n'
CONVERSION_TOKEN = 'LaTeX:'

# This is our "training prompt" that we want GPT2 to recognize and learn
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['LaTeX'].astype(str)

training_examples

0     Convert English to LaTeX\nEnglish: integral fr...
1     Convert English to LaTeX\nEnglish: integral fr...
2     Convert English to LaTeX\nEnglish: integral fr...
3     Convert English to LaTeX\nEnglish: integral fr...
4     Convert English to LaTeX\nEnglish: integral fr...
5     Convert English to LaTeX\nEnglish: integral fr...
6     Convert English to LaTeX\nEnglish: f of x equa...
7     Convert English to LaTeX\nEnglish: h of x equa...
8     Convert English to LaTeX\nEnglish: g of x equa...
9     Convert English to LaTeX\nEnglish: g of x equa...
10    Convert English to LaTeX\nEnglish: f of x equa...
11    Convert English to LaTeX\nEnglish: f of x equa...
12    Convert English to LaTeX\nEnglish: h of x equa...
13    Convert English to LaTeX\nEnglish: g of x equa...
14    Convert English to LaTeX\nEnglish: f of x equa...
15    Convert English to LaTeX\nEnglish: f of x equa...
16    Convert English to LaTeX\nEnglish: f of x equa...
17    Convert English to LaTeX\nEnglish: f of x 

In [7]:
task_df = pd.DataFrame({'text': training_examples})

task_df.head(2)

Unnamed: 0,text
0,Convert English to LaTeX\nEnglish: integral fr...
1,Convert English to LaTeX\nEnglish: integral fr...


In [20]:
tokenizer.eos_token

'<|endoftext|>'

이 코드는 `task_df`라는 데이터프레임의 'text' 열에 있는 각 문자열의 끝에 토크나이저의 EOS (End of Sequence) 토큰을 추가하는 작업을 수행합니다.

코드를 자세히 설명하면 다음과 같습니다:

1. `task_df['text']`: `task_df` 데이터프레임의 'text' 열을 선택합니다. 이 열은 텍스트 데이터를 포함하고 있습니다.

2. `map(lambda x: f'{x}{tokenizer.eos_token}')`: 'text' 열의 각 문자열에 대해 람다 함수를 적용합니다.
   - `lambda x: f'{x}{tokenizer.eos_token}'`: 람다 함수는 입력 문자열 `x`를 받아 해당 문자열의 끝에 `tokenizer.eos_token`을 추가한 새로운 문자열을 반환합니다.
   - `f'{x}{tokenizer.eos_token}'`: f-string 포맷팅을 사용하여 입력 문자열 `x`와 `tokenizer.eos_token`을 연결합니다.
   - `tokenizer.eos_token`: 토크나이저의 EOS 토큰을 나타냅니다. 이 토큰은 문장이나 시퀀스의 끝을 표시하는 특수 토큰입니다.

3. `task_df['text'] = ...`: 람다 함수를 적용한 결과로 생성된 새로운 문자열들로 'text' 열을 업데이트합니다.

이 코드의 목적은 텍스트 데이터를 토크나이저로 처리하기 전에 각 문장이나 시퀀스의 끝을 명시적으로 표시하는 것입니다. EOS 토큰을 추가함으로써 토크나이저는 문장의 끝을 인식할 수 있게 됩니다.

예를 들어, 다음과 같은 텍스트 데이터가 있다고 가정해보겠습니다:
```
"Hello, how are you?"
"I'm doing fine, thanks for asking."
```

위의 코드를 적용하면 각 문장의 끝에 EOS 토큰이 추가됩니다:
```
"Hello, how are you?<eos>"
"I'm doing fine, thanks for asking.<eos>"
```

이렇게 EOS 토큰이 추가된 텍스트 데이터는 토크나이저로 처리될 때 문장의 경계를 명확히 인식할 수 있습니다. 이는 특히 시퀀스 투 시퀀스(Sequence-to-Sequence) 모델이나 언어 모델링 작업에서 중요합니다.

EOS 토큰을 추가하는 것은 텍스트 데이터를 토크나이저로 처리하기 전에 일반적으로 수행되는 전처리 과정 중 하나입니다.

In [21]:
# 마지막에 EOS 토큰을 추가하여 모델이 예측을 중단할 시점을 알 수 있도록 합니다.

task_df['text'] = task_df['text'].map(lambda x: f'{x}{tokenizer.eos_token}')

In [22]:
print(task_df.head(2))

                                                text
0  Convert English to LaTeX\nEnglish: integral fr...
1  Convert English to LaTeX\nEnglish: integral fr...


In [23]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset
latex_data['text']

['Convert English to LaTeX\nEnglish: integral from a to b of x squared\nLaTeX: \\int_{a}^{b} x^2 \\,dx<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: integral from negative 1 to 1 of x squared\nLaTeX: \\int_{-1}^{1} x^2 \\,dx<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: integral from negative 1 to infinity of x cubed\nLaTeX: \\int_{-1}^{\\inf} x^3 \\,dx<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: integral from 0 to infinity of x squared\nLaTeX: \\int_{0}^{\\inf} x^2 \\,dx<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: integral from 0 to infinity of y squared\nLaTeX: \\int_{0}^{\\inf} y^2 \\,dy<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: integral from 1 to 2 of x over 2\nLaTeX: \\int_{1}^{2} \\frac{x}{2} \\,dx<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: f of x equals x squared\nLaTeX: f(x) = x^2<|endoftext|><|endoftext|>',
 'Convert English to LaTeX\nEnglish: h of x equ

In [24]:
latex_data = Dataset.from_pandas(task_df)  # turn a pandas DataFrame into a Dataset

def preprocess(examples):  
    # tokenize our text but don't pad because our collator will pad for us dynamically
    return tokenizer(examples['text'], truncation=True)

latex_data = latex_data.map(preprocess, batched=True)

latex_data = latex_data.train_test_split(train_size=.8)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [25]:
latex_data['train'][0]

{'text': 'Convert English to LaTeX\nEnglish: 2 pi r\nLaTeX: 2 * \\pi * r<|endoftext|><|endoftext|>',
 'input_ids': [3103,
  1851,
  3594,
  284,
  4689,
  49568,
  198,
  15823,
  25,
  362,
  31028,
  374,
  198,
  14772,
  49568,
  25,
  362,
  1635,
  3467,
  14415,
  1635,
  374,
  50256,
  50256],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [55]:
# 자동 회귀 언어 모델링을 위한 표준 데이터 콜레이터
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [27]:
latex_gpt2 = AutoModelForCausalLM.from_pretrained(MODEL)

In [28]:
latex_data

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 10
    })
})

# 3. LaTex 변환작업으로 GPT2 파인튜닝하기

이 코드는 Hugging Face의 `transformers` 라이브러리에서 제공하는 `TrainingArguments` 클래스를 사용하여 모델 학습을 위한 하이퍼파라미터와 설정을 지정하는 것입니다.

`TrainingArguments`는 모델 학습 과정을 제어하는 다양한 옵션을 제공하며, 이를 통해 학습 과정을 커스터마이즈할 수 있습니다.

코드를 자세히 살펴보겠습니다:

```python
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=20,
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    use_mps_device=True
)
```

- `output_dir="./english_to_latex"`: 모델의 체크포인트와 로그 파일이 저장될 출력 디렉토리를 지정합니다.

- `overwrite_output_dir=True`: 출력 디렉토리가 이미 존재하는 경우 해당 디렉토리의 내용을 덮어쓰도록 설정합니다.

- `num_train_epochs=5`: 학습 에포크(epoch) 수를 지정합니다. 전체 데이터셋을 5번 반복하여 학습합니다.

- `per_device_train_batch_size=1`: 학습 시 각 디바이스(GPU 또는 CPU)에서 사용할 배치 크기를 지정합니다. 여기서는 1로 설정되어 있습니다.

- `per_device_eval_batch_size=20`: 평가 시 각 디바이스에서 사용할 배치 크기를 지정합니다. 여기서는 20으로 설정되어 있습니다.

- `load_best_model_at_end=True`: 학습이 끝난 후 검증 손실(validation loss)이 가장 낮은 모델을 로드하도록 설정합니다.

- `logging_steps=5`: 로깅 간격을 지정합니다. 매 5번째 스텝마다 로그를 출력합니다.

- `log_level='info'`: 로그 레벨을 설정합니다. 여기서는 'info' 레벨로 설정되어 있습니다.

- `evaluation_strategy='epoch'`: 평가 전략을 설정합니다. 여기서는 'epoch'으로 설정되어 있어 매 에포크마다 평가를 수행합니다.

- `save_strategy='epoch'`: 모델 저장 전략을 설정합니다. 여기서는 'epoch'으로 설정되어 있어 매 에포크마다 모델을 저장합니다.

- `use_mps_device=True`: MPS(macOS 머신 러닝 가속) 디바이스를 사용하도록 설정합니다. 이는 macOS에서 사용 가능한 GPU 가속 기능입니다.

이렇게 설정된 `TrainingArguments`는 `Trainer` 클래스의 `args` 매개변수에 전달되어 학습 과정을 제어하는 데 사용됩니다.

```python
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)
```

위의 코드와 같이 `Trainer` 클래스를 초기화할 때 `training_args`를 전달함으로써 지정된 하이퍼파라미터와 설정에 따라 모델 학습이 수행됩니다.

In [56]:
training_args = TrainingArguments(
    output_dir="./english_to_latex",
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=1, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    use_mps_device=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


이 코드는 Hugging Face의 `transformers` 라이브러리에서 제공하는 `Trainer` 클래스를 사용하여 모델 학습을 위한 트레이너(Trainer)를 생성하는 것입니다.

`Trainer` 클래스는 모델 학습 과정을 관리하고 실행하는 역할을 합니다. 트레이너는 모델, 학습 인자, 데이터셋, 데이터 콜레이터 등을 받아 학습을 수행합니다.

코드를 자세히 살펴보겠습니다:

```python
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)
```

- `model=latex_gpt2`: 학습할 모델을 지정합니다. 여기서는 `latex_gpt2`라는 사전 학습된 GPT-2 모델을 사용합니다.

- `args=training_args`: 이전에 정의한 `TrainingArguments` 객체를 전달합니다. 이 객체는 학습 과정을 제어하는 다양한 하이퍼파라미터와 설정을 포함하고 있습니다.

- `train_dataset=latex_data["train"]`: 학습에 사용할 훈련 데이터셋을 지정합니다. `latex_data`는 데이터셋을 담고 있는 딕셔너리이며, `"train"`은 훈련 데이터셋을 나타내는 키입니다.

- `eval_dataset=latex_data["test"]`: 평가에 사용할 검증 데이터셋을 지정합니다. `"test"`는 검증 데이터셋을 나타내는 키입니다.

- `data_collator=data_collator`: 데이터 콜레이터를 지정합니다. 데이터 콜레이터는 데이터셋에서 배치를 생성하고 전처리하는 역할을 합니다. 이전에 정의한 `data_collator`를 사용합니다.

이렇게 생성된 `trainer` 객체를 사용하여 모델 학습을 수행할 수 있습니다. `trainer`는 다음과 같은 주요 메서드를 제공합니다:

- `train()`: 모델 학습을 시작합니다. 지정된 에포크 수만큼 학습을 반복합니다.
- `evaluate()`: 모델을 평가합니다. 검증 데이터셋을 사용하여 모델의 성능을 측정합니다.
- `save_model()`: 학습된 모델을 저장합니다.

예를 들어, 다음과 같이 `trainer.train()`을 호출하여 모델 학습을 시작할 수 있습니다:

```python
trainer.train()
```

학습이 완료되면 `trainer.evaluate()`를 호출하여 모델의 성능을 평가할 수 있습니다:

```python
eval_results = trainer.evaluate()
print(eval_results)
```

이를 통해 모델 학습 과정을 간편하게 관리하고 실행할 수 있으며, 학습된 모델을 평가하고 저장할 수 있습니다.

In [57]:
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

In [58]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 6.351103782653809,
 'eval_runtime': 1.6081,
 'eval_samples_per_second': 6.219,
 'eval_steps_per_second': 0.622}

In [59]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.0832,7.226024
2,0.5341,8.735909
3,0.613,9.669665
4,0.3258,9.084322
5,0.3973,9.225646


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Checkpoint destination directory ./english_to_latex/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./english_to_latex/checkpoint-40
Configuration saved in ./english_to_latex/checkpoint-40/config.json
Configuration saved in ./english_to_latex/checkpoint-40/generation_config.json
Model weights saved in ./english_to_latex/checkpoint-40/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num 

TrainOutput(global_step=200, training_loss=0.911777811050415, metrics={'train_runtime': 34.4433, 'train_samples_per_second': 5.807, 'train_steps_per_second': 5.807, 'total_flos': 3306977280000.0, 'train_loss': 0.911777811050415, 'epoch': 5.0})

# 4. LaTex 가이드로 GPT2 파인튜닝하기

In [31]:
book_data = TextDataset(
    tokenizer=tokenizer,
    file_path='../data/latex-guide-cos423.txt',  # train on a LaTeX cheat sheet they made
    block_size=128
)

Loading features from cached file ../data/cached_lm_GPT2TokenizerFast_128_latex-guide-cos423.txt [took 0.001 s]


In [32]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)


In [33]:
latex_gpt2 = AutoModelForCausalLM.from_pretrained(MODEL)

loading configuration file config.json from cache at /Users/wlkim/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transforme

In [34]:
training_args = TrainingArguments(
    output_dir="./math_book",
    overwrite_output_dir=True, # overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    use_mps_device=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
trainer = Trainer(
    model=latex_gpt2,
    args=training_args,
    data_collator=data_collator,
    train_dataset=book_data.examples[:int(len(book_data.examples)*.8)],
    eval_dataset=book_data.examples[int(len(book_data.examples)*.8):]
)

In [36]:
trainer.evaluate()  # initial loss for the cheat sheet

***** Running Evaluation *****
  Num examples = 12
  Batch size = 32


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 6.566359043121338,
 'eval_runtime': 0.6286,
 'eval_samples_per_second': 19.091,
 'eval_steps_per_second': 1.591}

In [37]:
trainer.train()

***** Running training *****
  Num examples = 47
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 240
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,2.6398,3.688287
2,1.9806,3.111025
3,2.1214,3.067104
4,1.7667,3.03863
5,1.8662,3.006435
6,1.3431,3.044762
7,1.4649,3.067927
8,1.3253,3.037357
9,1.2977,3.038291
10,1.3204,3.049669


***** Running Evaluation *****
  Num examples = 12
  Batch size = 32
Checkpoint destination directory ./math_book/checkpoint-24 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./math_book/checkpoint-24
Configuration saved in ./math_book/checkpoint-24/config.json
Configuration saved in ./math_book/checkpoint-24/generation_config.json
Model weights saved in ./math_book/checkpoint-24/model.safetensors
***** Running Evaluation *****
  Num examples = 12
  Batch size = 32
Checkpoint destination directory ./math_book/checkpoint-48 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./math_book/checkpoint-48
Configuration saved in ./math_book/checkpoint-48/config.json
Configuration saved in ./math_book/checkpoint-48/generation_config.json
Model weights saved in ./math_book/checkpoint-48/model.safetensors
***** Running Evaluation *****
  Num examples = 12
  Batch size 

TrainOutput(global_step=240, training_loss=1.8412107666333517, metrics={'train_runtime': 61.0493, 'train_samples_per_second': 7.699, 'train_steps_per_second': 3.931, 'total_flos': 30701813760000.0, 'train_loss': 1.8412107666333517, 'epoch': 10.0})

In [38]:
trainer.save_model()

Saving model checkpoint to ./math_book
Configuration saved in ./math_book/config.json
Configuration saved in ./math_book/generation_config.json
Model weights saved in ./math_book/model.safetensors


# 5. LaTex 가이드 북으로 학습된 GPT2를 LaTex 변환 데이터로 추가 파인튜닝하기

In [39]:
# load up our gpt pre-trained on latex cheat sheets
math_latex_gpt2 = AutoModelForCausalLM.from_pretrained('./math_book')

training_args = TrainingArguments(
    output_dir="./math_english_to_latex",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=1, # batch size for training
    per_device_eval_batch_size=20,  # batch size for evaluation
    load_best_model_at_end=True,
    logging_steps=5,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    use_mps_device=True
)

trainer = Trainer(
    model=math_latex_gpt2,
    args=training_args,
    train_dataset=latex_data["train"],
    eval_dataset=latex_data["test"],
    data_collator=data_collator,
)

trainer.evaluate()  # loss is starting slightly lower than before

loading configuration file ./math_book/config.json
Model config GPT2Config {
  "_name_or_path": "./math_book",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 50257
}

loading we

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 6.351103782653809,
 'eval_runtime': 0.1047,
 'eval_samples_per_second': 95.541,
 'eval_steps_per_second': 9.554}

In [40]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 40
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,1.23,7.418637
2,0.5462,8.269737
3,0.8233,8.608335
4,0.3042,8.0682
5,0.3736,8.110291


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 20
Checkpoint destination directory ./math_english_to_latex/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Saving model checkpoint to ./math_english_to_latex/checkpoint-40
Configuration saved in ./math_english_to_latex/checkpoint-40/config.json
Configuration saved in ./math_english_to_latex/checkpoint-40/generation_config.json
Model weights saved in ./math_english_to_latex/checkpoint-40/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Runnin

TrainOutput(global_step=200, training_loss=0.9154342466592789, metrics={'train_runtime': 34.2079, 'train_samples_per_second': 5.847, 'train_steps_per_second': 5.847, 'total_flos': 3306977280000.0, 'train_loss': 0.9154342466592789, 'epoch': 5.0})

In [41]:
trainer.save_model()  # save this model

Saving model checkpoint to ./math_english_to_latex
Configuration saved in ./math_english_to_latex/config.json
Configuration saved in ./math_english_to_latex/generation_config.json
Model weights saved in ./math_english_to_latex/model.safetensors


# 6. 모델 성능 비교하기

In [47]:
loaded_model = AutoModelForCausalLM.from_pretrained(MODEL)
non_finetuned_latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file config.json from cache at /Users/wlkim/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transforme

In [48]:
loaded_model = AutoModelForCausalLM.from_pretrained('./math_english_to_latex')
latex_generator = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

loading configuration file ./math_english_to_latex/config.json
Model config GPT2Config {
  "_name_or_path": "./math_english_to_latex",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_si

In [49]:
text_sample = 'g of x equals integral from 0 to 1 of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=2, early_stopping=True, temperature=0.7,
    max_new_tokens=24
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: g of x equals integral from 0 to 1 of x squared
LaTeX: g(x) = x^2}^2
English: g(x) = x^2}^


In [50]:
# Another example
text_sample = 'r of x is sum from 0 to x of x squared'
conversion_text_sample = f'{CONVERSION_PROMPT}English: {text_sample}\n{CONVERSION_TOKEN}'

print(latex_generator(
    conversion_text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: r of x is sum from 0 to x of x squared
LaTeX: r(x) = x^2
LaTeX: r(x) = x^2


In [51]:
print(latex_generator(
    text_sample, num_beams=5, early_stopping=True, temperature=0.7,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


r of x is sum from 0 to x of x squared
English: x^2 = x^2
LaTeX: x^2 = x^2
LaTeX: x^2 = x^2



In [52]:
non_finetuned_latex_generator

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x34011da50>

In [53]:
# try a few shot with standard gpt2
few_shot_prompt = CONVERSION_PROMPT+"""English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx \
###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx \
###
LCT
English: pi to the 8th power
LaTeX:"""

print(non_finetuned_latex_generator(
    few_shot_prompt, num_beams=1, early_stopping=True, temperature=0.1,
    max_length=len(tokenizer.encode(few_shot_prompt)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: f of x is sum from 0 to x of x squared
LaTeX: f(x) = \sum_{0}^{x} x^2 \,dx ###
LCT
English: f of x equals integral from 0 to pi of x to the fourth power
LaTeX: f(x) = \int_{0}^{\pi} x^4 \,dx ###
LCT
English: pi to the 8th power
LaTeX: pi to the 8th power
LCT
English: f(x) = \sum_{


In [54]:
# Just ask with standard gpt2
print(non_finetuned_latex_generator(
    conversion_text_sample, num_beams=1, early_stopping=True, temperature=0.1,
    max_length=len(tokenizer.encode(conversion_text_sample)) + 20
)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Convert English to LaTeX
English: r of x is sum from 0 to x of x squared
LaTeX: r of x is sum from 0 to x of x squared
English: r of x is sum
