# Autoregressive and Other Language Models
自回归和其他语言模型

## Preparation for Google Colab

In [None]:
import os
from google.colab import drive

# Mount Google Drive so that files under /content/drive are reachable from this runtime.
drive.mount("/content/drive")

print(os.getcwd())  # /content

# Optional sanity checks, left disabled: list the Drive contents and, if desired,
# switch the working directory to the Drive root.
# print(os.listdir("/content/drive/MyDrive/"))

# print(os.listdir("/content/drive/MyDrive/Colab Notebooks"))

# if os.getcwd() != "/content/drive/MyDrive":
#     os.chdir("/content/drive/MyDrive")

# print(os.getcwd())

In [None]:
# 提前将 requirements.txt 放在 google 云盘上
!pip install -r /content/drive/MyDrive/requirements.txt

In [3]:
# Use a chapter-specific folder on the mounted Drive as the working directory,
# so that relative paths used later in the notebook resolve inside it.
subdir = "ch04b"
work_path = os.path.join("/content/drive/MyDrive", subdir)
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also works if `subdir` contains nested folders.
os.makedirs(work_path, exist_ok=True)
os.chdir(work_path)
print(os.getcwd())

/content/drive/MyDrive/ch04b


In [4]:
!apt-get install tree && tree -a "./"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.
./
└── TR2EN.txt

0 directories, 1 file


## Data Processing (数据处理)

In [5]:
# Peek at the first lines of the raw corpus before parsing it; the file is read
# below as tab-separated text.
!head TR2EN.txt

EN	TR
Hi.	Merhaba.
Hi.	Selam.
Run!	Kaç!
Run!	Koş!
Run.	Kaç!
Run.	Koş!
Who?	Kim?
Fire!	Ateş!
Fire!	Yangın!


In [6]:
import pandas as pd

# Parse the tab-separated corpus, then coerce every column to plain strings
# so later cells can treat each cell value as text.
df = pd.read_csv("TR2EN.txt", sep="\t")
df = df.astype(str)
df

Unnamed: 0,EN,TR
0,Hi.,Merhaba.
1,Hi.,Selam.
2,Run!,Kaç!
3,Run!,Koş!
4,Run.,Kaç!
...,...,...
473030,A carbon footprint is the amount of carbon dio...,Bir karbon ayakizi bizim faaliyetlerimizin bir...
473031,"At a moment when our economy is growing, our b...",Ekonomimizin büyüdüğü bir anda bizim işletmele...
473032,Using high heat settings while ironing synthet...,Sentetik kumaşları ütülerken yüksek ısı ayarla...
473033,"If you want to sound like a native speaker, yo...","Eğer bir yerli gibi konuşmak istiyorsan, banço..."


In [7]:
# Build one [prefix, source, target] record per sentence pair, using only the
# first 10500 rows of the corpus. Iterating the two columns in lockstep avoids
# iterrows(), which constructs a Series object for every row.
data = [
    ["translate english to turkish", en, tr]
    for en, tr in zip(df["EN"].iloc[:10500], df["TR"].iloc[:10500])
]
data[0]

['translate english to turkish', 'Hi.', 'Merhaba.']

In [8]:
# Show the first three records to confirm the [prefix, source, target] layout.
data[:3]

[['translate english to turkish', 'Hi.', 'Merhaba.'],
 ['translate english to turkish', 'Hi.', 'Selam.'],
 ['translate english to turkish', 'Run!', 'Kaç!']]

In [9]:
# Turn the record list into a frame with the columns prefix / input_text / target_text.
# NOTE(review): this rebinds `df`, replacing the raw EN/TR frame loaded earlier.
# After this cell runs, the record-building cell above can no longer be re-run
# without reloading the corpus first.
df = pd.DataFrame(data, columns=["prefix", "input_text", "target_text"])
df

Unnamed: 0,prefix,input_text,target_text
0,translate english to turkish,Hi.,Merhaba.
1,translate english to turkish,Hi.,Selam.
2,translate english to turkish,Run!,Kaç!
3,translate english to turkish,Run!,Koş!
4,translate english to turkish,Run.,Kaç!
...,...,...,...
10495,translate english to turkish,I'll take this.,Bunu alacağım.
10496,translate english to turkish,I'll teach Tom.,Tom'a öğreteceğim.
10497,translate english to turkish,I'll teach you.,Size öğreteceğim.
10498,translate english to turkish,I'll thank Tom.,Tom'a teşekkür edeceğim.


In [10]:
# Small positional split: rows 0-49 for training, rows 50-99 for evaluation.
train_df = df.iloc[:50]
eval_df = df.iloc[50:100]

In [11]:
# Display the training split for a visual check.
train_df

Unnamed: 0,prefix,input_text,target_text
0,translate english to turkish,Hi.,Merhaba.
1,translate english to turkish,Hi.,Selam.
2,translate english to turkish,Run!,Kaç!
3,translate english to turkish,Run!,Koş!
4,translate english to turkish,Run.,Kaç!
5,translate english to turkish,Run.,Koş!
6,translate english to turkish,Who?,Kim?
7,translate english to turkish,Fire!,Ateş!
8,translate english to turkish,Fire!,Yangın!
9,translate english to turkish,Help!,Yardım et!


## T5

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the google-t5/t5-small checkpoint and
# display its configuration (vocabulary size, special tokens, ...).
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


T5TokenizerFast(name_or_path='google-t5/t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>',

In [13]:
from transformers import T5Model

# Load the google-t5/t5-small weights into T5Model and print the module tree.
# NOTE(review): the checkpoint's config lists "T5ForConditionalGeneration" as its
# architecture; T5Model is presumably loaded here only to inspect the
# encoder/decoder structure. Confirm before using `model` for generation.
model = T5Model.from_pretrained("google-t5/t5-small")
model

T5Model(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=

In [14]:
# Inspect the configuration (hyperparameters and task-specific settings) of the loaded model.
model.config

T5Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "google-t5/t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "ear