In [4]:
import re
import sacrebleu

import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import T5ForConditionalGeneration, T5Config, T5Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model_name = 'utrobinmv/t5_translate_en_ru_zh_base_200'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(65100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(65100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [6]:
import os

# Path to your checkpoint
latest_checkpoint_path = ''
model_name = "t5_base_notune" # For naming of csv files (i.e. base, finetune, cosinesim, sbert)

# Load the checkpoint
# checkpoint = torch.load(latest_checkpoint_path, map_location='cpu')

# Update the model's parameters
# model.load_state_dict(checkpoint['model_state_dict'])


In [7]:
file_path_val = '/Users/lauralee/Desktop/skoo/AY2324/sem 2/CS4248/project/validation.csv'

val_df_unfiltered = pd.read_csv(file_path_val)
val_df_unfiltered

Unnamed: 0,en,zh
0,Last year I showed these two slides so that d...,去年我给各位展示了两个 关于北极冰帽的演示 在过去三百万年中 其面积由相当于美国南方48州面...
1,But this understates the seriousness of this p...,但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
2,"The arctic ice cap is, in a sense, the beatin...",感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
3,It expands in winter and contracts in summer.,冬天心脏舒张，夏天心脏收缩
4,The next slide I show you will be a rapid fas...,下面我要展示的是 在过去25年里的极剧变化
...,...,...
874,"You increase paralysis, and you decrease satis...",你增加了的是瘫痪，减少了的是满足。
875,Everybody needs a fishbowl.,每个人都需要这么个“鱼缸”。
876,This one is almost certainly too limited -- p...,至于这个（鱼缸）嘛，对这条鱼 可能是小了点，对于我们几乎肯定是太小了。
877,But the absence of some metaphorical fishbowl ...,但是，没有这么个象征性的鱼缸那就 意味着苦难将至， 也许是，灾难。


In [8]:
val_df = []

for index, row in val_df_unfiltered.iterrows():
  src_text = row['zh']
  ref_text = row['en']
  input_tokens = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  reference_tokens = tokenizer(ref_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  if input_tokens.size(1) > 27 or reference_tokens.size(1) > 27: # Since we translating both ways, check both sides to align predictions and references array
    continue
  else:
    val_df.append(row)

val_df = pd.DataFrame(val_df)
val_df

Unnamed: 0,en,zh
1,But this understates the seriousness of this p...,但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
2,"The arctic ice cap is, in a sense, the beatin...",感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
3,It expands in winter and contracts in summer.,冬天心脏舒张，夏天心脏收缩
4,The next slide I show you will be a rapid fas...,下面我要展示的是 在过去25年里的极剧变化
5,The permanent ice is marked in red.,红色的是永冻冰
...,...,...
874,"You increase paralysis, and you decrease satis...",你增加了的是瘫痪，减少了的是满足。
875,Everybody needs a fishbowl.,每个人都需要这么个“鱼缸”。
876,This one is almost certainly too limited -- p...,至于这个（鱼缸）嘛，对这条鱼 可能是小了点，对于我们几乎肯定是太小了。
877,But the absence of some metaphorical fishbowl ...,但是，没有这么个象征性的鱼缸那就 意味着苦难将至， 也许是，灾难。


backwards evaluation from chinese to english

In [9]:
prefix = 'translate to en: '

predictions, references = [], []

for index, row in val_df.iterrows(): 
  print("Row: ", str(index), ", translating: ", row["zh"])
  src_text = prefix + row['zh']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)
  references.append(row['en'])

references = [[reference] for reference in references]

Row:  1 , translating:  但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
translation:  But that doesn't fully explain the severity of the problem, because it doesn't indicate the thickness of the ice cap.
Row:  2 , translating:  感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
translation:  The Arctic ice cap feels like a beating heart in the global climate system.
Row:  3 , translating:  冬天心脏舒张，夏天心脏收缩
translation:  In winter the heart is sluggish and in summer the heart contractes.
Row:  4 , translating:  下面我要展示的是 在过去25年里的极剧变化
translation:  Here's what I want to show you about the dramatic changes over the last 25 years.
Row:  5 , translating:  红色的是永冻冰
translation:  The red is permafrost.
Row:  8 , translating:  在25年的时间里，它从这里，到了这里
translation:  In twenty-five years, it came from here.
Row:  10 , translating:  如果突破顶点，温室气体排放量 将是现有大气层中的全球温室污染总量
translation:  If we break through the peak, greenhouse gas emissions will be the total global greenhouse pollution in the existing atmosphere.
Row:  11 , translating:  在阿拉斯加的一些浅湖里 已经可以看到

In [11]:
def process_string(text):
    return replace_punctuation_with_en(text)

'''def remove_spaces(text):
    return ''.join(text.split())'''

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_en(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        '，': ',',
        '。': '.',
        '？': '?',
        '！': '!',
        '：': ':',
        '；': ';',
        '”': '"',
        '’': "'",
        '（': '(',
        '）': ')',
        '【': '[',
        '】': ']',
        '｛': '{',
        '｝': '}'
    }

    # Replace each English punctuation with its Chinese counterpart
    for chi_punc, eng_punc in punctuation_mapping.items():
        text = text.replace(chi_punc, eng_punc)

    return text

processed_predictions_backward = [process_string(sent) for sent in predictions]
processed_references_backward = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_backward, 'Reference': processed_references_backward})
df.to_csv('zh_to_en_t5_{model_name}_backwards_translations.csv', index=False)

processed_references_backward = [[reference] for reference in processed_references_backward]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_backward, processed_references_backward)
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_backward, processed_references_backward, smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 33.30 81.8/57.1/25.0/10.5 (BP = 1.000 ratio = 1.000 hyp_len = 22 ref_len = 22)
BLEU Score with smoothing: BLEU = BLEU = 37.94 81.8/59.1/28.6/15.0 (BP = 1.000 ratio = 1.000 hyp_len = 22 ref_len = 22)


forward evaluation from english to chinese

In [12]:
prefix = 'translate to zh: '

predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row["en"])
  src_text = prefix + row['en']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)
  references.append(row['zh'])

references = [[reference] for reference in references]


Row:  1 , translating:  But this understates the seriousness of this particular problem  because it doesn't show the thickness of the ice.
translation:  但這低估了這個特殊問題的嚴重性,因為它沒有顯示冰的厚度。
Row:  2 , translating:  The arctic ice cap is, in a sense,  the beating heart of the global climate system.
translation:  从某种意义上说,北极冰盖是全球气候系统的心脏。
Row:  3 , translating:  It expands in winter and contracts in summer.
translation:  它在冬季扩张,在夏季收缩。
Row:  4 , translating:  The next slide I show you will be  a rapid fast-forward of what's happened over the last 25 years.
translation:  我向您展示的下一张幻灯片将是过去25年中发生的事情的快速前瞻。
Row:  5 , translating:  The permanent ice is marked in red.
translation:  永久性冰被标记为红色。
Row:  8 , translating:  In 25 years it's gone from this, to this.
translation:  25年后,它从这个到那个。
Row:  10 , translating:  Compared to the total amount of global warming pollution in the atmosphere,  that amount could double if we cross this tipping point.
translation:  与全球变暖的大气污染总量相比,如果我们越过这个临界点,这一数字可能会翻一番。
Row:  11 , tr

In [13]:
def process_string(text):
    return replace_punctuation_with_zh(remove_spaces(text))

def remove_spaces(text):
    return ''.join(text.split())

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_zh(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        ',': '，',
        '.': '。',
        '?': '？',
        '!': '！',
        ':': '：',
        ';': '；',
        '"': '”',
        "'": '’',
        '(': '（',
        ')': '）',
        '[': '【',
        ']': '】',
        '{': '｛',
        '}': '｝'
    }

    # Replace each English punctuation with its Chinese counterpart
    for eng_punc, chi_punc in punctuation_mapping.items():
        text = text.replace(eng_punc, chi_punc)

    return text

processed_predictions_forward = [process_string(sent) for sent in predictions]
processed_references_forward = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_forward, 'Reference': processed_references_forward})
df.to_csv('zh_to_en_t5_{model_name}_forwards_translations.csv', index=False)

processed_references_forward = [[reference] for reference in processed_references_forward]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_forward, processed_references_forward, tokenize='zh')
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_forward, processed_references_forward, tokenize='zh', smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")



BLEU Score without smoothing: BLEU = BLEU = 10.24 64.3/22.2/3.8/2.0 (BP = 1.000 ratio = 1.000 hyp_len = 28 ref_len = 28)
BLEU Score with smoothing: BLEU = BLEU = 14.63 64.3/25.0/7.4/3.8 (BP = 1.000 ratio = 1.000 hyp_len = 28 ref_len = 28)


2 way evaluation from chinese to english back to chinese

In [14]:
prefix = 'translate to en: '

first_predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row['zh'])
  src_text = prefix + row['zh']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  first_predictions.append(translation)
  references.append(row['zh'])

references = [[reference] for reference in references]

Row:  1 , translating:  但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
translation:  But that doesn't fully explain the severity of the problem, because it doesn't indicate the thickness of the ice cap.
Row:  2 , translating:  感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
translation:  The Arctic ice cap feels like a beating heart in the global climate system.
Row:  3 , translating:  冬天心脏舒张，夏天心脏收缩
translation:  In winter the heart is sluggish and in summer the heart contractes.
Row:  4 , translating:  下面我要展示的是 在过去25年里的极剧变化
translation:  Here's what I want to show you about the dramatic changes over the last 25 years.
Row:  5 , translating:  红色的是永冻冰
translation:  The red is permafrost.
Row:  8 , translating:  在25年的时间里，它从这里，到了这里
translation:  In twenty-five years, it came from here.
Row:  10 , translating:  如果突破顶点，温室气体排放量 将是现有大气层中的全球温室污染总量
translation:  If we break through the peak, greenhouse gas emissions will be the total global greenhouse pollution in the existing atmosphere.
Row:  11 , translating:  在阿拉斯加的一些浅湖里 已经可以看到

In [15]:
prefix = 'translate to zh: '

predictions = []

for index, pred in enumerate(first_predictions): # Translate back to zh
  print("Row: ", str(index), ", translating: ", pred)
  src_text = prefix + pred
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)

Row:  0 , translating:  But that doesn't fully explain the severity of the problem, because it doesn't indicate the thickness of the ice cap.


translation:  但这并不能完全解释问题的严重性,因为它没有表明冰盖的厚度。
Row:  1 , translating:  The Arctic ice cap feels like a beating heart in the global climate system.
translation:  北极冰盖感觉就像是全球气候系统中一颗跳动的心脏。
Row:  2 , translating:  In winter the heart is sluggish and in summer the heart contractes.
translation:  在冬天,心脏是缓慢的,而在夏天,心脏收缩。
Row:  3 , translating:  Here's what I want to show you about the dramatic changes over the last 25 years.
translation:  这里是我想向你展示过去25年来的戏剧性变化。
Row:  4 , translating:  The red is permafrost.
translation:  红色是永恒的。
Row:  5 , translating:  In twenty-five years, it came from here.
translation:  二十五年后,它从这里来。
Row:  6 , translating:  If we break through the peak, greenhouse gas emissions will be the total global greenhouse pollution in the existing atmosphere.
translation:  如果我们突破高峰期,温室气体排放将是现有大气中全球温室气体总污染。
Row:  7 , translating:  In some shallow lakes in Alaska, biobubbles of probe heads can already be seen in the water.
translation:  在阿拉斯加的一些浅湖中,已经在水中可以看到探头的生物泡沫。
Row:  8 , translating: 

In [16]:
processed_predictions_backward_2way = []
processed_references_backward_2way = []

def process_string(text):
    return replace_punctuation_with_zh(remove_spaces(text))

def remove_spaces(text):
    return ''.join(text.split())

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_zh(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        ',': '，',
        '.': '。',
        '?': '？',
        '!': '！',
        ':': '：',
        ';': '；',
        '"': '”',
        "'": '’',
        '(': '（',
        ')': '）',
        '[': '【',
        ']': '】',
        '{': '｛',
        '}': '｝'
    }

    # Replace each English punctuation with its Chinese counterpart
    for eng_punc, chi_punc in punctuation_mapping.items():
        text = text.replace(eng_punc, chi_punc)

    return text

processed_predictions_backward_2way = [process_string(sent) for sent in predictions]
processed_references_backward_2way = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_backward_2way, 'Reference': processed_references_backward_2way})
df.to_csv('zh_to_en_t5_{model_name}_bidirectional_translations_zh.csv', index=False)

processed_references_backward_2way = [[reference] for reference in processed_references_backward_2way]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_backward_2way, processed_references_backward_2way, tokenize='zh')
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_backward_2way, processed_references_backward_2way, tokenize='zh', smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 41.94 96.6/75.0/37.0/11.5 (BP = 1.000 ratio = 1.000 hyp_len = 29 ref_len = 29)
BLEU Score with smoothing: BLEU = BLEU = 45.44 96.6/75.9/39.3/14.8 (BP = 1.000 ratio = 1.000 hyp_len = 29 ref_len = 29)


2 way evaluation from english to chinese back to english

In [17]:
prefix = 'translate to zh: '

first_predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row["en"])
  src_text = prefix + row['en']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  first_predictions.append(translation)
  references.append(row['en'])

references = [[reference] for reference in references]


Row:  1 , translating:  But this understates the seriousness of this particular problem  because it doesn't show the thickness of the ice.
translation:  但這低估了這個特殊問題的嚴重性,因為它沒有顯示冰的厚度。
Row:  2 , translating:  The arctic ice cap is, in a sense,  the beating heart of the global climate system.
translation:  从某种意义上说,北极冰盖是全球气候系统的心脏。
Row:  3 , translating:  It expands in winter and contracts in summer.
translation:  它在冬季扩张,在夏季收缩。
Row:  4 , translating:  The next slide I show you will be  a rapid fast-forward of what's happened over the last 25 years.
translation:  我向您展示的下一张幻灯片将是过去25年中发生的事情的快速前瞻。
Row:  5 , translating:  The permanent ice is marked in red.
translation:  永久性冰被标记为红色。
Row:  8 , translating:  In 25 years it's gone from this, to this.
translation:  25年后,它从这个到那个。
Row:  10 , translating:  Compared to the total amount of global warming pollution in the atmosphere,  that amount could double if we cross this tipping point.
translation:  与全球变暖的大气污染总量相比,如果我们越过这个临界点,这一数字可能会翻一番。
Row:  11 , tr

In [18]:
prefix = 'translate to en: '

predictions = []

for index, pred in enumerate(first_predictions): # Translate back to zh
  print("Row: ", str(index), ", translating: ", pred)
  src_text = prefix + pred
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)

Row:  0 , translating:  但這低估了這個特殊問題的嚴重性,因為它沒有顯示冰的厚度。
translation:  But that underestimates the severity of this particular problem, because it doesn't show the thickness of the ice.
Row:  1 , translating:  从某种意义上说,北极冰盖是全球气候系统的心脏。
translation:  In a sense, the Arctic ice sheet is at the heart of the global climate system.
Row:  2 , translating:  它在冬季扩张,在夏季收缩。
translation:  It expands in winter and contractes in summer.
Row:  3 , translating:  我向您展示的下一张幻灯片将是过去25年中发生的事情的快速前瞻。
translation:  The next slide I'll show you will be a quick-forward look at what has happened over the past 25 years.
Row:  4 , translating:  永久性冰被标记为红色。
translation:  The permanent ice is marked as red.
Row:  5 , translating:  25年后,它从这个到那个。
translation:  Twenty-five years later, it went from this to that.
Row:  6 , translating:  与全球变暖的大气污染总量相比,如果我们越过这个临界点,这一数字可能会翻一番。
translation:  Compared to the total amount of atmospheric pollution from global warming, this number could double if we crossed that threshold.
Row:  7 

In [19]:
def process_string(text):
    return replace_punctuation_with_en(text)

'''def remove_spaces(text):
    return ''.join(text.split())'''

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_en(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        '，': ',',
        '。': '.',
        '？': '?',
        '！': '!',
        '：': ':',
        '；': ';',
        '”': '"',
        '’': "'",
        '（': '(',
        '）': ')',
        '【': '[',
        '】': ']',
        '｛': '{',
        '｝': '}'
    }

    # Replace each English punctuation with its Chinese counterpart
    for chi_punc, eng_punc in punctuation_mapping.items():
        text = text.replace(chi_punc, eng_punc)

    return text

processed_predictions_forward_2way = [process_string(sent) for sent in predictions]
processed_references_forward_2way = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_forward_2way, 'Reference': processed_references_forward_2way})
df.to_csv('zh_to_en_t5_{model_name}_bidirectional_translations_en.csv', index=False)

processed_references_forward_2way = [[reference] for reference in processed_references_forward_2way]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_forward_2way, processed_references_forward_2way)
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_forward_2way, processed_references_forward_2way, smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 66.08 90.0/73.7/61.1/47.1 (BP = 1.000 ratio = 1.000 hyp_len = 20 ref_len = 20)
BLEU Score with smoothing: BLEU = BLEU = 67.95 90.0/75.0/63.2/50.0 (BP = 1.000 ratio = 1.000 hyp_len = 20 ref_len = 20)
