In [4]:
import re
import sacrebleu

import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import T5ForConditionalGeneration, T5Config, T5Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW

In [5]:
model_name = 'utrobinmv/t5_translate_en_ru_zh_base_200'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(65100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(65100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [6]:
import os

# Path to your checkpoint
latest_checkpoint_path = '/Users/lauralee/Desktop/skoo/AY2324/sem 2/CS4248/project/latest_checkpoint_t5_base_consistency.pt'
model_name = "t5_base_finetune_cossim" # For naming of csv files (i.e. base, finetune, cosinesim, sbert)

# Load the checkpoint
checkpoint = torch.load(latest_checkpoint_path, map_location='cpu')

# Update the model's parameters
model.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [7]:
file_path_val = '/Users/lauralee/Desktop/skoo/AY2324/sem 2/CS4248/project/validation.csv'

val_df_unfiltered = pd.read_csv(file_path_val)
val_df_unfiltered

Unnamed: 0,en,zh
0,Last year I showed these two slides so that d...,去年我给各位展示了两个 关于北极冰帽的演示 在过去三百万年中 其面积由相当于美国南方48州面...
1,But this understates the seriousness of this p...,但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
2,"The arctic ice cap is, in a sense, the beatin...",感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
3,It expands in winter and contracts in summer.,冬天心脏舒张，夏天心脏收缩
4,The next slide I show you will be a rapid fas...,下面我要展示的是 在过去25年里的极剧变化
...,...,...
874,"You increase paralysis, and you decrease satis...",你增加了的是瘫痪，减少了的是满足。
875,Everybody needs a fishbowl.,每个人都需要这么个“鱼缸”。
876,This one is almost certainly too limited -- p...,至于这个（鱼缸）嘛，对这条鱼 可能是小了点，对于我们几乎肯定是太小了。
877,But the absence of some metaphorical fishbowl ...,但是，没有这么个象征性的鱼缸那就 意味着苦难将至， 也许是，灾难。


In [8]:
val_df = []

for index, row in val_df_unfiltered.iterrows():
  src_text = row['zh']
  ref_text = row['en']
  input_tokens = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  reference_tokens = tokenizer(ref_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  if input_tokens.size(1) > 27 or reference_tokens.size(1) > 27: # Since we translating both ways, check both sides to align predictions and references array
    continue
  else:
    val_df.append(row)

val_df = pd.DataFrame(val_df)
val_df

Unnamed: 0,en,zh
1,But this understates the seriousness of this p...,但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
2,"The arctic ice cap is, in a sense, the beatin...",感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
3,It expands in winter and contracts in summer.,冬天心脏舒张，夏天心脏收缩
4,The next slide I show you will be a rapid fas...,下面我要展示的是 在过去25年里的极剧变化
5,The permanent ice is marked in red.,红色的是永冻冰
...,...,...
874,"You increase paralysis, and you decrease satis...",你增加了的是瘫痪，减少了的是满足。
875,Everybody needs a fishbowl.,每个人都需要这么个“鱼缸”。
876,This one is almost certainly too limited -- p...,至于这个（鱼缸）嘛，对这条鱼 可能是小了点，对于我们几乎肯定是太小了。
877,But the absence of some metaphorical fishbowl ...,但是，没有这么个象征性的鱼缸那就 意味着苦难将至， 也许是，灾难。


backwards evaluation from chinese to english

In [9]:
prefix = 'translate to en: '

predictions, references = [], []

for index, row in val_df.iterrows(): 
  print("Row: ", str(index), ", translating: ", row["zh"])
  src_text = prefix + row['zh']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)
  references.append(row['en'])

references = [[reference] for reference in references]

Row:  1 , translating:  但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
translation:  But these do not fully explain the severity of the problem, because this does not indicate the thickness of the ice cap.
Row:  2 , translating:  感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
translation:  The Arctic ice cap feels like a beating heart in the global climate system.
Row:  3 , translating:  冬天心脏舒张，夏天心脏收缩
translation:  The heart is swelled in winter and the heart contractes in summer.
Row:  4 , translating:  下面我要展示的是 在过去25年里的极剧变化
translation:  Below I want to show you the dramatic changes that have taken place over the past 25 years.
Row:  5 , translating:  红色的是永冻冰
translation:  The red is the permafrost ice.
Row:  8 , translating:  在25年的时间里，它从这里，到了这里
translation:  25 years later, it came from here to here.
Row:  10 , translating:  如果突破顶点，温室气体排放量 将是现有大气层中的全球温室污染总量
translation:  If it breaks through, the GHG emissions will be the total global greenhouse gas pollutant in the existing atmosphere.
Row:  11 , translating:  在阿拉斯加的一

In [10]:
def process_string(text):
    return replace_punctuation_with_en(text)

'''def remove_spaces(text):
    return ''.join(text.split())'''

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_en(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        '，': ',',
        '。': '.',
        '？': '?',
        '！': '!',
        '：': ':',
        '；': ';',
        '”': '"',
        '’': "'",
        '（': '(',
        '）': ')',
        '【': '[',
        '】': ']',
        '｛': '{',
        '｝': '}'
    }

    # Replace each English punctuation with its Chinese counterpart
    for chi_punc, eng_punc in punctuation_mapping.items():
        text = text.replace(chi_punc, eng_punc)

    return text

processed_predictions_backward = [process_string(sent) for sent in predictions]
processed_references_backward = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_backward, 'Reference': processed_references_backward})
df.to_csv(f'zh_to_en_t5_{model_name}_backwards_translations.csv', index=False)

processed_references_backward = [[reference] for reference in processed_references_backward]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_backward, processed_references_backward)
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_backward, processed_references_backward, smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 26.51 87.5/43.5/13.6/9.5 (BP = 1.000 ratio = 1.000 hyp_len = 24 ref_len = 24)
BLEU Score with smoothing: BLEU = BLEU = 31.23 87.5/45.8/17.4/13.6 (BP = 1.000 ratio = 1.000 hyp_len = 24 ref_len = 24)


forward evaluation from english to chinese

In [11]:
prefix = 'translate to zh: '

predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row["en"])
  src_text = prefix + row['en']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)
  references.append(row['zh'])

references = [[reference] for reference in references]


Row:  1 , translating:  But this understates the seriousness of this particular problem  because it doesn't show the thickness of the ice.
translation:  但是这低估了这个问题的严重性, 因为它没有显示冰的厚度。
Row:  2 , translating:  The arctic ice cap is, in a sense,  the beating heart of the global climate system.
translation:  从某种意义上说,北极冰盖是全球气候系统的跳动中心。
Row:  3 , translating:  It expands in winter and contracts in summer.
translation:  它在冬天扩张,在夏天收缩。
Row:  4 , translating:  The next slide I show you will be  a rapid fast-forward of what's happened over the last 25 years.
translation:  下一张幻灯片将展示过去25年里发生的事情。
Row:  5 , translating:  The permanent ice is marked in red.
translation:  永久性冰被标记为红色。
Row:  8 , translating:  In 25 years it's gone from this, to this.
translation:  25年过去了。
Row:  10 , translating:  Compared to the total amount of global warming pollution in the atmosphere,  that amount could double if we cross this tipping point.
translation:  与全球变暖的污染总量相比, 如果跨过这个转折点,这一数量可以翻一番。
Row:  11 , translating:  Alread

In [12]:
def process_string(text):
    return replace_punctuation_with_zh(remove_spaces(text))

def remove_spaces(text):
    return ''.join(text.split())

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_zh(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        ',': '，',
        '.': '。',
        '?': '？',
        '!': '！',
        ':': '：',
        ';': '；',
        '"': '”',
        "'": '’',
        '(': '（',
        ')': '）',
        '[': '【',
        ']': '】',
        '{': '｛',
        '}': '｝'
    }

    # Replace each English punctuation with its Chinese counterpart
    for eng_punc, chi_punc in punctuation_mapping.items():
        text = text.replace(eng_punc, chi_punc)

    return text

processed_predictions_forward = [process_string(sent) for sent in predictions]
processed_references_forward = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_forward, 'Reference': processed_references_forward})
df.to_csv(f'zh_to_en_t5_{model_name}_forwards_translations.csv', index=False)

processed_references_forward = [[reference] for reference in processed_references_forward]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_forward, processed_references_forward, tokenize='zh')
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_forward, processed_references_forward, tokenize='zh', smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")



BLEU Score without smoothing: BLEU = BLEU = 49.68 100.0/73.1/40.0/20.8 (BP = 1.000 ratio = 1.000 hyp_len = 27 ref_len = 27)
BLEU Score with smoothing: BLEU = BLEU = 52.37 100.0/74.1/42.3/24.0 (BP = 1.000 ratio = 1.000 hyp_len = 27 ref_len = 27)


2 way evaluation from chinese to english back to chinese

In [13]:
prefix = 'translate to en: '

first_predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row['zh'])
  src_text = prefix + row['zh']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  first_predictions.append(translation)
  references.append(row['zh'])

references = [[reference] for reference in references]

Row:  1 , translating:  但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
translation:  But these do not fully explain the severity of the problem, because this does not indicate the thickness of the ice cap.
Row:  2 , translating:  感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
translation:  The Arctic ice cap feels like a beating heart in the global climate system.
Row:  3 , translating:  冬天心脏舒张，夏天心脏收缩
translation:  The heart is swelled in winter and the heart contractes in summer.
Row:  4 , translating:  下面我要展示的是 在过去25年里的极剧变化
translation:  Below I want to show you the dramatic changes that have taken place over the past 25 years.
Row:  5 , translating:  红色的是永冻冰
translation:  The red is the permafrost ice.
Row:  8 , translating:  在25年的时间里，它从这里，到了这里
translation:  25 years later, it came from here to here.
Row:  10 , translating:  如果突破顶点，温室气体排放量 将是现有大气层中的全球温室污染总量
translation:  If it breaks through, the GHG emissions will be the total global greenhouse gas pollutant in the existing atmosphere.
Row:  11 , translating:  在阿拉斯加的一

In [14]:
prefix = 'translate to zh: '

predictions = []

for index, pred in enumerate(first_predictions): # Translate back to zh
  print("Row: ", str(index), ", translating: ", pred)
  src_text = prefix + pred
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)

Row:  0 , translating:  But these do not fully explain the severity of the problem, because this does not indicate the thickness of the ice cap.
translation:  但是这些并不能完全解释问题的严重性, 因为这并不能说明冰盖的厚度。
Row:  1 , translating:  The Arctic ice cap feels like a beating heart in the global climate system.
translation:  北极的冰盖感觉就像是全球气候系统中跳动的心脏。
Row:  2 , translating:  The heart is swelled in winter and the heart contractes in summer.
translation:  心脏在冬天膨胀,夏天收缩。
Row:  3 , translating:  Below I want to show you the dramatic changes that have taken place over the past 25 years.
translation:  下面我将向你们展示过去25年里发生的巨大变化。
Row:  4 , translating:  The red is the permafrost ice.
translation:  红色是永久冻冰。
Row:  5 , translating:  25 years later, it came from here to here.
translation:  25年后,它从这里来到这里。
Row:  6 , translating:  If it breaks through, the GHG emissions will be the total global greenhouse gas pollutant in the existing atmosphere.
translation:  如果它突破,全球温室气体排放量 将是现有大气中温室气体的总和。
Row:  7 , translating:  In some sh

In [15]:
processed_predictions_backward_2way = []
processed_references_backward_2way = []

def process_string(text):
    return replace_punctuation_with_zh(remove_spaces(text))

def remove_spaces(text):
    return ''.join(text.split())

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_zh(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        ',': '，',
        '.': '。',
        '?': '？',
        '!': '！',
        ':': '：',
        ';': '；',
        '"': '”',
        "'": '’',
        '(': '（',
        ')': '）',
        '[': '【',
        ']': '】',
        '{': '｛',
        '}': '｝'
    }

    # Replace each English punctuation with its Chinese counterpart
    for eng_punc, chi_punc in punctuation_mapping.items():
        text = text.replace(eng_punc, chi_punc)

    return text

processed_predictions_backward_2way = [process_string(sent) for sent in predictions]
processed_references_backward_2way = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_backward_2way, 'Reference': processed_references_backward_2way})
df.to_csv(f'zh_to_en_t5_{model_name}_bidirectional_translations_zh.csv', index=False)

processed_references_backward_2way = [[reference] for reference in processed_references_backward_2way]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_backward_2way, processed_references_backward_2way, tokenize='zh')
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_backward_2way, processed_references_backward_2way, tokenize='zh', smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 44.76 93.8/67.7/36.7/17.2 (BP = 1.000 ratio = 1.000 hyp_len = 32 ref_len = 32)
BLEU Score with smoothing: BLEU = BLEU = 47.26 93.8/68.8/38.7/20.0 (BP = 1.000 ratio = 1.000 hyp_len = 32 ref_len = 32)


2 way evaluation from english to chinese back to english

In [16]:
prefix = 'translate to zh: '

first_predictions, references = [], []

for index, row in val_df.iterrows():
  print("Row: ", str(index), ", translating: ", row["en"])
  src_text = prefix + row['en']
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  first_predictions.append(translation)
  references.append(row['en'])

references = [[reference] for reference in references]


Row:  1 , translating:  But this understates the seriousness of this particular problem  because it doesn't show the thickness of the ice.
translation:  但是这低估了这个问题的严重性, 因为它没有显示冰的厚度。
Row:  2 , translating:  The arctic ice cap is, in a sense,  the beating heart of the global climate system.
translation:  从某种意义上说,北极冰盖是全球气候系统的跳动中心。
Row:  3 , translating:  It expands in winter and contracts in summer.
translation:  它在冬天扩张,在夏天收缩。
Row:  4 , translating:  The next slide I show you will be  a rapid fast-forward of what's happened over the last 25 years.
translation:  下一张幻灯片将展示过去25年里发生的事情。
Row:  5 , translating:  The permanent ice is marked in red.
translation:  永久性冰被标记为红色。
Row:  8 , translating:  In 25 years it's gone from this, to this.
translation:  25年过去了。
Row:  10 , translating:  Compared to the total amount of global warming pollution in the atmosphere,  that amount could double if we cross this tipping point.
translation:  与全球变暖的污染总量相比, 如果跨过这个转折点,这一数量可以翻一番。
Row:  11 , translating:  Alread

In [17]:
prefix = 'translate to en: '

predictions = []

for index, pred in enumerate(first_predictions): # Translate back to zh
  print("Row: ", str(index), ", translating: ", pred)
  src_text = prefix + pred
  input_ids = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
  generated_tokens = model.generate(input_ids)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  print("translation: ", translation)
  predictions.append(translation)

Row:  0 , translating:  但是这低估了这个问题的严重性, 因为它没有显示冰的厚度。
translation:  But this underestimates the severity of the problem, because it doesn't show the thickness of the ice.
Row:  1 , translating:  从某种意义上说,北极冰盖是全球气候系统的跳动中心。
translation:  In a sense, the Arctic ice sheet is the beating center of the global climate system.
Row:  2 , translating:  它在冬天扩张,在夏天收缩。
translation:  It expands in winter, contracting in summer.
Row:  3 , translating:  下一张幻灯片将展示过去25年里发生的事情。
translation:  The next slide will show what has happened over the past 25 years.
Row:  4 , translating:  永久性冰被标记为红色。
translation:  The permanent ice is marked with red.
Row:  5 , translating:  25年过去了。
translation:  25年过去了。
Row:  6 , translating:  与全球变暖的污染总量相比, 如果跨过这个转折点,这一数量可以翻一番。
translation:  Compared to the total amount of global warming pollution, this number could be doubled.
Row:  7 , translating:  在阿拉斯加的一些浅湖里, 甲烷正积极地从水中涌出。
translation:  In some shallow lakes in Alaska, methane is actively pouring out of the water.
Row:  8 , t

In [18]:
def process_string(text):
    return replace_punctuation_with_en(text)

'''def remove_spaces(text):
    return ''.join(text.split())'''

# Replacing en punctuation with the zh counterparts
def replace_punctuation_with_en(text):
    # Define a mapping of English punctuation to Chinese punctuation
    punctuation_mapping = {
        '，': ',',
        '。': '.',
        '？': '?',
        '！': '!',
        '：': ':',
        '；': ';',
        '”': '"',
        '’': "'",
        '（': '(',
        '）': ')',
        '【': '[',
        '】': ']',
        '｛': '{',
        '｝': '}'
    }

    # Replace each English punctuation with its Chinese counterpart
    for chi_punc, eng_punc in punctuation_mapping.items():
        text = text.replace(chi_punc, eng_punc)

    return text

processed_predictions_forward_2way = [process_string(sent) for sent in predictions]
processed_references_forward_2way = [process_string(sent[0]) for sent in references]

df = pd.DataFrame({'Prediction': processed_predictions_forward_2way, 'Reference': processed_references_forward_2way})
df.to_csv(f'zh_to_en_t5_{model_name}_bidirectional_translations_en.csv', index=False)

processed_references_forward_2way = [[reference] for reference in processed_references_forward_2way]

bleu_score_no_smoothing = sacrebleu.corpus_bleu(processed_predictions_forward_2way, processed_references_forward_2way)
bleu_score_add_1 = sacrebleu.corpus_bleu(processed_predictions_forward_2way, processed_references_forward_2way, smooth_method="add-k", smooth_value=1)

print(f"BLEU Score without smoothing: BLEU = {bleu_score_no_smoothing}")
print(f"BLEU Score with smoothing: BLEU = {bleu_score_add_1}")


BLEU Score without smoothing: BLEU = BLEU = 63.36 89.5/77.8/52.9/43.8 (BP = 1.000 ratio = 1.000 hyp_len = 19 ref_len = 19)
BLEU Score with smoothing: BLEU = BLEU = 65.55 89.5/78.9/55.6/47.1 (BP = 1.000 ratio = 1.000 hyp_len = 19 ref_len = 19)
