## 1 Import libraries

In [None]:
!pip install datasets --quiet

from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd
import os

In [None]:
# Attach Google Drive to the Colab filesystem (this cell also handles the
# authentication step).
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 2 Data Acquisition

In [None]:
# Language codes of the translation pair this run works on; they are passed
# to create_datasets and used to name the output folder.
orig, target = "es", "zh"

In [None]:
def create_datasets(dataset="news_commentary", source="en", target="es", train_size=0.70, seed=20):
  """
  Load a parallel corpus and split it into train, validation and test parts.

  The language pair is requested as "<source>-<target>" when that name is in
  the list of known pairs below, otherwise as "<target>-<source>". The
  "train" split of the loaded dataset is divided with `train_test_split`;
  the held-out remainder is then cut in half: the first half becomes the
  validation set and the second half the test set (the test set receives the
  extra example when the remainder has an odd length).

  Args:
    dataset: dataset name passed to `load_dataset`.
    source: source language code, e.g. "en".
    target: target language code, e.g. "es".
    train_size: forwarded to `train_test_split` as `train_size`
      (default 0.70, the value that used to be hard-coded).
    seed: forwarded to `train_test_split` as `seed`
      (default 20, the value that used to be hard-coded).

  Returns:
    A tuple (train, val, test) holding the "translation" column of each part.
  """
  # Pair names that are requested exactly as written. Presumably these are the
  # configuration names published for news_commentary -- verify before using
  # this function with a different dataset.
  known_pairs = {
      'ar-cs',
      'ar-de', 'cs-de',
      'ar-en', 'cs-en', 'de-en',
      'ar-es', 'cs-es', 'de-es', 'en-es',
      'ar-fr', 'cs-fr', 'de-fr', 'en-fr', 'es-fr',
      'ar-it', 'cs-it', 'de-it', 'en-it', 'es-it', 'fr-it',
      'ar-ja', 'cs-ja', 'de-ja', 'en-ja', 'es-ja', 'fr-ja',
      'ar-nl', 'cs-nl', 'de-nl', 'en-nl', 'es-nl', 'fr-nl', 'it-nl',
      'ar-pt', 'cs-pt', 'de-pt', 'en-pt', 'es-pt', 'fr-pt', 'it-pt', 'nl-pt',
      'ar-ru', 'cs-ru', 'de-ru', 'en-ru', 'es-ru', 'fr-ru', 'it-ru', 'ja-ru', 'nl-ru', 'pt-ru',
      'ar-zh', 'cs-zh', 'de-zh', 'en-zh', 'es-zh', 'fr-zh', 'it-zh', 'ja-zh', 'nl-zh', 'pt-zh', 'ru-zh',
  }

  # Any pair that is not listed is requested in the opposite order instead.
  pair = source + '-' + target
  if pair not in known_pairs:
    pair = target + '-' + source

  corpus = load_dataset(dataset, pair)

  # First cut: training portion versus everything else.
  split = corpus["train"].train_test_split(train_size=train_size, seed=seed)
  train = split["train"]["translation"]
  held_out = split["test"]["translation"]

  # Second cut: halve the remainder into validation and test.
  midpoint = len(held_out) // 2
  val = held_out[:midpoint]
  test = held_out[midpoint:]

  return train, val, test

In [None]:
# Build the three splits for the configured language pair.
train, val, test = create_datasets(source=orig, target=target)



  0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Spot-check the first training example.
train[0]

{'es': 'Asimismo, hace unos años, el mundo se escandalizó al saber que el famoso escritor italiano Ignazio Silone había colaborado en su juventud con la policía fascista. La vida diaria bajo el totalitarismo, ya fuera comunista o fascista, estuvo basada rutinariamente en una profunda duplicidad cuyos efectos son muy duraderos.',
 'zh': '此外，昆德拉事件远远不是独此一例。2006年，诺贝尔奖获得者德国作家君特·格拉斯被揭露在60年前，当他还是一个十几岁的孩子的时候，曾是武装党卫队的成员。同样地，在几年以前，世界震惊地发现著名的意大利作家纳齐奥·西隆尼在年青的时候曾与法西斯警察勾结。每天生活在极权统治下，不管是共产主义还是法西斯统治，常常都会陷于深深的双重性中，而这种影响将会是非常深远的。'}

In [None]:
# Create the output directory for this language pair.
# makedirs(..., exist_ok=True) creates missing parent folders and does not
# raise FileExistsError when the cell is re-run, unlike a bare os.mkdir.
# NOTE(review): the path is relative, so it resolves under the Drive mount
# only if the working directory is /content -- confirm.
path = f'drive/MyDrive/MIDS/W266/Final_Project/bert2bert-finetuned/{orig}_{target}'
os.makedirs(path, exist_ok=True)

# Save splits to separate csv files, to load only part at a time later.
# The file paths are derived from `path` so the directory is spelled only once.
train_file = f'{path}/train_pairs.csv'
val_file = f'{path}/val_pairs.csv'
test_file = f'{path}/test_pairs.csv'

pd.DataFrame(train).to_csv(train_file)
pd.DataFrame(val).to_csv(val_file)
pd.DataFrame(test).to_csv(test_file)