In [2]:
import os
import torch
import random
import evaluate
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from dataclasses import dataclass 
from time import perf_counter
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, disable_progress_bar, load_from_disk
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from translation_utils import create_translation_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset('./data/alt')
dataset

DatasetDict({
    train: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 18088
    })
    validation: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['SNT.URLID', 'SNT.URLID.SNTID', 'url', 'translation'],
        num_rows: 1019
    })
})

In [4]:
def transform_features(example):
    # Engilsh to Chinese
    new_example = {'en': example['translation']['en'], 'zh': example['translation']['zh']}
    return new_example

# Process dataset
dataset = dataset.map(transform_features, remove_columns=['SNT.URLID', 'SNT.URLID.SNTID', 'translation', 'url'])

Map: 100%|██████████| 18088/18088 [00:01<00:00, 11603.35 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 11394.84 examples/s]
Map: 100%|██████████| 1019/1019 [00:00<00:00, 11358.04 examples/s]


In [5]:
df_train = pd.DataFrame(dataset['train'] )
df_train = df_train.dropna()
df_train.to_csv('./data/alt/train.csv', index=False)
df_train

Unnamed: 0,en,zh
0,Italy have defeated Portugal 31-5 in Pool C of...,意大利在法国巴黎王子公园体育场举办的2007年橄榄球世界杯C组以31-5击败葡萄牙。
1,Andrea Masi opened the scoring in the fourth m...,安德里亚·马西在第四分钟成功达阵，意大利率先得分。
2,Despite controlling the game for much of the f...,尽管上半场大部分时间意大利都控制着比赛，但在中场休息前，意大利未能再次达阵得分，不过大卫·博...
3,Portugal never gave up and David Penalva score...,葡萄牙从未放弃，大卫·佩纳尔瓦在第33分钟成功达阵，这是他们本场比赛唯一的得分。
4,Italy led 16-5 at half time but were matched b...,上半场意大利以16-5领先，但下半场大部分时间葡萄牙的表现不相上下。
...,...,...
18083,"""We have a very good recruiting policy and any...",周一，雅拉电车执行副总裁丹尼斯·克莱切告诉澳大利亚美联社：“我们有非常好的招聘政策，对于符合...
18084,Detective Senior Constable Barry Hills of Vict...,维多利亚警察局的侦探高级警员巴里·希尔斯谈到这个男孩时说：“他是个讨人喜欢的孩子，是个好孩子。”
18085,"""I think his obsession just got the better of ...",“我想他是因为抵受不住自己的痴迷。”
18086,Described as wearing a jacket similar to offic...,据描述，这名男孩穿着一件与雅拉电车制服相似的夹克。周日晚上，这名男孩在距电车被盗地点15公里...


In [6]:
df_val = pd.DataFrame(dataset['validation'] )
df_val = df_val.dropna()
df_val.to_csv('./data/alt/val.csv', index=False)

In [7]:
df_test = pd.DataFrame(dataset['test'] )
df_test = df_test.dropna()
df_test.to_csv('./data/alt/test.csv', index=False)

In [8]:
data_files = {'train': './data/alt/train.csv', 'validation': './data/alt/val.csv', 'test': './data/alt/test.csv'}


In [12]:
dataset_dict = load_dataset(
    "csv",
    delimiter=",",
    column_names=['en', 'zh'],
    data_files=data_files
)
dataset_dict

Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 8707.90it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2315.16it/s]
Generating train split: 18089 examples [00:00, 211546.01 examples/s]
Generating validation split: 1001 examples [00:00, 119261.97 examples/s]
Generating test split: 1019 examples [00:00, 125076.70 examples/s]


DatasetDict({
    train: Dataset({
        features: ['en', 'zh'],
        num_rows: 18089
    })
    validation: Dataset({
        features: ['en', 'zh'],
        num_rows: 1001
    })
    test: Dataset({
        features: ['en', 'zh'],
        num_rows: 1019
    })
})

In [15]:
dataset_dict['train'][1]

{'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.',
 'zh': '意大利在法国巴黎王子公园体育场举办的2007年橄榄球世界杯C组以31-5击败葡萄牙。'}