In [1]:
pip install huggingface rouge_score bert_score sacrebleu datasets transformers evaluate

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Colle

In [2]:
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

data_dir = "/content/drive/My Drive/266 Data Project/corpora"

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import transformers
import gensim
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd
import os
import json
from tqdm.autonotebook import trange, tqdm

In [None]:
class TranslationDataset:
    """
    Prepare tokenized datasets for training and evaluation without relying on DataLoader.
    """
    @staticmethod
    def prepare_dataset(english_texts, chinese_texts, tokenizer):
        # Tokenize parallel corpus
        tokenized_data = {
            "source": tokenizer(
                english_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            ),
            "target": tokenizer(
                chinese_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt'
            )
        }

        # Prepare data dictionary for Hugging Face Dataset
        dataset_dict = {
            "input_ids": tokenized_data["source"]["input_ids"],
            "attention_mask": tokenized_data["source"]["attention_mask"],
            "labels": tokenized_data["target"]["input_ids"]
        }

        # Convert to Hugging Face Dataset
        return Dataset.from_dict({key: value.tolist() for key, value in dataset_dict.items()})


class BiomedicalMarianMTEnhancer(nn.Module):
    """
    Wraps MarianMT with additional medical term embeddings.
    """
    def __init__(self, base_model, tokenizer, biowordvec_path):
        super().__init__()
        self.base_model = base_model
        self.tokenizer = tokenizer

        # Load BioWordVec embeddings
        self.biowordvec = gensim.models.KeyedVectors.load_word2vec_format(
            biowordvec_path,
            binary=True
        )

        # Create a custom embedding layer for medical terms
        embedding_dim = self.biowordvec.vector_size
        vocab_size = base_model.config.vocab_size

        # Create a custom embedding layer
        self.medical_embedding_layer = nn.Embedding(
            vocab_size,
            embedding_dim
        )

        # Initialize medical embedding layer
        self._init_medical_embeddings()

        # Additional projection layer to align embeddings
        self.projection = nn.Linear(
            embedding_dim,
            base_model.config.d_model
        )

    def _init_medical_embeddings(self):
        weight = self.medical_embedding_layer.weight.data

        for token, idx in self.tokenizer.get_vocab().items():
            clean_token = token.replace('▁', '').strip()

            try:
                # Try to get embedding for the token
                vec = self.biowordvec[clean_token]
                weight[idx] = torch.tensor(vec)
            except KeyError:
                # Fallback to default initialization
                nn.init.xavier_uniform_(weight[idx].unsqueeze(0))

    def forward(self, input_ids, labels=None, attention_mask=None):
        # Get base model embeddings
        base_embeddings = self.base_model.model.get_input_embeddings()(input_ids)

        # Get medical term embeddings
        medical_embeddings = self.medical_embedding_layer(input_ids)

        # Project medical embeddings
        projected_medical_embeddings = self.projection(medical_embeddings)

        # Combine base and medical embeddings
        combined_embeddings = base_embeddings + projected_medical_embeddings

        # Continue with standard MarianMT forward pass
        outputs = self.base_model(
            inputs_embeds=combined_embeddings,
            attention_mask=attention_mask,
            labels=labels
        )

        return outputs

    def generate(self, input_ids=None, attention_mask=None, **kwargs):
        """
        Generate translations with custom embeddings and pass them into MarianMT method as input_embeddings
        """
        if input_ids is not None:
            # Compute the base embeddings
            base_embeddings = self.base_model.model.get_input_embeddings()(input_ids)

            # Compute the medical term embeddings
            medical_embeddings = self.medical_embedding_layer(input_ids)

            # Project medical embeddings
            projected_medical_embeddings = self.projection(medical_embeddings)

            # Combine base and medical embeddings
            combined_embeddings = base_embeddings + projected_medical_embeddings

            # Use the combined embeddings for generation
            return self.base_model.generate(
                inputs_embeds=combined_embeddings,
                attention_mask=attention_mask,
                **kwargs
            )
        else:
            raise ValueError("`input_ids` must be provided for generating embeddings.")

    def save_custom(self, save_directory, tokenizer=None):
        """
        Save the model and custom embeddings.
        """
        os.makedirs(save_directory, exist_ok=True)

        # Paths
        model_save_path = os.path.join(save_directory, "model")
        embedding_save_path = os.path.join(model_save_path, "medical_embeddings.pth")
        projection_save_path = os.path.join(model_save_path, "projection_layer.pth")
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        os.makedirs(model_save_path, exist_ok=True)

        # Save the base model
        self.base_model.save_pretrained(model_save_path)

        # Save the medical embedding and projection layer
        torch.save(self.medical_embedding_layer.state_dict(), embedding_save_path)
        torch.save(self.projection.state_dict(), projection_save_path)

        # Save custom configuration
        custom_config = {
            "embedding_dim": self.medical_embedding_layer.embedding_dim,
            "vocab_size": self.medical_embedding_layer.num_embeddings
        }
        with open(custom_config_path, "w") as f:
            json.dump(custom_config, f)

        # Save tokenizer
        if tokenizer is not None:
            tokenizer.save_pretrained(tokenizer_save_path)

    def from_custom(cls, save_directory):
        """
        Load the model and custom embeddings.
        """
        # Paths
        model_save_path = os.path.join(save_directory, "model")
        embedding_save_path = os.path.join(model_save_path, "medical_embeddings.pth")
        projection_save_path = os.path.join(model_save_path, "projection_layer.pth")
        custom_config_path = os.path.join(model_save_path, "custom_config.json")
        tokenizer_save_path = os.path.join(save_directory, "tokenizer")

        # Load the base model
        base_model = transformers.MarianMTModel.from_pretrained(model_save_path)

        # Load custom configuration
        with open(custom_config_path, "r") as f:
            custom_config = json.load(f)

        # Extract custom configuration values
        embedding_dim = custom_config.get("embedding_dim")
        vocab_size = custom_config.get("vocab_size")

        # Create an instance of the enhanced model
        enhancer = cls(
            base_model=base_model,
            tokenizer=None,  # Replace with tokenizer if required
            biowordvec_path=None  # BioWordVec is not reloaded here
        )

        # Resize and initialize the medical embedding layer based on the saved config
        enhancer.medical_embedding_layer = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )

        # Load the medical embedding and projection layer states
        medical_embedding_state = torch.load(embedding_save_path)
        projection_state = torch.load(projection_save_path)
        enhancer.medical_embedding_layer.load_state_dict(medical_embedding_state)
        enhancer.projection.load_state_dict(projection_state)

        # Load tokenizer
        tokenizer = transformers.MarianTokenizer.from_pretrained(tokenizer_save_path)

        return enhancer, tokenizer



def train_biomedical_translation_model(
    base_model,
    tokenizer,
    english_texts,
    chinese_texts,
    biowordvec_path,
    test_size=0.1,
    batch_size=16,
    learning_rate=1e-4,
    num_train_epochs=3,
    output_dir="./results"
):
    # Prepare datasets
    full_dataset = TranslationDataset.prepare_dataset(english_texts, chinese_texts, tokenizer)
    split_dataset = full_dataset.train_test_split(test_size=test_size, seed=42)

    # Wrap the base model with the enhancer
    enhanced_model = BiomedicalMarianMTEnhancer(
        base_model,
        tokenizer,
        biowordvec_path
    )

    # Define Seq2Seq training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_safetensors=False,
        num_train_epochs=num_train_epochs,
        logging_dir="./logs",
        logging_steps=500,
        predict_with_generate=True,  # This is essential for seq2seq tasks like translation
        generation_num_beams=4,  # Beam search during generation
        # load_best_model_at_end=True
    )

    # Initialize Seq2SeqTrainer
    trainer = Seq2SeqTrainer(
        model=enhanced_model,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()

    return enhanced_model


In [None]:
# Main execution
# Load pretrained MarianMT model
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = transformers.MarianTokenizer.from_pretrained(model_name)
base_model = transformers.MarianMTModel.from_pretrained(model_name)


# Load your parallel corpus
dataset = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_train.parquet")
english_texts = dataset["english"].tolist()
chinese_texts = dataset["chinese"].tolist()

# Train the biomedical translation model
enhanced_model = train_biomedical_translation_model(
    base_model,
    tokenizer,
    english_texts,
    chinese_texts,
    biowordvec_path='/content/drive/MyDrive/266 Data Project/corpora/nejm/BioWordVec_PubMed_MIMICIII_d200.vec.bin',
    num_train_epochs=3
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.6251,0.467812
2,0.3613,0.343017
3,0.2983,0.318339


In [None]:
save_dir = "/content/drive/MyDrive/266 Data Project/corpora/nejm/word-vec-model"
enhanced_model.save_custom(save_dir, tokenizer)



In [4]:
from evaluate import load

def evaluate_model_metrics(predictions, references, save_path=None):
    # Load the evaluation metrics
    bleu_metric = load("bleu")
    rouge_metric = load("rouge")
    bertscore_metric = load("bertscore")
    ter_metric = load("ter")

    # Format references for metric calculation
    references = [[ref] for ref in references]

    # Evaluate BLEU score
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)

    # Evaluate ROUGE score
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)

    # Evaluate BERTScore
    bertscore_result = bertscore_metric.compute(predictions=predictions, references=references, lang="zh")

    # Evaluate TER (Translation Edit Rate)
    ter_result = ter_metric.compute(predictions=predictions, references=references)

    # Extract summary statistics for BERTScore
    bertscore_summary = {
        "mean": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
        "median": sorted(bertscore_result["f1"])[len(bertscore_result["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))**2 for x in bertscore_result["f1"]) / len(bertscore_result["f1"]))**0.5
    }

    # Consolidate results
    results = {
        "BLEU": bleu_result,
        "ROUGE": rouge_result,
        "BERTScore": bertscore_summary,
        "TER": ter_result,
    }

    return results

In [None]:
class BiomedicalTranslationEvaluator:
    """
    Evaluate the performance of a biomedical translation model.
    """
    def __init__(self, model, tokenizer, device=None):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to the specified device
        self.model.to(self.device)

    def prepare_dataset(self, english_texts, chinese_texts, max_length=512):
        """
        Prepare a dataset for evaluation.
        """
        # Tokenize source (English) texts
        source_encodings = self.tokenizer(
            english_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Tokenize target (Chinese) texts for comparison (optional)
        target_encodings = self.tokenizer(
            chinese_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move all tensors to the appropriate device
        return {
            "source_input_ids": source_encodings["input_ids"].to(self.device),
            "source_attention_mask": source_encodings["attention_mask"].to(self.device),
            "target_input_ids": target_encodings["input_ids"].to(self.device)
        }

    def generate_translations(self, dataset, batch_size=16):
        translations = []
        for i in trange(0, len(dataset["source_input_ids"]), batch_size):
            batch_input_ids = dataset["source_input_ids"][i:i + batch_size].to(self.device)
            batch_attention_mask = dataset["source_attention_mask"][i:i + batch_size].to(self.device)

            # Generate translations for the batch
            outputs = self.model.generate(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                num_beams=5,
                max_length=128,  # Adjust if needed
            )
            translations.extend(self.tokenizer.batch_decode(outputs, skip_special_tokens=True))

        return translations


    def run_evaluation(self, english_texts, chinese_texts):
        """
        Run the evaluation process.
        """
        # Prepare dataset
        dataset = self.prepare_dataset(english_texts, chinese_texts)

        # Generate translations
        translations = self.generate_translations(dataset)

        # Decode target inputs for human-readable comparison
        target_texts = self.tokenizer.batch_decode(
            dataset["target_input_ids"].to("cpu"), skip_special_tokens=True
        )

        return {
            "translations": translations,
            "targets": target_texts
        }


In [None]:
# Initialize the evaluator
evaluator = BiomedicalTranslationEvaluator(
    enhanced_model,
    tokenizer
)

test_dataset = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_test.parquet")
english_test_texts = test_dataset["english"].tolist()
chinese_test_texts = test_dataset["chinese"].tolist()

torch.cuda.empty_cache()
# Run evaluation
results = evaluator.run_evaluation(
    english_test_texts,  # List of English sentences
    chinese_test_texts   # List of Chinese reference translations
)

# Print results
print("Translations:")
print(results["translations"][0:5])
print("\nTargets:")
print(results["targets"][0:5])


  0%|          | 0/132 [00:00<?, ?it/s]

Translations:
['是 一种 结合 BRAF - ABL1   的   , 通过 与 所有 其他 ABL   不同 的 机制 , 将 BRAF - ABL1  固定 为 一种   .', '•    BRAF - ABL1 , 包括  T315I  .', '患者 的  和    尚 不 清楚 .', '在 这项 1 期 , 剂量  研究 中 , 我们 纳入 了 139 例 慢性  和 9 例 慢性   ( CML ) 患者 , 这些 患者 对 至少 两种 ATP 竞争性    ( TKI ) 产生  或 无法 接受 的  .', '主要 目的 是 确定   的 最大  剂量 或 推荐 剂量 ( 或 两者 ) .']

Targets:
['asciminib 是 与 BCR - ABL1  的  酰  相结合 的  剂 , 它 可 通过 不同于 所有 其他 ABL   的 机制 将 BCR - ABL1 锁定 在 非   .', 'asciminib 同时  作用 于 天然 和  的 BCR - ABL1 , 包括  基因 ( gatekeeper ) T315I  .', 'asciminib 用于     患者 的  和 抗   尚未 明确 .', '在 这项 1 期 剂量  研究 中 , 我们 纳入 了 141 例  和 9 例 加速 期 慢性   ( CML ) 患者 , 这些 患者  对 至少 两种 ATP 竞争性    ( TKI )  或 发生 不可 接受 的  .', '本 试验 的 主要 目的 是 确定 asciminib 的 最大  剂量 或 推荐 剂量 ( 或 这 两者 ) .']


In [None]:
evaluate_model_metrics(results["translations"], results["targets"])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BLEU': {'bleu': 0.37843446242785433,
  'precisions': [0.7342565372118212,
   0.4705194621943599,
   0.3306263280806211,
   0.2448099929627023],
  'brevity_penalty': 0.9254280983107186,
  'length_ratio': 0.928075245463194,
  'translation_length': 51704,
  'reference_length': 55711},
 'ROUGE': {'rouge1': 0.5500944885242705,
  'rouge2': 0.335626527228126,
  'rougeL': 0.5387195093250838,
  'rougeLsum': 0.5381750158780909},
 'BERTScore': {'mean': 0.9285785793464145,
  'median': 0.9323609471321106,
  'std': 0.0697446607291192},
 'TER': {'score': 46.820567775214776,
  'num_edits': 25778,
  'ref_length': 55057.0}}

In [4]:
import pandas as pd
pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/nejm_test.parquet").english[3]

'in this phase 1 , dose @-@ escalation study , we enrolled 141 patients with chronic @-@ phase and 9 with accelerated @-@ phase chronic myeloid leukemia ( CML ) who had resistance to or unacceptable side effects from at least two previous ATP @-@ competitive tyrosine kinase inhibitors ( TKIs ) .'

In [7]:
print(pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/predictions/fine-tuned_en-zh.parquet")["predicted_chinese"][0:5])

0    是 一种   , 可 结合 BCR - ABL1  的   , 通过 不同于 所有 其他 A...
1                    同时   和  BCR - ABL1 , 包括  T315I  .
2                         在     患者 中 的  和 抗   尚 不 清楚 .
3    在 这项 1 期 剂量  研究 中 , 我们 纳入 了 141 例 慢性  患者 和 9 例...
4             主要 目的 是 确定   的 最大  剂量 或 推荐 剂量 ( 或 两者 ) .
Name: predicted_chinese, dtype: object


# I realized bertscore was calculated incorrectly in other notebooks. Here are corrected values.

In [7]:
bertscore_metric = load("bertscore")
bertscore_result = bertscore_metric.compute(predictions=true_model['predicted_chinese'], references=true_model['chinese'], lang="zh")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]



In [8]:
bertscore_summary = {
        "mean": sum(bertscore_result["f1"]) / len(bertscore_result["f1"]),
        "median": sorted(bertscore_result["f1"])[len(bertscore_result["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))**2 for x in bertscore_result["f1"]) / len(bertscore_result["f1"]))**0.5
    }
bertscore_summary

{'mean': 0.8340935502502603,
 'median': 0.8449079394340515,
 'std': 0.10023681953712625}

In [None]:
bertscore_metric = load("bertscore")
bertscore_result = bertscore_metric.compute(predictions=true_model['predicted_chinese'], references=true_model['chinese'], lang="zh")

In [5]:
true_model = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/en_to_zh_predictions_no_ner.parquet")
true_model

Unnamed: 0,predicted_chinese,chinese
0,"是 一种 结合 BCR - ABL1 的 , 它 通过 与 所有 其他 ABL ...","asciminib 是 与 BCR - ABL1 的 酰 相结合 的 剂 , 它 可..."
1,"• 和 BCR - ABL1 , 包括 T315I .","asciminib 同时 作用 于 天然 和 的 BCR - ABL1 , 包括 基因..."
2,"在 患者 中 , 的 和 抗 尚 不 清楚 .",asciminib 用于 患者 的 和 抗 尚未 明确 .
3,"在 这项 1 期 剂量 研究 中 , 我们 纳入 了 142 例 慢性 患者 和 9 例...","在 这项 1 期 剂量 研究 中 , 我们 纳入 了 141 例 和 9 例 加速 期 ..."
4,主要 目的 是 确定 最大 剂量 或 推荐 剂量 ( 或 两者 ) 作为 .,本 试验 的 主要 目的 是 确定 asciminib 的 最大 剂量 或 推荐 剂量 (...
...,...,...
2097,对 同一 基因 的 患者 进行 的 研究 结果 是 的 .,"最近 对 患者 所 做 的 检测 显示 , 相同 的 基因 获得 了 结果 ."
2098,"因此 , 患者 被 认为 有 肿瘤 .","因此 , 我们 认为 患者 发生 了 肿瘤 ."
2099,"TASAVI3 与 人类 的 自身 疾病 相关 , 并且 在 刺激 期间 , T 细胞...",目前 已经 发现 TNFAIP3 与 人类 自身 疾病 以及 刺激 期间 T 细胞 导...
2100,,诊断


In [9]:
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=true_model)

https://docs.google.com/spreadsheets/d/12JiOyO0CKy6Y40Dj5aNrlYX_ujhIBeokyOAXzVnIvQA#gid=0


In [6]:
fine_tune_only = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/predictions/fine-tuned_en-zh.parquet")
fine_tune_only

Unnamed: 0,chinese,english,predicted_chinese
0,asciminib 是 与 BCR - ABL1 蛋白 的 豆蔻 酰 位点 相结合 的 别构...,asciminib is an allosteric inhibitor that bind...,"是 一种 , 可 结合 BCR - ABL1 的 , 通过 不同于 所有 其他 A..."
1,"asciminib 同时 靶向 作用 于 天然 和 突变 的 BCR - ABL1 , 包括...",asciminib targets both native and mutated BCR ...,"同时 和 BCR - ABL1 , 包括 T315I ."
2,asciminib 用于 费城 染色体 阳性 白血病 患者 的 安全性 和 抗 白血病 活性...,the safety and antileukemic activity of ascimi...,在 患者 中 的 和 抗 尚 不 清楚 .
3,"在 这项 1 期 剂量 递增 研究 中 , 我们 纳入 了 141 例 慢性期 和 9 例 ...","in this phase 1 , dose @-@ escalation study , ...","在 这项 1 期 剂量 研究 中 , 我们 纳入 了 141 例 慢性 患者 和 9 例..."
4,本 试验 的 主要 目的 是 确定 asciminib 的 最大 耐受 剂量 或 推荐 剂量...,the primary objective was to determine the max...,主要 目的 是 确定 的 最大 剂量 或 推荐 剂量 ( 或 两者 ) .
...,...,...,...
2097,"最近 对 患者 骨髓 标本 所 做 的 检测 显示 , 相同 的 基因 测序 获得 了 阴性...",recent testing of the patient 's bone marrow s...,"最近 对 患者 的 样本 进行 的 检测 发现 , 使用 相同 基因 的 结果 为 ."
2098,"因此 , 我们 认为 患者 发生 了 肿瘤 特异性 突变 .","therefore , the patient was believed to have a...","因此 , 患者 被 认为 有 肿瘤 ."
2099,目前 已经 发现 TNFAIP3 功能障碍 与 人类 自身 炎症性 疾病 以及 体外 刺激 ...,dysfunction of TNFAIP3 has been associated wit...,"在 刺激 过程 中 , TNFAIP3 与 人类 自身 疾病 相关 , 并且 与 T ..."
2100,解剖 诊断,anatomical Diagnosis,诊断


In [9]:
bertscore_result_ft = bertscore_metric.compute(predictions=fine_tune_only['predicted_chinese'], references=fine_tune_only['chinese'], lang="zh")



In [15]:
bertscore_summary_ft = {
        "mean": sum(bertscore_result_ft["f1"]) / len(bertscore_result_ft["f1"]),
        "median": sorted(bertscore_result_ft["f1"])[len(bertscore_result_ft["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result_ft["f1"]) / len(bertscore_result_ft["f1"]))**2 for x in bertscore_result_ft["f1"]) / len(bertscore_result_ft["f1"]))**0.5
    }
bertscore_summary_ft

{'mean': 0.8267586609077499,
 'median': 0.8308088779449463,
 'std': 0.07449168318166981}

In [12]:
dirty = pd.read_parquet("/content/drive/MyDrive/266 Data Project/corpora/nejm/en_to_zh_dirty_embeddings_output.parquet")


In [13]:
bertscore_result_dirty = bertscore_metric.compute(predictions=dirty['predicted_chinese'], references=dirty['chinese'], lang="zh")



In [14]:
bertscore_summary_dirty = {
        "mean": sum(bertscore_result_dirty["f1"]) / len(bertscore_result_dirty["f1"]),
        "median": sorted(bertscore_result_dirty["f1"])[len(bertscore_result_dirty["f1"]) // 2],
        "std": (sum((x - sum(bertscore_result_dirty["f1"]) / len(bertscore_result_dirty["f1"]))**2 for x in bertscore_result_dirty["f1"]) / len(bertscore_result_dirty["f1"]))**0.5
    }
bertscore_summary_dirty

{'mean': 0.6876775735318661,
 'median': 0.6972000598907471,
 'std': 0.11251091437414876}

In [12]:
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=fine_tune_only)

https://docs.google.com/spreadsheets/d/1DXWHZLCBeyA3fixiuVshD_I4UU44SzQLCxBI9KyVTCU#gid=0
