<a href="https://colab.research.google.com/github/len-rtz/plus-facile/blob/main/finetuning-models/finetuning-BARThez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.1

# Data Cleaning

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
import re
from tqdm import tqdm

# Load TSV file
df = pd.read_csv('wivico_dataset_v2.tsv', sep='\t')
print(f"Original dataset size: {len(df)} pairs")

# Filter for simplification pairs (pair == 0)
simplification_df = df[df['pair (0: simplification, 1: complexification)'] == 0]
print(f"Simplification pairs: {len(simplification_df)}")

In [None]:
# 1. Basic cleaning and filtering
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    return text.strip()

# Apply cleaning
simplification_df['wiki_sent_clean'] = simplification_df['wiki_sent'].apply(clean_text)
simplification_df['viki_sent_clean'] = simplification_df['viki_sent'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplification_df['wiki_sent_clean'] = simplification_df['wiki_sent'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplification_df['viki_sent_clean'] = simplification_df['viki_sent'].apply(clean_text)


In [None]:
# 2. Length checks
simplification_df['complex_len'] = simplification_df['wiki_sent_clean'].apply(len)
simplification_df['simple_len'] = simplification_df['viki_sent_clean'].apply(len)

# Filter out empty pairs or too short texts
min_length = 10
simplification_df = simplification_df[(simplification_df['complex_len'] > min_length) &
                                      (simplification_df['simple_len'] > min_length)]
print(f"After removing short texts: {len(simplification_df)} pairs")

After removing short texts: 42475 pairs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplification_df['complex_len'] = simplification_df['wiki_sent_clean'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simplification_df['simple_len'] = simplification_df['viki_sent_clean'].apply(len)


In [None]:
# 3. Simplification verification
# Verify if the "simple" text is actually simpler than the "complex" text
# We can use basic metrics like length ratio, word count, etc.

simplification_df['word_count_complex'] = simplification_df['wiki_sent_clean'].apply(lambda x: len(x.split()))
simplification_df['word_count_simple'] = simplification_df['viki_sent_clean'].apply(lambda x: len(x.split()))
simplification_df['char_ratio'] = simplification_df['simple_len'] / simplification_df['complex_len']
simplification_df['word_ratio'] = simplification_df['word_count_simple'] / simplification_df['word_count_complex']

# Define reasonable thresholds for simplification
# Usually simple text should be shorter or at least not much longer
max_length_ratio = 1.5  # Simple text should not be 50% longer than complex
min_length_ratio = 0.3  # Simple text should not be 70% shorter than complex

simplification_df = simplification_df[(simplification_df['char_ratio'] <= max_length_ratio) &
                                      (simplification_df['char_ratio'] >= min_length_ratio)]
print(f"After simplification ratio check: {len(simplification_df)} pairs")

After simplification ratio check: 40099 pairs


In [None]:
# 4. Content similarity check
# Ensure that simple and complex texts are actually related
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_content_similarity(df, sample_size=1000):
    sample_df = df.sample(min(sample_size, len(df)))

    vectorizer = TfidfVectorizer()
    all_texts = list(sample_df['wiki_sent_clean']) + list(sample_df['viki_sent_clean'])
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    similarities = []
    n = len(sample_df)
    for i in range(n):
        sim = cosine_similarity(tfidf_matrix[i:i+1], tfidf_matrix[i+n:i+n+1])[0][0]
        similarities.append(sim)

    return similarities

similarities = compute_content_similarity(simplification_df)
avg_similarity = np.mean(similarities)
print(f"Average content similarity between complex and simple texts: {avg_similarity:.4f}")

Average content similarity between complex and simple texts: 0.5671


In [None]:
# Filter out pairs with very low similarity
similarity_threshold = 0.3
low_similarity_count = sum(s < similarity_threshold for s in similarities)
print(f"Pairs with similarity below {similarity_threshold}: {low_similarity_count} ({low_similarity_count/len(similarities)*100:.2f}%)")

Pairs with similarity below 0.3: 73 (7.30%)


In [None]:
# 5. Create final cleaned dataset
final_df = simplification_df[['wiki_sent_clean', 'viki_sent_clean']].rename(
    columns={'wiki_sent_clean': 'complex', 'viki_sent_clean': 'simple'})

print(f"\nFinal dataset size: {len(final_df)} pairs")


Final dataset size: 40099 pairs


In [None]:
# Save a sample of the data to inspect
final_df.sample(10).to_csv('sample_cleaned_data.csv', index=False)

In [None]:
# Statistics summary
print("\nData Statistics:")
print(f"Average complex text length: {final_df['complex'].str.len().mean():.2f} characters")
print(f"Average simple text length: {final_df['simple'].str.len().mean():.2f} characters")
print(f"Average complex words: {final_df['complex'].apply(lambda x: len(x.split())).mean():.2f}")
print(f"Average simple words: {final_df['simple'].apply(lambda x: len(x.split())).mean():.2f}")


Data Statistics:
Average complex text length: 238.12 characters
Average simple text length: 167.75 characters
Average complex words: 38.49
Average simple words: 28.07


# Model Finetuning

In [None]:
# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(final_df)

# Tokenize the entire dataset first
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split the tokenized dataset (80% train, 20% validation)
tokenized_splits = tokenized_dataset.train_test_split(test_size=0.2)

In [None]:
# Download BARThez model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize model and tokenizer
model_name = "moussaKam/barthez"  # French BART model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Prepare data for training
def preprocess_function(examples):
 inputs = examples["complex"]
 targets = examples["simple"]
 model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
 labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
 model_inputs["labels"] = labels["input_ids"]
return model_inputs
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Define Training arguments
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
 output_dir="./results",
 evaluation_strategy="epoch",
 learning_rate=5e-5,
 per_device_train_batch_size=8,
 per_device_eval_batch_size=8,
 weight_decay=0.01,
 save_total_limit=3,
 num_train_epochs=3,
 predict_with_generate=True,
 push_to_hub=False,
)

In [None]:
# Define trainer and train
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
import numpy as np
import evaluate

# Metric for evaluation
metric = evaluate.load("rouge")
def compute_metrics(eval_pred):
 predictions, labels = eval_pred
 decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

# Replace -100 in the labels as we can't decode them
 labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Rouge expects a newline after each sentence
 decoded_preds = [pred.strip() for pred in decoded_preds]
 decoded_labels = [label.strip() for label in decoded_labels]
 result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

# Extract the median ROUGE scores
 result = {key: value * 100 for key, value in result.items()}
return {k: round(v, 4) for k, v in result.items()}

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize trainer
trainer = Seq2SeqTrainer(
 model,
 training_args,
 train_dataset=tokenized_datasets["train"],
 eval_dataset=tokenized_datasets["test"],
 data_collator=data_collator,
 tokenizer=tokenizer,
 compute_metrics=compute_metrics
)
# Train the model
trainer.train()