## downloading required packages

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install evaluate
!pip install sacrebleu
!pip install sentencepiece
!pip install huggingface_hub
!pip install sacremoses



## doing imports

In [None]:
import os
import re
import requests
import numpy as np
import torch
import nltk

from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    MarianConfig,
    MarianTokenizer,
    GenerationConfig,
    AutoModel,
    pipeline
)
from huggingface_hub import (
    HfApi,
    create_repo,
    notebook_login,
    login
)
import evaluate


## loading and pre-processing the data

### load the dataset

In [None]:
# Download the Arabic-English configuration of OPUS-100 and pick the split to clean.
try:
    dataset = load_dataset("Helsinki-NLP/opus-100", "ar-en")
    # opus-100 is typically split into train, validation, test
    # Let's focus on the training set for cleaning as it's the largest
    data_to_clean = dataset['train']
    print(f"Original dataset size (train): {len(data_to_clean)}")
except Exception as e:
    # Possible causes include no internet access or an incorrect config name.
    print(f"Error loading dataset: {e}")
    # Re-raise: every later cell needs `data_to_clean`, so continuing would only
    # move the failure to a confusing NameError further down.
    raise


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train-00000-of-00001.parquet:   0%|          | 0.00/99.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/979k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Original dataset size (train): 1000000


### starting the pre-processing

1. remove duplicated translation pairs function

In [None]:
def remove_duplicates(dataset: Dataset) -> Dataset:
    """Drop repeated (English, Arabic) pairs, keeping the first occurrence of each.

    Args:
        dataset: Rows shaped like ``{'translation': {'en': str, 'ar': str}}``.

    Returns:
        ``dataset.select(...)`` over the indices of the first occurrences,
        in their original order.
    """
    seen_pairs = set()
    indices_to_keep = []
    for i, ex in enumerate(dataset):
        pair = ex['translation']
        # Use a tuple key rather than an "en|ar" string: with a joined string,
        # ("a|b", "c") and ("a", "b|c") would wrongly be treated as duplicates.
        key = (pair['en'], pair['ar'])
        if key not in seen_pairs:
            seen_pairs.add(key)
            indices_to_keep.append(i)
    print(f"Removed {len(dataset) - len(indices_to_keep)} duplicates.")
    return dataset.select(indices_to_keep)


2. remove very small sentences (less than 2 words) and very long sentences (longer than 100 words)

In [None]:
def filter_short_long(dataset: Dataset, min_words=2, max_words=100) -> Dataset:
    """Keep only pairs whose English and Arabic sides both have an acceptable word count.

    Words are whitespace-separated tokens. A pair is kept when each side has
    between ``min_words`` and ``max_words`` words, inclusive.
    """
    def _word_count_ok(text):
        n_words = len(text.split())
        return min_words <= n_words <= max_words

    def _keep(row):
        pair = row['translation']
        english, arabic = pair['en'], pair['ar']
        english_ok = _word_count_ok(english)
        arabic_ok = _word_count_ok(arabic)
        return english_ok and arabic_ok

    size_before = len(dataset)
    kept = dataset.filter(_keep)
    print(f"Removed {size_before - len(kept)} sentences due to length.")
    return kept


3. remove suspicious translation pairs whose two sides differ too much in length (e.g. 3 words translated into 12 words)

In [None]:

def filter_length_ratio(dataset: Dataset, max_ratio=3.0) -> Dataset:
    """Drop pairs where one side has far more words than the other.

    The ratio is (longer word count) / (shorter word count), with words split
    on whitespace. Pairs with an empty side, or a ratio above ``max_ratio``,
    are removed.
    """
    def _keep(row):
        pair = row['translation']
        counts = (len(pair['en'].split()), len(pair['ar'].split()))
        shorter, longer = min(counts), max(counts)

        # An empty side makes the ratio undefined; treat the pair as invalid.
        if shorter == 0:
            return False

        return longer / shorter <= max_ratio

    size_before = len(dataset)
    kept = dataset.filter(_keep)
    print(f"Removed {size_before - len(kept)} sentences due to length ratio.")
    return kept

4. removing useless whitespaces

In [None]:
def normalize_whitespace(dataset: Dataset) -> Dataset:
    """Collapse whitespace runs to one space and trim both ends, for both languages.

    The ``'en'`` and ``'ar'`` entries of each row's ``'translation'`` dict are
    rewritten; the row is otherwise returned unchanged to ``dataset.map``.
    """
    def _squash(text):
        return re.sub(r'\s+', ' ', text).strip()

    def _normalize(example):
        pair = example['translation']
        pair['en'] = _squash(pair['en'])
        pair['ar'] = _squash(pair['ar'])
        return example

    return dataset.map(_normalize)


5. removing pairs that contain too many non-alphabetic characters

In [None]:

def filter_non_alphabetic_ratio(dataset: Dataset, max_ratio=0.3) -> Dataset:
    """Removes sentence pairs where non-alphabetic characters exceed a certain ratio.

    For each side the ratio is ``(len(text) - alphabetic_count) / len(text)``.
    A pair is kept only when both ratios are ``<= max_ratio``. Everything the
    patterns below do not match (whitespace, punctuation, ASCII digits, ...)
    counts as non-alphabetic.

    Args:
        dataset: Rows shaped like ``{'translation': {'en': str, 'ar': str}}``.
        max_ratio: Largest tolerated share of non-alphabetic characters per side.

    Returns:
        The filtered dataset returned by ``dataset.filter``.
    """
    en_alpha_pattern = re.compile(r'[a-zA-Z]')
    # Arabic letters (U+0621-U+064A), the combining diacritics U+064B-U+0652,
    # and Arabic-Indic digits (U+0660-U+0669). The diacritics must be included:
    # otherwise vocalized text is counted as mostly "non-alphabetic" and dropped.
    ar_alpha_pattern = re.compile(r'[\u0621-\u064A\u064B-\u0652\u0660-\u0669]')

    def _filter(example):
        en_text = example['translation']['en']
        ar_text = example['translation']['ar']

        en_alpha_count = len(en_alpha_pattern.findall(en_text))
        ar_alpha_count = len(ar_alpha_pattern.findall(ar_text))

        # The small epsilon avoids division by zero for empty text (ratio becomes 0).
        en_ratio = (len(en_text) - en_alpha_count) / (len(en_text) + 1e-9)
        ar_ratio = (len(ar_text) - ar_alpha_count) / (len(ar_text) + 1e-9)

        return en_ratio <= max_ratio and ar_ratio <= max_ratio

    original_size = len(dataset)
    filtered_dataset = dataset.filter(_filter)
    print(f"Removed {original_size - len(filtered_dataset)} sentences due to non-alphabetic characters ratio.")
    return filtered_dataset

6. filter URLs and Emails

In [None]:

def filter_urls_emails(dataset: Dataset) -> Dataset:
    """Drop every pair in which either side contains a URL or an e-mail address.

    A side matches when it contains ``http://`` or ``https://`` followed by
    non-space characters, ``www.`` followed by non-space characters, or an
    ``@`` with non-space characters on both sides.
    """
    link_or_mail = re.compile(r'http[s]?://\S+|www\.\S+|\S+@\S+')

    def _is_clean(row):
        pair = row['translation']
        sides = (pair['en'], pair['ar'])
        # A single hit on either language is enough to reject the pair.
        has_hit = any(link_or_mail.search(text) for text in sides)
        return not has_hit

    size_before = len(dataset)
    kept = dataset.filter(_is_clean)
    print(f"Removed {size_before - len(kept)} sentences containing URLs or emails.")
    return kept



### using the functions

In [None]:

# Run the cleaning functions in sequence; each step takes the previous step's output.
print("\nStarting cleaning process...")

# Step 1: drop repeated (en, ar) pairs.
cleaned_data = remove_duplicates(data_to_clean)

# Step 4: normalize whitespace (done early to make the length calculations more accurate).
cleaned_data = normalize_whitespace(cleaned_data)

# Step 2: remove pairs where either side is too short or too long.
# Adjust min/max words based on your needs. 2 words is a reasonable minimum.
cleaned_data = filter_short_long(cleaned_data, min_words=2, max_words=100)

# Step 3: remove pairs with extreme length ratios.
# A ratio of 3.0 means one sentence has no more than 3 times as many words as the other.
cleaned_data = filter_length_ratio(cleaned_data, max_ratio=3.0)

# Step 5: remove pairs with excessive non-alphabetic characters.
# This helps remove noisy strings. Adjust the ratio if needed.
cleaned_data = filter_non_alphabetic_ratio(cleaned_data, max_ratio=0.3)
# Step 6: remove pairs containing URLs or e-mail addresses.
cleaned_data = filter_urls_emails(cleaned_data)

print(f"\nFinished cleaning.")


Starting cleaning process...
Removed 40071 duplicates.


Map:   0%|          | 0/959929 [00:00<?, ? examples/s]

Filter:   0%|          | 0/959929 [00:00<?, ? examples/s]

Removed 61577 sentences due to length.


Filter:   0%|          | 0/898352 [00:00<?, ? examples/s]

Removed 10602 sentences due to length ratio.


Filter:   0%|          | 0/887750 [00:00<?, ? examples/s]

Removed 188242 sentences due to non-alphabetic characters ratio.


Filter:   0%|          | 0/699508 [00:00<?, ? examples/s]

Removed 116 sentences containing URLs or emails.

Finished cleaning.


In [None]:
# Compare the row count before and after the whole cleaning pipeline.
print(f"Original dataset size (train): {len(data_to_clean)}")
print(f"Final dataset size (train): {len(cleaned_data)}")

Original dataset size (train): 1000000
Final dataset size (train): 699392


7. saving data into session storage for future use:

In [None]:
# Persist the cleaned training split; the training section reloads it from this same path.
cleaned_data.save_to_disk("/content/cleaned_opus_ar_en")

Saving the dataset (0/1 shards):   0%|          | 0/699392 [00:00<?, ? examples/s]

## Training Phase!

1. downloading the NLTK sentence tokenizer and preparing the tokenizer configuration

In [None]:
# Download NLTK's 'punkt' sentence tokenizer data.
# NOTE(review): no visible cell in this notebook calls NLTK afterwards (BLEU is
# computed with sacreBLEU through `evaluate`) — confirm whether this download is needed.
nltk.download('punkt', quiet=True)

# Language codes for source and target.
# These are the keys inside each row's 'translation' dict.
SOURCE_LANG = "ar"
TARGET_LANG = "en"

# Tokenizer configuration:
# maximum number of tokens kept per source / target sequence (longer ones are truncated).
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

2. choosing the model

In [None]:
# Hub id of the pretrained Arabic-to-English checkpoint that gets fine-tuned below.
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-ar-en"

3. logs and model weights saving path

In [None]:
# Directory that receives the training output (saved model, tokenizer, metrics, logs).
OUTPUT_DIR = "./opus-mt-ar-en-finetuned"

4. loading the cleaned dataset

In [None]:
# Location of the cleaned dataset written by `save_to_disk` in the cleaning section.
DATASET_PATH = "/content/cleaned_opus_ar_en"


5. setting up training arguments **!**

In [None]:

# Hyperparameters for fine-tuning; outputs go under OUTPUT_DIR and logs under OUTPUT_DIR/logs.
TRAINING_ARGS = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch", # Run evaluation once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3, # NOTE(review): presumably caps how many checkpoints are kept on disk — confirm
    num_train_epochs=1, # You can adjust the number of epochs
    predict_with_generate=True, # Required for BLEU calculation during evaluation
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
    push_to_hub=False, # Set to True if you want to push the model to Hugging Face Hub
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    save_steps=500, # Save checkpoint every 500 steps
    eval_steps=500, # NOTE(review): eval_strategy is "epoch" above, so this step interval is presumably unused — confirm against the TrainingArguments docs
    report_to="none",
)


6. loading the dataset

In [None]:
# Reload the cleaned dataset and make sure 'train' and 'validation' splits exist.
print("--- Loading Dataset ---")
try:
    dataset = load_from_disk(DATASET_PATH)
    print(f"Dataset loaded successfully from disk at: {DATASET_PATH}")

    print(dataset)

    # Decide by type first. A membership test such as `'train' in dataset` on a
    # single Dataset is not a split lookup: it falls back to iterating over every
    # row, which is slow on a large dataset.
    if isinstance(dataset, Dataset):
        # A single Dataset was saved, so carve a validation split out of it.
        print("Warning: Dataset does not have 'train' or 'validation' splits. Attempting to create them.")
        print("Splitting the single dataset into train and validation...")
        # Adjust test_size as needed, ensure it's not too small
        split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
        dataset = DatasetDict({
            'train': split_dataset['train'],
            'validation': split_dataset['test']
        })
        print("Created 'train' and 'validation' splits.")
        print(dataset)
    elif isinstance(dataset, DatasetDict):
        if 'train' not in dataset or 'validation' not in dataset:
            print("Warning: Dataset does not have 'train' or 'validation' splits. Attempting to create them.")
            # Nothing is created automatically here: rename or add splits manually,
            # e.g. `dataset['train'] = dataset.pop('my_train_data')`.
            print(f"DatasetDict has splits: {list(dataset.keys())}. Please ensure 'train' and 'validation' are present or adjust the code.")
        # Otherwise both splits are already present and nothing needs to be done.
    else:
        raise TypeError("Error: Dataset format not recognized or missing necessary splits ('train', 'validation').")

except Exception as e:
    print(f"Error loading dataset: {e}")
    # Raise instead of calling exit(): exit() does not stop the running cell
    # in a notebook, while an exception halts execution right here.
    raise

--- Loading Dataset ---
Dataset loaded successfully from disk at: /content/cleaned_opus_ar_en
Dataset({
    features: ['translation'],
    num_rows: 699392
})
Splitting the single dataset into train and validation...
Created 'train' and 'validation' splits.
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 629452
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 69940
    })
})


7. loading the tokenizer and the model

In [None]:

# Load the pretrained tokenizer and seq2seq model named by MODEL_CHECKPOINT.
print("\n--- Loading Tokenizer and Model ---")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
    print(f"Tokenizer and Model loaded successfully from '{MODEL_CHECKPOINT}'.")
except Exception as e:
    print(f"Error loading tokenizer or model: {e}")
    # Raise instead of exit(): later cells cannot work without `tokenizer` and
    # `model`, and exit() does not stop the running cell in a notebook.
    raise



--- Loading Tokenizer and Model ---


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Tokenizer and Model loaded successfully from 'Helsinki-NLP/opus-mt-ar-en'.


8. tokenize the source and the target language

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose ``"translation"`` column is a list of dicts
            keyed by language code, e.g. ``{'ar': '...', 'en': '...'}``.

    Returns:
        The tokenizer output for the source sentences with an added
        ``"labels"`` entry holding the token ids of the target sentences.
    """
    inputs = [ex[SOURCE_LANG] for ex in examples["translation"]]
    targets = [ex[TARGET_LANG] for ex in examples["translation"]]

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target=` tokenizes the sentences as targets. It replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches; the result gains input_ids, attention_mask and labels.
try:
    tokenized_datasets = dataset.map(preprocess_function, batched=True)
    print("Dataset tokenized successfully.")
    print(tokenized_datasets)
except Exception as e:
    print(f"Error during preprocessing: {e}")
    # Raise instead of exit(): training needs `tokenized_datasets`, and exit()
    # does not stop the running cell in a notebook.
    raise


Map:   0%|          | 0/629452 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Map:   0%|          | 0/69940 [00:00<?, ? examples/s]

Dataset tokenized successfully.
DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 629452
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 69940
    })
})


- data collator (builds each training batch by padding its examples)

In [None]:
# Data collator for seq2seq tasks. It prepares batches by padding and creating attention masks.
# It is later passed to the Seq2SeqTrainer as `data_collator`.

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


Helper functions for evaluation: clean up the decoded text and compute the BLEU score

In [None]:

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the references for the BLEU metric.

    Returns:
        A pair ``(preds, labels)``: ``preds`` is a flat list of stripped
        strings, and each stripped label is wrapped in its own one-element list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the loaded BLEU metric.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays;
            ``predictions`` may itself be a tuple whose first item holds the ids.

    Returns:
        ``{"bleu": <score>}`` taken from the ``"score"`` field of the metric result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is a padding/ignore marker, not a token id, so it cannot be decoded.
    # Replace it in the predictions as well as the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process text
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute BLEU score
    # sacrebleu expects references as a list of lists
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Add per-example metrics (optional, can be slow on large datasets)
    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    # result["gen_len"] = np.mean(prediction_lens)
    # result = {k: round(v, 4) for k, v in result.items()}

    return result


9. setting up evaluation metric

In [None]:
# Load the SacreBLEU metric that `compute_metrics` uses through the global `metric`.
try:
    metric = evaluate.load("sacrebleu")
    print("SacreBLEU metric loaded.")
except Exception as e:
    print(f"Error loading SacreBLEU metric: {e}")
    # Raise instead of exit(): evaluation cannot run without `metric`, and
    # exit() does not stop the running cell in a notebook.
    raise


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

SacreBLEU metric loaded.


10. starting the training

In [None]:

# Build the trainer from the model, arguments, tokenized splits and metric callback.
try:
    trainer = Seq2SeqTrainer(
        model=model,
        args=TRAINING_ARGS,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    print("Trainer initialized.")
except Exception as e:
    print(f"Error initializing Trainer: {e}")
    # Re-raise: without `trainer` the training step below would only fail
    # with an unrelated NameError.
    raise

print("\n--- Starting Training ---")
try:
    train_result = trainer.train()
    # Save the final model and the tokenizer side by side in the output directory.
    trainer.save_model()
    tokenizer.save_pretrained(TRAINING_ARGS.output_dir)

    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    print("\nTraining finished successfully!")
    print(metrics)

except Exception as e:
    print(f"Error during training: {e}")
    # trainer.save_model(f"{OUTPUT_DIR}/checkpoint-error")
    # Re-raise so a failed run is not mistaken for a finished one by later cells.
    raise


  trainer = Seq2SeqTrainer(


Trainer initialized.

--- Starting Training ---


Epoch,Training Loss,Validation Loss




Epoch,Training Loss,Validation Loss,Bleu
1,1.3764,1.201518,43.506685


***** train metrics *****
  epoch                    =        1.0
  total_flos               =  7617419GF
  train_loss               =     1.2844
  train_runtime            = 2:25:22.37
  train_samples_per_second =     72.165
  train_steps_per_second   =       4.51

Training finished successfully!
{'train_runtime': 8722.3704, 'train_samples_per_second': 72.165, 'train_steps_per_second': 4.51, 'total_flos': 8179141557878784.0, 'train_loss': 1.2843725846842247, 'epoch': 1.0}


## Evaluation

- Starting Evaluation

In [None]:
# I think it's useless because we are already evaluating after the epoch
#try:
#     eval_metrics = trainer.evaluate()
#     trainer.log_metrics("eval", eval_metrics)
#     trainer.save_metrics("eval", eval_metrics)
#     print("\nEvaluation finished.")
#     print(eval_metrics)
#except Exception as e:
#     print(f"Error during evaluation: {e}")
#     pass # Don't exit if evaluation fails, training might still be useful


- testing with live examples...

In [None]:
# Smoke test: reload the saved model and tokenizer from disk and translate a few Arabic sentences.
try:

    fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(TRAINING_ARGS.output_dir)
    fine_tuned_tokenizer = AutoTokenizer.from_pretrained(TRAINING_ARGS.output_dir)


    # Set the language attributes only if this tokenizer class exposes them.
    if hasattr(fine_tuned_tokenizer, 'src_lang') and hasattr(fine_tuned_tokenizer, 'tgt_lang'):
        fine_tuned_tokenizer.src_lang = SOURCE_LANG
        fine_tuned_tokenizer.tgt_lang = TARGET_LANG

    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    fine_tuned_model.to(device)

    arabic_sentences = [
        "مرحبا بالعالم!",
        "كيف حالك اليوم؟",
        "هذه جملة للاختبار.",
        "أنا أتعلم الترجمة الآلية.",
    ]

    # Tokenize the input sentences as one padded batch on the same device as the model
    inputs = fine_tuned_tokenizer(arabic_sentences, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LENGTH).to(device)

    # Add num_beams for beam search decoding (usually improves quality)
    generated_tokens = fine_tuned_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=5,
        max_length=MAX_TARGET_LENGTH,
    )

    # Decode the generated tokens
    decoded_translations = fine_tuned_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)


    # Print each source sentence next to its translation.
    for original, translated in zip(arabic_sentences, decoded_translations):
        print(f"Arabic: {original}")
        print(f"English: {translated}")
        print("-" * 20)

except Exception as e:
    # Deliberately best-effort: report the problem and let the notebook continue.
    print(f"Error during inference example: {e}")

    pass # Don't exit if inference fails


Arabic: مرحبا بالعالم!
English: Welcome to the world!
--------------------
Arabic: كيف حالك اليوم؟
English: How are you today?
--------------------
Arabic: هذه جملة للاختبار.
English: That's a test sentence.
--------------------
Arabic: أنا أتعلم الترجمة الآلية.
English: I'm learning machine translation.
--------------------


## uploading the model to hugging face


In [None]:


LOCAL_MODEL_DIR = "./opus-mt-ar-en-finetuned" # <-- Ensure this matches OUTPUT_DIR from your training script

HUB_REPO_ID = "your_username/opus-mt-ar-en-finetuned-myversion" # <-- **REQUIRED: Update with your username and desired repo name**

# Optional: Add a commit message for the initial push
COMMIT_MESSAGE = "Upload fine-tuned opus-mt-ar-en model"


# --- Check for local model files ---
# The weights may be stored under either name, so accept both. If neither
# exists, `model_file` keeps the last candidate and the check below fails.
weight_file_candidates = [
    os.path.join(LOCAL_MODEL_DIR, "model.safetensors"),
    os.path.join(LOCAL_MODEL_DIR, "pytorch_model.bin"),
]
model_file = next(
    (path for path in weight_file_candidates if os.path.exists(path)),
    weight_file_candidates[-1],
)
tokenizer_config_file = os.path.join(LOCAL_MODEL_DIR, "tokenizer_config.json")

# Raise instead of exit(): exit() does not stop the running cell in a notebook,
# so the upload below would still be attempted.
if not os.path.exists(LOCAL_MODEL_DIR):
    raise FileNotFoundError(
        f"Error: Local model directory not found at {LOCAL_MODEL_DIR}\n"
        "Please ensure you have run the training script and it saved the model to this location."
    )

if not os.path.exists(model_file) or not os.path.exists(tokenizer_config_file):
    raise FileNotFoundError(
        f"Error: Model or tokenizer files not found in {LOCAL_MODEL_DIR}\n"
        "Expected files like 'model.safetensors' or 'pytorch_model.bin', and 'tokenizer_config.json'.\n"
        "Please check the contents of the directory."
    )


# Loading here checks that the saved files are usable before anything is uploaded.
print(f"\n--- Loading model and tokenizer from {LOCAL_MODEL_DIR} ---")
try:
    model = AutoModelForSeq2SeqLM.from_pretrained(LOCAL_MODEL_DIR)
    tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer from local directory: {e}")
    raise


# --- Push the model and tokenizer to the Hugging Face Hub ---
print(f"\n--- Pushing model to Hugging Face Hub repository: {HUB_REPO_ID} ---")

api = HfApi()

try:
    # Create the repository on the Hub if it doesn't exist
    # set private=True if you want the repo to be private initially
    create_repo(repo_id=HUB_REPO_ID, repo_type="model", exist_ok=True)
    print(f"Repository '{HUB_REPO_ID}' created or already exists on the Hub.")

    # Push the model and tokenizer files to the repository
    # This uploads the entire contents of LOCAL_MODEL_DIR
    api.upload_folder(
        folder_path=LOCAL_MODEL_DIR,
        repo_id=HUB_REPO_ID,
        commit_message=COMMIT_MESSAGE,
        repo_type="model",
    )
    print(f"\nSuccessfully pushed model and tokenizer to https://huggingface.co/{HUB_REPO_ID}")

except Exception as e:
    # Best-effort: a failed push (e.g. not logged in) is reported, not fatal.
    print(f"Error pushing model to Hugging Face Hub: {e}")
    print("Please ensure you are logged in (`huggingface-cli login`) and have write access to the repository.")


print("\n--- Push to Hub Script Finished ---")


Error: Model or tokenizer files not found in ./opus-mt-ar-en-finetuned
Expected files like 'pytorch_model.bin' and 'tokenizer_config.json'.
Please check the contents of the directory.
Found model and tokenizer files in ./opus-mt-ar-en-finetuned.

--- Loading model and tokenizer from ./opus-mt-ar-en-finetuned ---




Model and tokenizer loaded successfully.

--- Pushing model to Hugging Face Hub repository: your_username/opus-mt-ar-en-finetuned-myversion ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Error pushing model to Hugging Face Hub: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-681a518a-35f8c201682ce62375824c0d;6bbaabaa-edee-4b1d-8526-6c5b1aaeee67)

Invalid username or password.
Please ensure you are logged in (`huggingface-cli login`) and have write access to the repository.

--- Push to Hub Script Finished ---




In [None]:

# Authenticate with the Hugging Face Hub.
# This will prompt you to enter your Hugging Face token
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Log in without embedding a secret in the notebook: the token is read from the
# HF_TOKEN environment variable. When it is unset, `login` receives None and
# asks for the token interactively.
# SECURITY: an access token was previously hard-coded in this cell. Treat it as
# leaked and revoke it at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))

model_name = "opus-mt-ar-en-finetuned"
namespace = "KarimEmam"  # Replace with your actual username

try:
    # Create the repository
    create_repo(
        repo_id=f"{namespace}/{model_name}",
        repo_type="model",
        exist_ok=True
    )

    # Load and push your model and tokenizer.
    # Use the seq2seq class that was used for training: plain AutoModel loads the
    # base model, so the pushed model would not be the trained translation model.
    model = AutoModelForSeq2SeqLM.from_pretrained("./opus-mt-ar-en-finetuned")
    tokenizer = AutoTokenizer.from_pretrained("./opus-mt-ar-en-finetuned")
    model.push_to_hub(f"{namespace}/{model_name}")
    tokenizer.push_to_hub(f"{namespace}/{model_name}")

    print(f"Model uploaded to: https://huggingface.co/{namespace}/{model_name}")
except Exception as e:
    print(f"Error: {e}")

Some weights of MarianModel were not initialized from the model checkpoint at ./opus-mt-ar-en-finetuned and are newly initialized: ['decoder.embed_positions.weight', 'encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model uploaded to: https://huggingface.co/KarimEmam/opus-mt-ar-en-finetuned


In [None]:
# Show which Hugging Face account the CLI is currently logged in as.
!huggingface-cli whoami

KarimEmam


In [None]:
# NOTE(review): this repeats the login call from an earlier cell — confirm it is still needed.
notebook_login()  # This will show a widget to enter your token


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Repair cell: regenerate the config files next to the saved model and push again.
# 1. First properly authenticate
notebook_login()

# 2. Reconstruct missing files and upload
try:
    # Load your local model
    local_path = "./opus-mt-ar-en-finetuned"
    tokenizer = AutoTokenizer.from_pretrained(local_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(local_path)

    # Create missing generation config (derived from the model's own config) and save it locally
    generation_config = GenerationConfig.from_model_config(model.config)
    generation_config.save_pretrained(local_path)

    # Create proper Marian config: read the config from local_path and write it back there
    config = MarianConfig.from_pretrained(local_path)
    config.save_pretrained(local_path)

    # Now push everything
    # NOTE(review): push_to_hub is called on the in-memory model and tokenizer —
    # confirm the files rewritten above actually end up in the repository.
    model.push_to_hub(
        "KarimEmam/opus-mt-ar-en-finetuned",
        commit_message="Add missing config files"
    )
    tokenizer.push_to_hub("KarimEmam/opus-mt-ar-en-finetuned")

    print("Successfully fixed and uploaded repository!")

except Exception as e:
    # Any failure above is only reported; the cell does not raise.
    print(f"Error: {str(e)}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Successfully fixed and uploaded repository!


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:

# List the files currently in the Hub repository to check what was uploaded.
api = HfApi()
files = api.list_repo_files("KarimEmam/opus-mt-ar-en-finetuned")
print("Repository contents:")
for file in files:
    print(f"- {file}")

Repository contents:
- .gitattributes
- README.md
- config.json
- generation_config.json
- model.safetensors
- special_tokens_map.json
- tokenizer_config.json
- vocab.json


In [None]:

# Copy the SentencePiece model files from the base checkpoint into the fine-tuned repository.

# 1. Get the original tokenizer files
# NOTE(review): `original_tokenizer` is not used again in this cell — confirm it is still needed.
original_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")

# 2. Download the .spm files to local disk first
os.makedirs("temp_spm", exist_ok=True)

# Download source.spm
source_url = "https://huggingface.co/Helsinki-NLP/opus-mt-ar-en/resolve/main/source.spm"
source_path = "temp_spm/source.spm"
response = requests.get(source_url, timeout=60)
# Fail on an HTTP error rather than saving (and later uploading) an error page as the model file.
response.raise_for_status()
with open(source_path, "wb") as f:
    f.write(response.content)

# Download target.spm
target_url = "https://huggingface.co/Helsinki-NLP/opus-mt-ar-en/resolve/main/target.spm"
target_path = "temp_spm/target.spm"
response = requests.get(target_url, timeout=60)
response.raise_for_status()
with open(target_path, "wb") as f:
    f.write(response.content)

# 3. Upload to your repository
api = HfApi()
api.upload_file(
    path_or_fileobj=source_path,
    path_in_repo="source.spm",
    repo_id="KarimEmam/opus-mt-ar-en-finetuned",
    repo_type="model"
)
api.upload_file(
    path_or_fileobj=target_path,
    path_in_repo="target.spm",
    repo_id="KarimEmam/opus-mt-ar-en-finetuned",
    repo_type="model"
)

print("Successfully uploaded missing .spm files!")

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Successfully uploaded missing .spm files!


In [None]:

# Load the uploaded model from the Hub as a translation pipeline and translate one Arabic sentence.
translator = pipeline(
    "translation_ar_to_en",
    model="KarimEmam/opus-mt-ar-en-finetuned"
)

print(translator( "الى اين تذهب ؟"))

Device set to use cpu


[{'translation_text': 'Where are you going?'}]


In [None]:

# Second example: translate a longer Arabic sentence with the uploaded model.
# NOTE(review): this builds the same pipeline as the previous cell; the existing
# `translator` could presumably be reused instead of being recreated.
translator = pipeline(
    "translation_ar_to_en",
    model="KarimEmam/opus-mt-ar-en-finetuned"
)

print(translator( "يعتقد الامريكيون ان غزة تم احتلالها ولكن ارادة الشعب الفلسطينى عكس ذلك"))

Device set to use cpu


[{'translation_text': 'Americans believe Gaza has been occupied, but the will of the Palestinian people is opposite.'}]
