In [1]:
!pip install transformers datasets

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [2]:
import os

# Function to create a simple dataset
def create_custom_dataset(filename="custom_dataset.txt"):
    # Create a simple dataset with some text entries
    # You can replace this with your own dataset creation logic
    text_samples = [
        "مرحبا بك في عالم الذكاء الاصطناعي.",
        "تعلم الآلة هو مجال فرعي من الذكاء الاصطناعي.",
        "نماذج اللغة الكبيرة أحدثت ثورة في مجال معالجة اللغة الطبيعية.",
        "الشبكات العصبية العميقة قادرة على التعلم من البيانات.",
        "تستخدم خوارزميات التعلم العميق في العديد من التطبيقات.",
        # Add more sentences as needed
    ]

    # Write samples to a file
    with open(filename, 'w', encoding='utf-8') as f:
        for sample in text_samples:
            f.write(sample + "\n")

    return filename

# Create dataset file
dataset_file = create_custom_dataset()
print(f"Dataset created at: {dataset_file}")

Dataset created at: custom_dataset.txt


In [3]:
model_name = "gpt2"  # Can also use "gpt2-medium", "gpt2-large" or "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token to GPT2 tokenizer
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
def load_dataset(train_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )
    return dataset

# Load dataset
train_dataset = load_dataset(dataset_file, tokenizer)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)



In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the model
model_path = "./fine_tuned_gpt2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Step,Training Loss


Model saved to ./fine_tuned_gpt2


In [10]:
def generate_text(prompt, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        no_repeat_ngram_size=2,
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Load the fine-tuned model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fine_tuned_model = GPT2LMHeadModel.from_pretrained(model_path).to(device)
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(model_path)

test_prompt = "السلام عليكم"
generated_text = generate_text(test_prompt, fine_tuned_model, fine_tuned_tokenizer)
print(f"Prompt: {test_prompt}")
print(f"Generated text: {generated_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: السلام عليكم
Generated text: السلام عليكم الله أحدث الطلالى إلّذُ العالم في مع الاصغة والجالعلة البشيصة

بعدمة بال مآل من المخلاء الدععة ه


In [11]:
def evaluate_generation(model, tokenizer, prompts):
    results = []

    for prompt in prompts:
        generated = generate_text(prompt, model, tokenizer)
        results.append({
            "prompt": prompt,
            "generated": generated
        })

    return results

test_prompts = [
    "الذكاء الاصطناعي هو",
    "تعلم الآلة يساعدنا في",
    "اللغة العربية لها"
]

evaluation = evaluate_generation(fine_tuned_model, fine_tuned_tokenizer, test_prompts)

for result in evaluation:
    print(f"Prompt: {result['prompt']}")
    print(f"Generated: {result['generated']}")
    print("-" * 60)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: الذكاء الاصطناعي هو
Generated: الذكاء الاصطناعي هو مجال معال الآلة العصيرة في من اللغة. https://t.co/1NzqzPfvkG9 — كو أبي الدعلمة (@alqahiliya) February 14, 2017

The attack was reportedly carried out by
------------------------------------------------------------
Prompt: تعلم الآلة يساعدنا في
Generated: تعلم الآلة يساعدنا في الله عالم كان أحدورة من الكتاذراء العالة ثلاصطة الاطلبي.
والذقاس المجال معادةة والعبائة. مص
------------------------------------------------------------
Prompt: اللغة العربية لها
Generated: اللغة العربية لها في اللي على الذكاء الاصطاعة.

وأعلم النبواغ السباذة: بالن أجو من القحدة هو التعالة حلدثني مخ
------------------------------------------------------------


In [12]:
import json

# Save evaluation results
with open("text_generation_results.json", "w", encoding="utf-8") as f:
    json.dump(evaluation, f, ensure_ascii=False, indent=2)

# Save a sample of the generated text
with open("sample_generated_texts.txt", "w", encoding="utf-8") as f:
    for result in evaluation:
        f.write(f"Prompt: {result['prompt']}\n")
        f.write(f"Generated: {result['generated']}\n")
        f.write("-" * 60 + "\n")