In [4]:
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [5]:
!pip install hazm nltk

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gensim<5.0.0,>=4.3.1 (from hazm)
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Collecting scipy<1.14.0,>=1.7.0

In [1]:
import os
from hazm import Normalizer, SentenceTokenizer, WordTokenizer
import re
import nltk
from tqdm.notebook import tqdm # For progress bar

# Download NLTK data (punkt for sentence tokenizer)
nltk.download('punkt')

# Initialize Hazm tools
normalizer = Normalizer()
sentence_tokenizer = SentenceTokenizer()
word_tokenizer = WordTokenizer()

# Function to clean and normalize text
def clean_text(text):
    text = normalizer.normalize(text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to preprocess a single legal text file
def preprocess_legal_text_initial(file_path, output_dir):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        cleaned_text = clean_text(text)
        sentences = sentence_tokenizer.tokenize(cleaned_text)

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Create output file path
        file_name = os.path.basename(file_path)
        output_file_path = os.path.join(output_dir, f"processed_{file_name}")

        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            for sentence in sentences:
                if sentence.strip(): # Write non-empty sentences
                    f_out.write(sentence.strip() + '\n')

        print(f"Successfully processed {file_name} and saved to {output_file_path}")
        return True
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return False
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return False

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Define your input directory for raw legal texts
input_legal_texts_dir = '/content/drive/MyDrive/my_legal_corpus' # ***** این مسیر رو اصلاح کن *****

# Define the output directory for initially processed text files
processed_output_dir = '/content/drive/MyDrive/processed_legal_texts_temp'
os.makedirs(processed_output_dir, exist_ok=True)

# Process all files in the input directory
print(f"Starting initial preprocessing of files from: {input_legal_texts_dir}")
processed_files_count = 0
for filename in tqdm(os.listdir(input_legal_texts_dir), desc="Processing files"):
    if filename.endswith(".txt"): # Process only .txt files
        file_path = os.path.join(input_legal_texts_dir, filename)
        if preprocess_legal_text_initial(file_path, processed_output_dir):
            processed_files_count += 1

print(f"\nFinished initial preprocessing. Total files processed: {processed_files_count}")

Starting initial preprocessing of files from: /content/drive/MyDrive/my_legal_corpus


Processing files:   0%|          | 0/6 [00:00<?, ?it/s]

Successfully processed asasi.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_asasi.txt
Successfully processed madani.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_madani.txt
Successfully processed dadresimadani.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_dadresimadani.txt
Successfully processed dadresikefari.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_dadresikefari.txt
Successfully processed tejarat.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_tejarat.txt
Successfully processed MOJAZAT.txt and saved to /content/drive/MyDrive/processed_legal_texts_temp/processed_MOJAZAT.txt

Finished initial preprocessing. Total files processed: 6


In [3]:
# Define the path for the final combined text file
final_combined_text_file_path = '/content/drive/MyDrive/all_legal_sentences.txt' # این فایل ورودی اصلی برای توکنایزر و مدل BERT خواهد بود

print(f"Combining all processed sentences into: {final_combined_text_file_path}")
total_lines = 0
with open(final_combined_text_file_path, 'w', encoding='utf-8') as outfile:
    for filename in tqdm(os.listdir(processed_output_dir), desc="Combining files"):
        if filename.startswith("processed_") and filename.endswith(".txt"):
            filepath = os.path.join(processed_output_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if line.strip(): # Write only non-empty lines
                        outfile.write(line)
                        total_lines += 1

print(f"\nAll processed sentences combined. Total lines written: {total_lines}")
print("Initial preprocessing and data consolidation complete!")

Combining all processed sentences into: /content/drive/MyDrive/all_legal_sentences.txt


Combining files:   0%|          | 0/6 [00:00<?, ?it/s]


All processed sentences combined. Total lines written: 7630
Initial preprocessing and data consolidation complete!


In [None]:
!pip install --upgrade --force-reinstall torch==2.0.1 transformers==4.30.0 datasets==2.13.0 accelerate==0.21.0 tokenizers==0.13.3 bitsandbytes==0.39.0

# بعد از اجرای این سلول، **دیگر نیازی به ریستارت مجدد نیست** چون محیط تازه ریستارت شده بود.
print("\nRequired Hugging Face libraries with specific versions installed.")
print("Please proceed to run all subsequent cells from Cell 7 onwards.")

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl.metadata (24 kB)
Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.13.0
  Downloading datasets-2.13.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl.metadata (17 kB)
Collecting tokenizers==0.13.3
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting bitsandbytes==0.39.0
  Downloading bitsandbytes-0.39.0-py3-none-any.whl.metadata (9.8 kB)
Collecting filelock (from torch==2.0.1)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions (from torch==2.0.1)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting symp

In [5]:
from tokenizers import BertWordPieceTokenizer
from pathlib import Path
import os
from transformers import AutoTokenizer

# Define the path to your combined text file in Google Drive
text_file_path = '/content/drive/MyDrive/all_legal_sentences.txt' # این مسیر فایل بزرگ نهایی شماست

# Ensure the directory for saving the tokenizer exists
tokenizer_output_dir = '/content/drive/MyDrive/custom_legal_bert_tokenizer'
os.makedirs(tokenizer_output_dir, exist_ok=True)

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

# Train the tokenizer
tokenizer.train(
    files=[text_file_path],
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

# Save the tokenizer files
tokenizer.save_model(tokenizer_output_dir)

print(f"\nTokenizer training complete! Files saved to: {tokenizer_output_dir}")
print(f"Vocab size: {tokenizer.get_vocab_size()}")

# Optional: Load and test the tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_output_dir)
test_sentence = "قانون مجازات اسلامی مشتمل بر جرایم و مجازات‌های حدود، قصاص، دیات و تعزیرات است."
encoded = loaded_tokenizer.encode_plus(test_sentence, add_special_tokens=True)
print(f"\nTest sentence: {test_sentence}")
print(f"Encoded IDs: {encoded.input_ids}")
print(f"Decoded tokens: {loaded_tokenizer.convert_ids_to_tokens(encoded.input_ids)}")


Tokenizer training complete! Files saved to: /content/drive/MyDrive/custom_legal_bert_tokenizer
Vocab size: 9800

Test sentence: قانون مجازات اسلامی مشتمل بر جرایم و مجازات‌های حدود، قصاص، دیات و تعزیرات است.
Encoded IDs: [2, 216, 298, 575, 2455, 153, 902, 56, 1554, 1142, 23, 430, 23, 3986, 56, 2228, 151, 9, 3]
Decoded tokens: ['[CLS]', 'قانون', 'مجازات', 'اسلامی', 'مشتمل', 'بر', 'جرایم', 'و', 'مجازاتهای', 'حدود', '،', 'قصاص', '،', 'دیات', 'و', 'تعزیرات', 'است', '.', '[SEP]']


In [6]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict # اضافه کردن DatasetDict
import os

# Define the path where your custom tokenizer was saved
tokenizer_output_dir = '/content/drive/MyDrive/custom_legal_bert_tokenizer'

# Define the path to your combined text file in Google Drive
text_file_path = '/content/drive/MyDrive/all_legal_sentences.txt' # مسیر فایل متنی بزرگت

# 1. Load the custom tokenizer
print("Loading custom tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_output_dir)
print("Tokenizer loaded successfully.")

# 2. Load the dataset (رویکرد اصلاح شده برای خواندن فایل متنی)
print(f"Loading dataset from: {text_file_path}")
try:
    with open(text_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    lines = [line.strip() for line in lines if line.strip()]

    raw_dataset = Dataset.from_dict({"text": lines})
    dataset = DatasetDict({"train": raw_dataset})

    print("Dataset loaded successfully.")
    print(f"Dataset structure: {dataset}")

    print("\nSample from dataset:")
    print(dataset["train"][0])

except FileNotFoundError:
    print(f"Error: The file '{text_file_path}' was not found. Please ensure the path is correct and the file exists.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Loading custom tokenizer...
Tokenizer loaded successfully.
Loading dataset from: /content/drive/MyDrive/all_legal_sentences.txt
Dataset loaded successfully.
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 7630
    })
})

Sample from dataset:
{'text': 'قانون اساسی جمهوری اسلامی ایران بسم\u200cالله الرحمن الرحیم لقد أرسلنا رسلنا بالبینات و أنزلنا معهم الکتاب و ٱلمیزان لیقوم الناس بالقسط قانون اساسی جمهوری اسلامی ایران مبین نهادهای فرهنگی، اجتماعی، سیاسی و اقتصادی جامعه ایران براساس اصول و ضوابط اسلامی است که انعکاس خواست قلبی امت اسلامی می\u200cباشد.'}


In [7]:
# Function to tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

print("Tokenizing dataset...")
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=os.cpu_count(),
    remove_columns=["text"],
)
print("Dataset tokenized successfully.")
print(f"Tokenized dataset structure: {tokenized_datasets}")

block_size = 128

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

print(f"Grouping texts into blocks of size {block_size}...")
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=os.cpu_count(),
)
print("Text grouping complete.")
print(f"Final dataset structure for MLM: {lm_datasets}")

print("\nSample of prepared MLM dataset (first example):")
print(lm_datasets["train"][0])

Tokenizing dataset...


Map (num_proc=2):   0%|          | 0/7630 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset tokenized successfully.
Tokenized dataset structure: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7630
    })
})
Grouping texts into blocks of size 128...


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=2):   0%|          | 0/7630 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Text grouping complete.
Final dataset structure for MLM: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2351
    })
})

Sample of prepared MLM dataset (first example):
{'input_ids': [2, 216, 1578, 1126, 575, 539, 6849, 7993, 7994, 8249, 93, 264, 2354, 305, 708, 82, 305, 1776, 1590, 148, 56, 199, 1329, 305, 198, 2720, 208, 7613, 56, 6533, 6694, 1495, 89, 172, 7454, 1776, 89, 488, 216, 1578, 1126, 575, 539, 6703, 3178, 1917, 23, 1503, 23, 1742, 56, 2296, 2377, 539, 4145, 2441, 56, 2956, 575, 151, 150, 8549, 1308, 5841, 84, 1046, 575, 936, 9, 3, 2, 2928, 1478, 7225, 575, 539, 56, 5248, 3776, 1706, 2050, 136, 3187, 226, 4814, 150, 132, 933, 8433, 3438, 56, 9631, 1065, 7239, 215, 1706, 8601, 149, 914, 302, 175, 1308, 1578, 154, 1203, 485, 56, 7169, 132, 7135, 175, 4814, 2697, 23, 1819, 3668, 144, 494, 883, 7258, 142, 199, 192, 149, 5880, 93, 9, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [9]:
from transformers import BertConfig, BertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import os

# 3. Configure the BERT model from scratch
print("Initializing BERT configuration and model...")
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    num_hidden_layers=6,
    num_attention_heads=6,
    hidden_size=384,
    type_vocab_size=2,
)

model = BertForMaskedLM(config=config)
print(f"Number of parameters in the new model: {model.num_parameters()}")

# 4. Set up Training Arguments
output_dir = '/content/drive/MyDrive/legal_bert_pretraining_output'
os.makedirs(output_dir, exist_ok=True)

print("Setting up Training Arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    do_train=True,
    gradient_accumulation_steps=1,
    # Uncomment and set to True if you have compatible GPU for faster training:
    # fp16=True,
    # bf16=True,
)

print("Setting up Data Collator for Masked Language Modeling...")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# 5. Initialize the Trainer and start pre-training
print("Initializing Trainer and starting pre-training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    data_collator=data_collator,
)

print("\n--- Starting BERT Pre-training ---")
trainer.train()
print("\nBERT Pre-training completed!")

# Save the final model
final_model_save_path = os.path.join(output_dir, "final_model")
model.save_pretrained(final_model_save_path)
tokenizer.save_pretrained(final_model_save_path)
print(f"\nFinal pre-trained model and tokenizer saved to: {final_model_save_path}")

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)