In [1]:
# Attach Google Drive to the Colab filesystem so the cells below can read
# from and write to paths under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'

drive.mount(DRIVE_MOUNT_POINT)
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [2]:
!pip install hazm nltk

Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... done
Collecting gensim<5.0.0,>=4.3.1 (from hazm)
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting pybind11>=2.2 (from fasttext-wheel<0.10.0,>=0.9.2->hazm)
  Downloading pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Collecting scipy<1.14.0,>=1.7.0

In [None]:
import os
from hazm import Normalizer, SentenceTokenizer, WordTokenizer
import re
import nltk
from tqdm.notebook import tqdm # For progress bar

# Fetch NLTK's 'punkt' resource before any tokenization is attempted.
# NOTE(review): confirm whether the Hazm tokenizers actually rely on it.
nltk.download('punkt')

# Build the Hazm helpers once at module level so every call reuses the same
# instances. `normalizer` and `sentence_tokenizer` are used by the functions
# below; `word_tokenizer` is not referenced in the cells shown here.
normalizer, sentence_tokenizer, word_tokenizer = (
    Normalizer(),
    SentenceTokenizer(),
    WordTokenizer(),
)

# Function to clean and normalize text
def clean_text(text):
    """Normalize ``text`` with the shared Hazm normalizer and tidy whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The normalized text with every run of whitespace (newlines included)
        collapsed to a single space and no leading or trailing whitespace.
    """
    normalized = normalizer.normalize(text)
    # Collapse whitespace runs first, then trim both ends.
    return re.sub(r'\s+', ' ', normalized).strip()

# Function to preprocess a single legal text file
def preprocess_legal_text_initial(file_path, output_dir):
    """Clean one raw text file and save it with one sentence per line.

    The file is read as UTF-8, passed through ``clean_text``, split into
    sentences with ``sentence_tokenizer`` and written to ``output_dir`` as
    ``processed_<original file name>``. ``output_dir`` is created if missing.

    Args:
        file_path: Path of the raw text file to read.
        output_dir: Directory that receives the processed file.

    Returns:
        True when the file was processed and written. False when the file
        was not found or any other exception occurred; the error is printed
        rather than raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        sentences = sentence_tokenizer.tokenize(clean_text(raw_text))

        # Create the destination only after the text was read and tokenized.
        os.makedirs(output_dir, exist_ok=True)

        file_name = os.path.basename(file_path)
        output_file_path = os.path.join(output_dir, f"processed_{file_name}")

        with open(output_file_path, 'w', encoding='utf-8') as target:
            trimmed = (sentence.strip() for sentence in sentences)
            # Sentences that are empty after trimming are skipped.
            target.writelines(item + '\n' for item in trimmed if item)

        print(f"Successfully processed {file_name} and saved to {output_file_path}")
        return True
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return False
    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller continue.
        print(f"Error processing {file_path}: {e}")
        return False

In [None]:
# Folder holding the raw legal texts.
# ***** Change this path to point at your own folder of raw files *****
input_legal_texts_dir = '/content/drive/MyDrive/my_legal_corpus'

# Folder that receives the per-file processed output; created if missing.
processed_output_dir = '/content/drive/MyDrive/processed_legal_texts_temp'
os.makedirs(processed_output_dir, exist_ok=True)

print(f"Starting initial preprocessing of files from: {input_legal_texts_dir}")

# Walk every directory entry, skip anything that is not a .txt file, and count
# only the files that preprocess_legal_text_initial reports as successful.
processed_files_count = 0
for filename in tqdm(os.listdir(input_legal_texts_dir), desc="Processing files"):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(input_legal_texts_dir, filename)
    succeeded = preprocess_legal_text_initial(file_path, processed_output_dir)
    if succeeded:
        processed_files_count += 1

print(f"\nFinished initial preprocessing. Total files processed: {processed_files_count}")

In [None]:
# Path of the single file that will hold every processed sentence.
final_combined_text_file_path = '/content/drive/MyDrive/all_legal_sentences.txt'

print(f"Combining all processed sentences into: {final_combined_text_file_path}")
total_lines = 0
with open(final_combined_text_file_path, 'w', encoding='utf-8') as outfile:
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # combined file identical from one run to the next.
    for filename in tqdm(sorted(os.listdir(processed_output_dir)), desc="Combining files"):
        # Only pick up files produced by the preprocessing step.
        if filename.startswith("processed_") and filename.endswith(".txt"):
            filepath = os.path.join(processed_output_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if line.strip():  # Write only non-empty lines
                        # A file's last line may lack a trailing newline; add one
                        # so it is not glued to the next file's first sentence.
                        outfile.write(line if line.endswith('\n') else line + '\n')
                        total_lines += 1

print(f"\nAll processed sentences combined. Total lines written: {total_lines}")
print("Initial preprocessing and data consolidation complete!")