# Data cleaning

## Load file

In [33]:
import pandas as pd
# Year of the batch to clean; the file name and directory below are derived from it.
year = 2012
file_name = f"output_{year}"  # base name, without the .json extension
file_path = f"files/{year}"  # directory the input file is read from
# Read the JSON file into a DataFrame.
data = pd.read_json(f"{file_path}/{file_name}.json")

In [34]:
# Display the loaded DataFrame to inspect its columns.
data

Unnamed: 0,metadata,content
0,"{'document_id': 'F2012001', 'domain': '', 'yea...",AVIS ET LOIS\nAvis n° 03/A.CC/11 du 27 Moharra...
1,"{'document_id': 'F2012002', 'domain': '', 'yea...",AVIS ET LOIS\nAvis n° 01/A. CC/ 12 du 14 Safar...
2,"{'document_id': 'F2012003', 'domain': '', 'yea...",CONVENTIONS ET ACCORDS INTERNATIONAUX\nDécret ...
3,"{'document_id': 'F2012004', 'domain': '', 'yea...",DECRETS\nDécret présidentiel n° 12-23 du 24 Sa...
4,"{'document_id': 'F2012005', 'domain': '', 'yea...",CONVENTIONS ET ACCORDS INTERNATIONAUX\nDécret ...
...,...,...
66,"{'document_id': 'F2012068', 'domain': '', 'yea...",CONVENTIONS ET ACCORDS INTERNATIONAUX\nDécret ...
67,"{'document_id': 'F2012069', 'domain': '', 'yea...",CONVENTIONS ET ACCORDS INTERNATIONAUX\nDécret ...
68,"{'document_id': 'F2012070', 'domain': '', 'yea...",CONVENTIONS ET ACCORDS INTERNATIONAUX\nDécret ...
69,"{'document_id': 'F2012071', 'domain': '', 'yea...",DECRETS\nDécret présidentiel n° 12-434 du 12 S...


### Remove page header

In [35]:
import re  # imported here so this cell does not depend on a later cell's import

# Header line, e.g. "JOURNAL OFFICIEL DE LA REPUBLIQUE ALGERIENNE N° 01".
# Case-insensitive; the accented spellings RÉPUBLIQUE / ALGÉRIENNE are accepted too.
_PAGE_HEADER_RE = re.compile(
    r'^JOURNAL\s+OFFICIEL\s+DE\s+LA\s+R[EÉ]PUBLIQUE\s+ALG[EÉ]RIENNE\s+N°\s*\d+',
    re.IGNORECASE
)

# A line made only of digits (page number printed just above the header).
_PAGE_NUMBER_RE = re.compile(r'\d+')

# One date: a number, optionally followed by one to three words (any Unicode
# letters, so "février", "août", "Dhou El Kaada" all work) and an optional year.
# The word count is capped at three to avoid swallowing ordinary numbered sentences.
_SINGLE_DATE = r'\d+(?:\s*[^\W\d_]+(?:\s+[^\W\d_]+){0,2}(?:\s*\d+)?)?'

# A date line holds one date, or two dates side by side
# (e.g. "28", "28 Dhou El Kaada 1425", "9 janvier 2005").
_HEADER_DATE_RE = re.compile(rf'{_SINGLE_DATE}(?:\s*{_SINGLE_DATE})?')


def delete_page_header(content):
    """
    Remove the first JOURNAL OFFICIEL page header found in ``content``.

    The lines removed are:
      * the header line itself ("JOURNAL OFFICIEL DE LA REPUBLIQUE ALGERIENNE N° ...");
      * the line just before it, when that line contains only digits (page number);
      * up to three lines directly after it, as long as each one looks like a
        date ("28", "28 Dhou El Kaada 1425", "9 janvier 2005", "1er août 2012").
        Scanning stops at the first line that is not a date.

    Only the first header is processed; later headers in the same text are kept.

    Args:
        content: Text whose lines are separated by ``'\\n'``.

    Returns:
        The text without the header lines. When no header is found the input
        is returned unchanged.
    """
    lines = content.split('\n')

    for i, line in enumerate(lines):
        if not _PAGE_HEADER_RE.search(line.strip()):
            continue

        # The header line is always removed.
        to_delete = {i}

        # A digits-only line right above the header is the page number.
        if i > 0 and _PAGE_NUMBER_RE.fullmatch(lines[i - 1].strip()):
            to_delete.add(i - 1)

        # Date lines follow the header directly; stop at the first non-date line.
        for j in range(i + 1, min(i + 4, len(lines))):
            if not _HEADER_DATE_RE.fullmatch(lines[j].strip()):
                break
            to_delete.add(j)

        # Only the first matching header is handled.
        return '\n'.join(
            kept for idx, kept in enumerate(lines) if idx not in to_delete
        )

    return content

### Remove footer

In [36]:
import re

def remove_footer(text):
    """
    Strip every printer footer from ``text``.

    A footer starts at 'Imprimerie officielle -' (any letter case) and runs up
    to and including the nearest following 'ALGER-GARE'; it may span several lines.
    """
    # DOTALL lets '.' cross line breaks so multi-line footers are matched.
    footer_regex = re.compile(
        r'(?i)Imprimerie\s+officielle\s*-.*?ALGER-GARE',
        re.DOTALL,
    )
    return footer_regex.sub('', text)

### Replace newline characters

In [37]:
import re

def replace_newlines(text):
    """Turn each run of carriage-return / line-feed characters into one space."""
    line_breaks = re.compile(r'[\r\n]+')
    return line_breaks.sub(' ', text)

### Remove Arabic

In [38]:
import re

def remove_arabic(text):
    """Replace each run of Arabic-script characters with a single space."""
    # Character class built from the Unicode ranges listed below
    # (U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDFF, U+FE70-FEFF).
    return re.sub(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+',
        ' ',
        text,
    )

### Remove Punctuation

In [39]:
def remove_punctuation(text):
    """
    Replace every unwanted character with a space.

    Characters that are kept: word characters (letters, digits, underscore),
    whitespace, commas, periods, straight and curly apostrophes (' and ’),
    the degree sign (°), slashes (/) and hyphens (-). Everything else
    (e.g. « » “ ” ( ) ! ?) becomes a single space, one per character.
    """
    unwanted = re.compile(r'[^\w\s,.\'’°/-]')
    return unwanted.sub(' ', text)

### Collapse allowed punctuation

In [40]:
def remove_repeated_punctuation(text):
    """
    Collapse a run of the same mark among °, / and - into one occurrence.

    Examples: 'N°°°°' -> 'N°', '///' -> '/', '--' -> '-'.
    Runs that mix different marks (e.g. '-/') are left alone.
    """
    run_of_same_mark = re.compile(r'([°/-])\1+')
    return run_of_same_mark.sub(r'\1', text)

### Remove Multiple Periods

In [41]:
def remove_multiple_periods(text):
    """
    Replace each sequence of two or more periods with a single space.

    The periods may be adjacent ('....') or separated by whitespace
    ('. . . .'); the whole sequence, whatever its length, is replaced at once,
    so no stray period is left behind for odd-length spaced sequences.
    A single period (end of sentence, abbreviation) is left untouched.
    """
    # One period followed by one or more "(optional whitespace) + period" groups.
    return re.sub(r'\.(?:\s*\.)+', ' ', text)

### Remove symbols and dashes

In [42]:
def remove_symbols(text):
    """
    Replace decorative symbols (★, ◼, ▪, •, #) with a space.

    A capital 'H' is also removed, but only when it stands alone, i.e. it is
    not attached to a letter, digit or underscore on either side (as in the
    separator artefact '————H————'). An 'H' inside a word such as 'Hidja' is
    kept. Each run of consecutive symbols becomes a single space.
    """
    return re.sub(r'(?:[★◼▪•#]|(?<!\w)H(?!\w))+', ' ', text)

In [43]:
def remove_dashes(text):
    """
    Drop em-dashes and free-standing hyphens, then strip symbols.

    Steps, in order:
      1. every run of em-dashes (—) becomes one space;
      2. a hyphen with no word character on either side (e.g. " - ") becomes
         a space, so hyphens inside terms like `08-05` or `code-pénal` stay;
      3. the result is passed through `remove_symbols`.

    Extra spaces produced here are not collapsed by this function.
    """
    without_em_dashes = re.sub(r'—+', ' ', text)
    without_lone_hyphens = re.sub(r'(?<!\w)-(?!\w)', ' ', without_em_dashes)
    return remove_symbols(without_lone_hyphens)

### Remove Multiple Spaces

In [44]:
def remove_multiple_spaces(text):
    """Collapse each whitespace run into one space and trim both ends."""
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

### Lowercasing

In [45]:
def lowercasing(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

### Handle duplicate words

In [46]:
def remove_duplicate_words(content):
    """
    Drop words that repeat the word immediately before them.

    The text is split on whitespace, consecutive identical words are reduced
    to one (comparison is case-sensitive), and the survivors are re-joined
    with single spaces. Empty or whitespace-only input yields "".
    """
    tokens = content.split()
    # Keep a token when it is the first one or differs from its predecessor.
    kept = [
        token
        for position, token in enumerate(tokens)
        if position == 0 or token != tokens[position - 1]
    ]
    return ' '.join(kept)

### Apply Cleaning Pipeline

In [47]:
def clean_text(text):
    """
    Run the full cleaning pipeline on one document's text.

    The steps are applied in this fixed order, each one receiving the output
    of the previous one: remove_repeated_punctuation, delete_page_header,
    remove_footer, replace_newlines, remove_arabic, remove_punctuation,
    remove_multiple_periods, remove_dashes, remove_multiple_spaces,
    remove_duplicate_words, lowercasing.
    """
    # Header and footer removal run before newlines are flattened.
    pipeline = (
        remove_repeated_punctuation,
        delete_page_header,
        remove_footer,
        replace_newlines,
        remove_arabic,
        remove_punctuation,
        remove_multiple_periods,
        remove_dashes,
        remove_multiple_spaces,
        remove_duplicate_words,
        lowercasing,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean every document; this replaces the raw text in the "content" column,
# so re-running this cell feeds already-cleaned text through the pipeline again.
data["content"] = data["content"].apply(clean_text)

### Save cleaned data

In [48]:
import json

# Convert the DataFrame to a list of per-row dicts
# (presumably the same layout as the input JSON — TODO confirm).
cleaned_records = data.to_dict(orient='records')
# The output is written to the input directory, with "_cleaned" appended to the name.
cleaned_file_name = f"{file_name}_cleaned"
# Pretty-print with a 2-space indent; ensure_ascii=False writes accented
# characters as-is instead of \uXXXX escapes.
with open(f"{file_path}/{cleaned_file_name}.json", 'w', encoding='utf-8') as f:
    json.dump(cleaned_records, f, indent=2, ensure_ascii=False)

# Report where the cleaned file was written.
print(f"Cleaned data saved to: {file_path}/{cleaned_file_name}.json")

Cleaned data saved to: files/2012/output_2012_cleaned.json
