In [None]:
import json
from pathlib import Path
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
plt.style.use('default')
sns.set_palette("husl")

DATA_DIR = Path('../data')
PROCESSED_DIR = DATA_DIR / 'processed'
text_file = PROCESSED_DIR / 'labor_law.txt'
metadata_file = PROCESSED_DIR / 'labor_law_metadata.json'

**Load dataand metadata**

In [3]:
with open(text_file, 'r', encoding = 'utf-8') as f:
    text = f.read()

with open(metadata_file, 'r', encoding = 'utf-8') as f:
    metadata = json.load(f)

print(f"\nüìä Basic Info:")
print(f"   - Total Pages: {metadata['total_pages']}")
print(f"   - Total Characters: {metadata['total_chars']:,}")
print(f"   - Total Words: {metadata['total_words']:,}")
print(f"   - Average chars/page: {metadata['total_chars'] // metadata['total_pages']:,}")


üìä Basic Info:
   - Total Pages: 86
   - Total Characters: 335,890
   - Total Words: 39,576
   - Average chars/page: 3,905


In [4]:
print(text[:1000])

’Ä’Ä ‘±’¶’£’°’µ’´’∂ ‘∫’∏’≤’∏’æ, ’ï÷Ä’•’∂’Ω’£’´÷Ä÷Ñ N ’Ä’ï-124-’Ü ‘∏’∂’§’∏÷Ç’∂’æ’•’¨ ’ß. 09.11.2004
’Ä‘±’Ö‘±’ç’è‘±’Ü‘ª ’Ä‘±’Ü’ê‘±’ä‘µ’è’à’í‘π’Ö‘±’Ü ‘±’á‘Ω‘±’è‘±’Ü’î‘±’Ö‘ª’Ü ’ï’ê‘µ’Ü’ç‘≥‘ª’ê’î ’ç’ø’∏÷Ä’°’£÷Ä’æ’•’¨ ’ß. 14.12.2004
’à’í’™’´ ’¥’•’ª ’ß ’¥’ø’•’¨. 21.06.2005
01.01.2026 -
’Ä‘±’Ö‘±’ç’è‘±’Ü‘ª ’Ä‘±’Ü’ê‘±’ä‘µ’è’à’í‘π’Ö‘±’Ü
‘±’á‘Ω‘±’è‘±’Ü’î‘±’Ö‘ª’Ü ’ï’ê‘µ’Ü’ç‘≥‘ª’ê’î
‘∏’∂’§’∏÷Ç’∂’æ’°’Æ ’ß 2004 ’©’æ’°’Ø’°’∂’´ ’∂’∏’µ’•’¥’¢’•÷Ä’´ 9-’´’∂
‘≤‘±‘∫‘ª’Ü 1.
‘∏’Ü‘¥’Ä‘±’Ü’à’í’ê ‘¥’ê’à’í’Ö‘π’Ü‘µ’ê
‘≥‘º’à’í‘Ω 1.
‘±’á‘Ω‘±’è‘±’Ü’î‘±’Ö‘ª’Ü ’ï’ê‘µ’Ü’ç‘¥’ê’à’í‘π’Ö’à’í’Ü‘∏ ‘µ’é ‘¥’ê‘±’Ü’à’é ‘ø‘±’ê‘≥‘±’é’à’ê’é’à’Ç ’Ä‘±’ê‘±‘≤‘µ’ê’à’í‘π’Ö’à’í’Ü’Ü‘µ’ê‘∏
’Ä’∏’§’æ’°’Æ 1.’Ä’°’µ’°’Ω’ø’°’∂’´ ’Ä’°’∂÷Ä’°’∫’•’ø’∏÷Ç’©’µ’°’∂ ’°’∑’≠’°’ø’°’∂÷Ñ’°’µ’´’∂ ÷Ö÷Ä’•’∂’Ω’£÷Ä÷Ñ’∏’æ ’Ø’°÷Ä’£’°’æ’∏÷Ä’æ’∏’≤ ’∞’°÷Ä’°’¢’•÷Ä’∏÷Ç’©’µ’∏÷Ç’∂’∂’•÷Ä’®
1. ’ç’∏÷Ç’µ’∂ ÷Ö÷Ä’•’∂’Ω’£’´÷Ä÷Ñ’® ’Ø’°÷Ä’£’°’æ’∏÷Ä’∏÷Ç’¥ ’ß ’Ø’∏’¨’•’Ø’ø’´’æ ÷á ’°’∂’∞’°’ø’°’Ø’°’∂ ’°’∑’≠’°’ø’°’∂÷Ñ’°’µ’´’∂ ’∞’°÷Ä’°’¢’•÷Ä’∏÷Ç’©’µ’∏÷Ç’∂’∂’•÷Ä’®, ’Ω’°’∞’¥’°’∂’∏÷Ç’¥ ’ß ’°’µ’§

In [5]:
text[-500:]

' ’∫’°÷Ä’°’∫’∏÷Ç÷Ä’§’´ ’°’¥’¢’∏’≤’ª\n’™’°’¥’°’∂’°’Ø’°’∞’°’ø’æ’°’Æ’´ ’°’∑’≠’°’ø’°’∂÷Ñ’°’µ’´’∂ ÷Ö÷Ä’•÷Ä’´ ÷Ñ’°’∂’°’Ø’∏’æ ’¢’°’¶’¥’°’∫’°’ø’Ø’•’¨’∏÷Ç ’¥’´’ª’∏÷Å’∏’æ:\n(265-÷Ä’§ ’∞’∏’§’æ’°’Æ’® ÷É’∏÷É. 24.06.10 ’Ä’ï-117-’Ü, 12.03.14 ’Ä’ï-5-’Ü, 03.05.23 ’Ä’ï-160-’Ü (÷Ö÷Ä’•’∂÷Ñ’∂ ’∏÷Ç’∂’´ ’•’¶÷Ä’°÷É’°’Ø’´’π ’¥’°’Ω ÷á ’°’∂÷Å’∏÷Ç’¥’°’µ’´’∂\n’§÷Ä’∏÷Ç’µ’©’∂’•÷Ä), 02.10.24 ’Ä’ï-364-’Ü ÷Ö÷Ä’•’∂÷Ñ’∂’•÷Ä )\n’Ä’∏’§’æ’°’Æ 266.‘±’∑’≠’°’ø’°’∂÷Ñ’°’µ’´’∂ ’æ’•’≥’•÷Ä’∏’æ ’§’°’ø’°’Ø’°’∂ ’Æ’°’≠’Ω’•÷Ä’®\n‘±’∑’≠’°’ø’°’∂÷Ñ’°’µ’´’∂ ’æ’•’≥’•÷Ä’∏’æ ’§’°’ø’°’Ø’°’∂ ’Æ’°’≠’Ω’•÷Ä’® ’Ø’°’ø’°÷Ä’æ’∏÷Ç’¥ ’•’∂ ÷Ö÷Ä’•’∂÷Ñ’∏’æ ’Ω’°’∞’¥’°’∂’æ’°’Æ ’Ø’°÷Ä’£’∏’æ:\n’Ä’°’µ’°’Ω’ø’°’∂’´ ’Ä’°’∂÷Ä’°’∫’•’ø’∏÷Ç’©’µ’°’∂\n’å. ’î’∏’π’°÷Ä’µ’°’∂\n’Ü’°’≠’°’£’°’∞\n2004 ’©. ’§’•’Ø’ø’•’¥’¢’•÷Ä’´ 14\n‘µ÷Ä÷á’°’∂\n’Ä’ï-124-’Ü\n¬© 1996 - 2026, ‘ª’ê’è‘µ‘ø 86 PDF -’® ’Ω’ø’•’≤’Æ’æ’°’Æ ’ß. 11.01.2026'

**Analyze Document Structure - Find Articles**

In [6]:
article_pattern = r'’Ä’∏’§’æ’°’Æ\s+\d+'
chapter_pattern = r'‘≥‘º’à’í‘Ω\s+[IVXLCDM]+|‘≥’¨’∏÷Ç’≠\s+\d+'

articles = re.findall(article_pattern, text, re.IGNORECASE)
chapters = re.findall(chapter_pattern, text, re.IGNORECASE)

print(f"\nüìù Document Structure:")
print(f"   - Total Articles: {len(articles)}")
print(f"   - Total Chapters: {len(chapters)}")







üìù Document Structure:
   - Total Articles: 286
   - Total Chapters: 25


In [7]:
# Split into lines and analyze
lines = text.split('\n')
non_empty_lines = [line for line in lines if line.strip()]

print(f"üìä Line Statistics:")
print(f"   - Total lines: {len(lines)}")
print(f"   - Non-empty lines: {len(non_empty_lines)}")
print(f"   - Empty lines: {len(lines) - len(non_empty_lines)}")

# Word length distribution
words = text.split()
word_lengths = [len(word) for word in words]

print(f"\nüìä Word Statistics:")
print(f"   - Total words: {len(words)}")
print(f"   - Average word length: {sum(word_lengths) / len(word_lengths):.2f} chars")
print(f"   - Shortest word: {min(word_lengths)} chars")
print(f"   - Longest word: {max(word_lengths)} chars")

# Line length distribution
line_lengths = [len(line) for line in non_empty_lines]
if line_lengths:
    print(f"\nüìä Line Statistics:")
    print(f"   - Average line length: {sum(line_lengths) / len(line_lengths):.2f} chars")
    print(f"   - Shortest line: {min(line_lengths)} chars")
    print(f"   - Longest line: {max(line_lengths)} chars")

üìä Line Statistics:
   - Total lines: 4399
   - Non-empty lines: 4314
   - Empty lines: 85

üìä Word Statistics:
   - Total words: 39576
   - Average word length: 7.49 chars
   - Shortest word: 1 chars
   - Longest word: 27 chars

üìä Line Statistics:
   - Average line length: 76.84 chars
   - Shortest line: 4 chars
   - Longest line: 192 chars


In [8]:

# Analyze unique characters (check for encoding issues)
unique_chars = set(text)
print(f"üî§ Character Analysis:")
print(f"   - Unique characters: {len(unique_chars)}")

# Separate Armenian, Latin, numbers, special chars
armenian_chars = [c for c in unique_chars if '\u0530' <= c <= '\u058F']
latin_chars = [c for c in unique_chars if c.isalpha() and c.isascii()]
digits = [c for c in unique_chars if c.isdigit()]
special = [c for c in unique_chars if not c.isalnum() and not c.isspace()]

print(f"   - Armenian characters: {len(armenian_chars)}")
print(f"   - Latin characters: {len(latin_chars)}")
print(f"   - Digits: {len(digits)}")
print(f"   - Special characters: {len(special)}")

if latin_chars:
    print(f"\n   Latin characters found: {''.join(sorted(set(latin_chars)))}")

üî§ Character Analysis:
   - Unique characters: 115
   - Armenian characters: 76
   - Latin characters: 16
   - Digits: 10
   - Special characters: 11

   Latin characters found: DFNPaehikmoprstw


In [9]:
# Look for numbers (article numbers, page numbers, etc.)
numbers = re.findall(r'\d+', text)
number_counter = Counter(numbers)

print("üî¢ Most common numbers (might be page numbers or article references):")
for num, count in number_counter.most_common(20):
    print(f"   {num}: appears {count} times")

# Find lines with only numbers (likely page numbers to remove)
page_number_lines = [line.strip() for line in lines if line.strip().isdigit()]
print(f"\nüìÑ Found {len(page_number_lines)} lines with only numbers (potential page numbers)")
if page_number_lines[:10]:
    print(f"   Examples: {page_number_lines[:10]}")

üî¢ Most common numbers (might be page numbers or article references):
   1: appears 566 times
   2: appears 410 times
   3: appears 283 times
   06: appears 196 times
   10: appears 194 times
   24: appears 183 times
   2026: appears 177 times
   05: appears 170 times
   4: appears 169 times
   23: appears 154 times
   03: appears 153 times
   01: appears 143 times
   160: appears 139 times
   117: appears 125 times
   11: appears 122 times
   5: appears 113 times
   22: appears 100 times
   12: appears 87 times
   1996: appears 86 times
   6: appears 76 times

üìÑ Found 0 lines with only numbers (potential page numbers)


**Identify cleaning needs**

In [10]:


issues = []

# Check for page numbers
if page_number_lines:
    issues.append(f"‚úì Remove {len(page_number_lines)} standalone page numbers")

# Check for extra whitespace
extra_spaces = len([line for line in lines if '  ' in line])
if extra_spaces > 0:
    issues.append(f"‚úì Fix {extra_spaces} lines with extra whitespace")

# Check for very short lines (might be fragments)
short_lines = [line for line in non_empty_lines if len(line) < 10]
if len(short_lines) > 100:
    issues.append(f"‚úì Review {len(short_lines)} very short lines (< 10 chars)")

# Check encoding
if len(armenian_chars) < 30:
    issues.append("‚ö†Ô∏è  Warning: Very few Armenian characters detected - check encoding!")

if issues:
    for issue in issues:
        print(f"   {issue}")
else:
    print("   ‚ú® Text looks clean! Minimal cleaning needed.")

   ‚ú® Text looks clean! Minimal cleaning needed.
