# Burmese Poem Classification with Deep Learning

This notebook implements a comprehensive deep learning system for classifying Burmese poems into different types.

## Data Sources:
1. **Excel File**: `NLP Project.xlsx` - Contains 45 poems across 35 different poem types
2. **JSON Files**: Extracted poems from PDF using OCR + Gemini 2.5

## Approach:
- Multi-model ensemble (LSTM, CNN, Transformer)
- Transfer learning with multilingual models
- Custom Burmese text preprocessing
- Data augmentation for small dataset handling


In [3]:
!python --version

Python 3.10.18


In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import re
import os
import glob
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Deep Learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Dense, LSTM, GRU, Conv1D, MaxPooling1D, GlobalMaxPooling1D,
    Embedding, Dropout, BatchNormalization, Bidirectional,
    Input, Concatenate, Attention
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ML libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk
from collections import Counter
import unicodedata

# Set random seeds for reproducibility.
# Both NumPy's global RNG and TensorFlow's global seed use the same fixed value.
# NOTE(review): Python's built-in `random` module is not seeded here -- confirm
# that nothing downstream draws from it if full reproducibility is required.
np.random.seed(42)
tf.random.set_seed(42)

# Confirmation output so the cell visibly reports a successful import and the
# TensorFlow version in use.
print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")


ModuleNotFoundError: No module named 'sklearn'

## 1. Data Loading and Preprocessing


In [None]:
# Custom syllable splitter, defined inline here (carried over from an existing notebook)
def custom_syllable_splitter(text: str) -> list:
    """
    Split text into Burmese syllables, breaking down all consonant stacks.

    The split is done in two steps so that stacked words such as
    'နက္ခတ္တ' and 'ဥက္ကဋ္ဌ' are separated at every stacked consonant:

    1. Every stacked pair (consonant + optional asat + virama + consonant)
       is rewritten so the virama becomes an asat, which turns the stack
       into two ordinary syllables.
    2. A syllable regex appends a space after each unit (a run of ASCII
       letters/digits, a Burmese syllable, or any other single character)
       and the result is split on whitespace.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, whitespace-free tokens in their original order.
        Whitespace in the input (including newlines) never appears in the
        output.
    """
    # Step 1: Pre-processing to split stacks.
    # A single substitution pass only rewrites non-overlapping matches, so
    # chained stacks need repeated passes; stop once a pass changes nothing.
    stacked_consonant_pattern = r'([က-အ])(်?္)([က-အ])'
    processed_text = text
    while True:
        processed_text, replaced = re.subn(
            stacked_consonant_pattern, r'\1်' + r'\3', processed_text
        )
        if not replaced:
            break

    # Step 2: Mark the end of every unit with a space.
    # NOTE(review): the `|` characters inside the character classes are
    # matched literally (they are not alternation) -- kept as in the original.
    processed_text = re.sub(r"(([A-Za-z0-9]+)|[က-အ|ဥ|ဦ](င်္|[က-အ|ဥ][ှ]*[့း]*[်]|္[က-အ]|[ါ-ှႏꩻ][ꩻ]*){0,}|.)",r"\1 ", processed_text)

    # `.` does not match a newline, so a newline in the input is left without
    # a trailing separator. Splitting on any whitespace (rather than only on
    # ' ') keeps it from being glued onto the following syllable, and also
    # drops the empty / whitespace-only pieces.
    return processed_text.split()

# Confirmation output so the notebook cell shows the definition above ran.
print("Custom syllable splitter loaded successfully!")


In [None]:
class BurmeseTextPreprocessor:
    """
    Comprehensive text preprocessor for Burmese poems.

    Provides Unicode normalization, cleanup of OCR noise, per-line
    preprocessing of a poem, and simple size features (line, character and
    syllable counts) for a preprocessed poem.
    """

    def __init__(self):
        # Reference character sets; not read by the methods of this class.
        self.burmese_consonants = "ကခဂဃငစဆဇဈညဋဌဍဎဏတထဒဓနပဖဗဘမယရလဝသဟဠအ"
        self.consonantal_medials = "ျြှွ"

    def normalize_unicode(self, text):
        """Return `text` in Unicode NFC form."""
        return unicodedata.normalize('NFC', text)

    def clean_ocr_errors(self, text):
        """
        Clean common OCR errors in Burmese text.

        Args:
            text: A single string (typically one poem line).

        Returns:
            The text with whitespace runs collapsed to one space, characters
            outside the Myanmar block (U+1000-U+109F) and printable ASCII
            removed, a digit followed by a section mark normalized to
            "<digit>။", and leading/trailing whitespace stripped.
        """
        # Collapse whitespace first so tabs/newlines survive as a single
        # space instead of being deleted by the character filter below.
        text = re.sub(r'\s+', ' ', text)

        # Remove unwanted characters but keep Burmese text, numbers, and basic punctuation
        text = re.sub(r'[^\u1000-\u109F\u0020-\u007E၀-၉]', '', text)

        # Deleting a character that sat between two spaces leaves a double
        # space behind, so collapse once more after filtering.
        text = re.sub(r' {2,}', ' ', text)

        # Fix common number format issues.
        # NOTE(review): the `|` inside `[။|၊]` is matched literally, so a
        # digit followed by an ASCII pipe is rewritten too -- kept as-is.
        text = re.sub(r'(၁|၂|၃|၄|၅|၆|၇|၈|၉|၀)\s*[။|၊]', r'\1။', text)

        return text.strip()

    @staticmethod
    def _is_missing(value):
        """Return True for None and scalar NA values (NaN, pd.NA, NaT)."""
        if value is None:
            return True
        if isinstance(value, str):
            return False
        # pd.isna() returns an array for list-likes, whose truth value is
        # ambiguous; only a scalar can be "missing" here.
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def preprocess_poem_lines(self, poem_lines):
        """
        Preprocess a list of poem lines.

        Args:
            poem_lines: An iterable of lines, a single string (treated as a
                one-line poem), or None / a scalar NA value (treated as an
                empty poem).

        Returns:
            A list of cleaned, non-empty lines. Missing entries and lines
            that are empty after cleaning are dropped; non-string entries are
            converted with str() before cleaning.
        """
        if self._is_missing(poem_lines):
            return []
        if isinstance(poem_lines, str):
            poem_lines = [poem_lines]

        processed_lines = []
        for line in poem_lines:
            if self._is_missing(line):
                continue

            # Basic cleaning
            line = self.clean_ocr_errors(self.normalize_unicode(str(line)))

            if line:  # Only add non-empty lines
                processed_lines.append(line)

        return processed_lines

    def extract_syllables(self, text):
        """Extract syllables using the custom syllable splitter"""
        return custom_syllable_splitter(text)

    def get_poem_features(self, poem_lines):
        """
        Extract size features from poem lines.

        Args:
            poem_lines: A sequence of (already preprocessed) line strings.

        Returns:
            A dict with 'total_lines', 'total_syllables', 'avg_line_length'
            (characters of the space-joined text per line),
            'avg_syllables_per_line' and 'text_content' (the lines joined by
            a single space). All numeric values are 0 and the text is empty
            when `poem_lines` is empty.
        """
        if not poem_lines:
            return {
                'total_lines': 0,
                'total_syllables': 0,
                'avg_line_length': 0,
                'avg_syllables_per_line': 0,
                'text_content': ''
            }

        total_lines = len(poem_lines)
        combined_text = ' '.join(poem_lines)

        # Syllables are counted over the whole poem, not per line.
        total_syllables = len(self.extract_syllables(combined_text))

        # total_lines is at least 1 here thanks to the guard above.
        return {
            'total_lines': total_lines,
            'total_syllables': total_syllables,
            'avg_line_length': len(combined_text) / total_lines,
            'avg_syllables_per_line': total_syllables / total_lines,
            'text_content': combined_text
        }

# Initialize preprocessor: one module-level instance, bound to the name
# `preprocessor` in the notebook namespace.
preprocessor = BurmeseTextPreprocessor()
print("Burmese text preprocessor initialized!")


In [None]:
def load_excel_data(file_path='NLP Project.xlsx'):
    """
    Load and process data from Excel file.

    The sheet is read as a wide table in which each column header is used as
    the poem type and each non-empty cell in that column as one poem. The
    table is reshaped to one record per poem.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        A list of dicts with the keys 'title', 'author', 'poem_lines' (a
        one-element list holding the cell text), 'poem_type' (the column
        label), 'source' ('excel') and 'notes' (the row label). Records are
        ordered column by column, then by row.

    Raises:
        Whatever `pd.read_excel` raises for a missing or unreadable file.
    """
    print("Loading Excel data...")

    df = pd.read_excel(file_path)
    print(f"Excel data shape: {df.shape}")
    print(f"Poem types (columns): {list(df.columns)}")

    # Reshape data from wide to long format.
    # Walk each column once instead of calling iterrows() per column:
    # iterrows() rebuilds a Series for every row on every pass and upcasts
    # the values to one common dtype per row.
    excel_poems = []

    for col, column_values in df.items():
        for idx, poem_text in column_values.items():
            # Skip empty cells (NaN) and cells holding only whitespace.
            if pd.isna(poem_text) or not str(poem_text).strip():
                continue
            excel_poems.append({
                'title': f'Excel_Poem_{idx}_{col}',
                'author': 'Unknown',
                'poem_lines': [str(poem_text)],
                'poem_type': col,
                'source': 'excel',
                'notes': f'Row {idx}'
            })

    print(f"Extracted {len(excel_poems)} poems from Excel")
    return excel_poems

def load_json_data(json_dir='PoemJsonFiles'):
    """
    Load and process data from JSON files.

    Every `*.json` file directly inside `json_dir` is read as UTF-8. A file
    may hold a single poem object or a list of poem objects. Only objects
    that have both a 'poem_lines' and a 'poem_type' key are kept; each kept
    object gets `'source': 'json'` added.

    Args:
        json_dir: Directory containing the JSON files.

    Returns:
        A list of poem dicts, ordered by file path and then by position
        within the file. Empty when the directory does not exist or holds no
        usable poems. Files that cannot be read or parsed are reported with
        print() and skipped.
    """
    print("Loading JSON data...")

    json_poems = []

    # Check if directory exists and has files
    if os.path.isdir(json_dir):
        # glob returns paths in arbitrary order; sort so the dataset order is
        # the same on every run and platform. Escape the directory so that
        # characters such as '[' in its name are not treated as wildcards.
        pattern = os.path.join(glob.escape(json_dir), '*.json')
        json_files = sorted(glob.glob(pattern))
        print(f"Found {len(json_files)} JSON files")

        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"Error loading {json_file}: {e}")
                continue

            # Handle both single poem and list of poems
            if isinstance(data, dict):
                data = [data]
            elif not isinstance(data, list):
                print(f"Error loading {json_file}: expected a JSON object or a list of objects")
                continue

            for poem in data:
                # Ignore entries that are not objects so one stray value does
                # not discard the remaining poems of the file.
                if isinstance(poem, dict) and 'poem_lines' in poem and 'poem_type' in poem:
                    poem['source'] = 'json'
                    json_poems.append(poem)
    else:
        print(f"JSON directory {json_dir} not found or empty")

    print(f"Extracted {len(json_poems)} poems from JSON files")
    return json_poems

def create_sample_json_data():
    """
    Build the two hard-coded sample poems used as a stand-in for JSON data.

    Returns:
        A new list of two poem dicts. Each dict has the keys 'title',
        'author', 'language', 'poem_lines', 'notes', 'release_date' and
        'poem_type', in that order; the two poems share the same author,
        language, release date and poem type.
    """
    def build_poem(title, lines, notes):
        # Fields common to both samples are filled in here; key order matches
        # the JSON layout of the example records.
        return {
            "title": title,
            "author": "မောင်အေးမောင်",
            "language": "Burmese",
            "poem_lines": lines,
            "notes": notes,
            "release_date": "1294, တော်သလင်းလ",
            "poem_type": "ခေတ်စမ်းကဗျာ",
        }

    first_poem = build_poem(
        "ပန်းသဇင်",
        [
            "၁၊ ခါနွေလေပြန်၊ သူရကန်မူး",
            "ဆူထန်တက်ကြွေ၊ ပူလျှံငွေကို၊",
            "မတွေ့သဘော၊ နှဲမြော၏သို့၊",
            "မဇ္ဈိသည့်သွင်၊ ညွန့်မရှင်သို။",
        ],
        "ဂန္ထလောက ၁၂၉၄-ခု။ တော်သလင်းလ",
    )
    second_poem = build_poem(
        "ကံနဲသူနတ်ရှင်နောင်",
        [
            "ပင်တိုင်စံခာတုကလျှာအနှံ့ငယ်ကြောင့်၊ ကြင်ပိုင်ရန်-စာအညကဗျာဖွဲ့ရှာရ၊",
            "မဟာသမုဒြိယာခဲ့ပါတဲ့အကြောင်းခံ၊",
            "အသက်လိုအချစ်ထူးရော့ထင့်၊ ပုရစ်ဖူးတွေဝေဝေမြိုင်တော့၊",
        ],
        "ဂန္ထလောက ၁၂၉၄-ခု၊တော်သလင်းလက",
    )

    return [first_poem, second_poem]

# Load all data.
# Both loaders run with their default paths ('NLP Project.xlsx' and
# 'PoemJsonFiles'). load_excel_data() reads the workbook directly, so this
# cell fails if that file is missing; load_json_data() returns an empty list
# when its directory is missing.
excel_poems = load_excel_data()
json_poems = load_json_data()

# If no JSON data found, use sample data for demonstration
# (the two hard-coded poems from create_sample_json_data()).
if not json_poems:
    print("No JSON files found, using sample data for demonstration...")
    json_poems = create_sample_json_data()

# Summary of how many poems came from each source.
print(f"\nTotal poems loaded:")
print(f"- Excel: {len(excel_poems)}")
print(f"- JSON: {len(json_poems)}")
print(f"- Combined: {len(excel_poems) + len(json_poems)}")
