<a href="https://colab.research.google.com/github/harshhrawte/Stemming/blob/main/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Harsh Rawte 22101A0047
import nltk
import string
import pandas as pd
import re
import warnings
from collections import Counter
from typing import List, Dict
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

warnings.filterwarnings('ignore')

# Built-in English stop-word set. AdvancedTextProcessor uses it whenever the
# NLTK stopword corpus is unavailable or cannot be loaded for the requested
# language.
STOP_WORDS = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
    'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does',
    'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
    'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
    'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their', 'from', 'up', 'about',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'among',
    'also', 'like', 'such', 'so', 'than', 'too', 'very', 'just', 'now', 'then', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'only', 'own', 'same'
}

def simple_tokenize(text):
    """Split *text* into lowercase word tokens without relying on NLTK.

    Every character that is neither a word character nor whitespace is
    replaced by a space before splitting. Tokens of two characters or fewer
    and purely numeric tokens are discarded.
    """
    cleaned = re.sub(r"[^\w\s]", " ", text.lower())
    kept = []
    for candidate in cleaned.split():
        if len(candidate) <= 2:
            continue
        if candidate.isdigit():
            continue
        kept.append(candidate)
    return kept

# Fetch the NLTK resources used by this notebook and decide whether NLTK can
# be relied on. nltk.download() reports problems through its return value
# instead of raising, so the loaders are smoke-tested afterwards: only when
# tokenizing, stop-word lookup, POS tagging and lemmatizing all work is
# NLTK_AVAILABLE set to True. Otherwise the fallback tokenizer/stop words are
# used.
try:
    print("Attempting to download NLTK data...\n")
    for resource in [
        # 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource
        # names newer NLTK releases load; the older names are kept for
        # compatibility with earlier releases.
        'punkt', 'punkt_tab', 'stopwords', 'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng', 'wordnet', 'omw-1.4'
    ]:
        nltk.download(resource, quiet=True)
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.tag import pos_tag

    # Smoke test: each call raises (typically LookupError) if its data is missing.
    word_tokenize("Smoke test.")
    stopwords.words('english')
    pos_tag(['test'])
    WordNetLemmatizer().lemmatize('tests')

    NLTK_AVAILABLE = True
    print("NLTK data downloaded successfully!\n")
except Exception:
    # Best-effort setup: any failure switches to the fallback path, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    print("NLTK data download failed. Using fallback options.\n")
    NLTK_AVAILABLE = False

class AdvancedTextProcessor:
    """Compare Porter, Snowball and Lancaster stemming with WordNet lemmatization.

    When ``NLTK_AVAILABLE`` is false the processor still stems, but it
    tokenizes with ``simple_tokenize``, uses the built-in ``STOP_WORDS`` and
    returns tokens unchanged in place of lemmas.
    """

    def __init__(self, language='english'):
        """Create the stemmers and pick the stop-word list for *language*."""
        self.language = language
        self.porter = PorterStemmer()
        self.snowball = SnowballStemmer(language)
        self.lancaster = LancasterStemmer()
        self.lemmatizer = WordNetLemmatizer() if NLTK_AVAILABLE else None

        self.stop_words = STOP_WORDS
        if NLTK_AVAILABLE:
            try:
                self.stop_words = set(stopwords.words(language))
            except Exception:
                # Best effort: a missing corpus or unsupported language keeps
                # the built-in stop-word set selected above.
                pass

    def get_wordnet_pos(self, word):
        """Return the WordNet POS letter ('a', 'n', 'v' or 'r') for *word*.

        The word is tagged on its own; any tag that does not start with
        J, N, V or R maps to noun. Without NLTK the answer is always 'n'.
        """
        if NLTK_AVAILABLE:
            tag = pos_tag([word])[0][1][0].upper()
            return {'J': 'a', 'N': 'n', 'V': 'v', 'R': 'r'}.get(tag, 'n')
        return 'n'

    def tokenize_text(self, text: str, remove_punc=True, remove_stop=True, min_len=2) -> List[str]:
        """Tokenize *text* in lowercase and filter the tokens.

        Args:
            text: Raw input text.
            remove_punc: Drop tokens made up entirely of ASCII punctuation.
            remove_stop: Drop tokens found in ``self.stop_words``.
            min_len: Minimum token length to keep.

        Returns:
            The surviving tokens, with purely numeric tokens removed.

        Note:
            The fallback ``simple_tokenize`` applies its own length filter
            before ``min_len`` is considered.
        """
        tokens = word_tokenize(text.lower()) if NLTK_AVAILABLE else simple_tokenize(text)

        if remove_punc:
            # Compare character by character: a substring test against
            # string.punctuation lets tokens such as "``", "''" or "..." through.
            punctuation_chars = set(string.punctuation)
            tokens = [t for t in tokens if not all(ch in punctuation_chars for ch in t)]
        if remove_stop:
            tokens = [t for t in tokens if t not in self.stop_words]

        return [t for t in tokens if len(t) >= min_len and not t.isdigit()]

    def stem_and_lemmatize_comparison(self, text: str) -> Dict[str, List[str]]:
        """Run all three stemmers and the lemmatizer over the tokens of *text*.

        Returns:
            A dict of parallel lists keyed by 'original_tokens',
            'porter_stems', 'snowball_stems', 'lancaster_stems' and 'lemmas'.
        """
        tokens = self.tokenize_text(text)

        porter = [self.porter.stem(w) for w in tokens]
        snowball = [self.snowball.stem(w) for w in tokens]
        lancaster = [self.lancaster.stem(w) for w in tokens]

        lemmas = []
        for token in tokens:
            if not self.lemmatizer:
                lemmas.append(token)
                continue
            try:
                pos = self.get_wordnet_pos(token)
                lemmas.append(self.lemmatizer.lemmatize(token, pos=pos))
            except Exception:
                # Best effort: keep the lists aligned by falling back to the
                # unmodified token when tagging or lemmatizing fails.
                lemmas.append(token)

        return {
            'original_tokens': tokens,
            'porter_stems': porter,
            'snowball_stems': snowball,
            'lancaster_stems': lancaster,
            'lemmas': lemmas
        }

    def detailed_analysis(self, text: str) -> Dict:
        """Build a comparison DataFrame and vocabulary statistics for *text*.

        Returns:
            A dict with 'dataframe' (one row per token), 'statistics'
            (token counts, unique counts per method and the percentage
            reduction in unique forms relative to the original tokens) and
            'raw_results' (the output of ``stem_and_lemmatize_comparison``).
        """
        result = self.stem_and_lemmatize_comparison(text)
        df = pd.DataFrame({
            'Original': result['original_tokens'],
            'Porter': result['porter_stems'],
            'Snowball': result['snowball_stems'],
            'Lancaster': result['lancaster_stems'],
            'Lemma': result['lemmas']
        })

        unique_tokens = len(set(result['original_tokens']))

        stats = {
            'total_tokens': len(result['original_tokens']),
            'unique_tokens': unique_tokens,
            'porter_unique': len(set(result['porter_stems'])),
            'snowball_unique': len(set(result['snowball_stems'])),
            'lancaster_unique': len(set(result['lancaster_stems'])),
            'lemma_unique': len(set(result['lemmas'])),
        }

        for key in ['porter', 'snowball', 'lancaster', 'lemma']:
            # Guard against empty input, where there are no unique tokens.
            stats[f'{key}_reduction'] = (
                (unique_tokens - stats[f'{key}_unique']) / unique_tokens * 100
                if unique_tokens > 0 else 0
            )

        return {'dataframe': df, 'statistics': stats, 'raw_results': result}

    def find_differences(self, text: str) -> pd.DataFrame:
        """Return the tokens for which the four methods do not all agree.

        Each row holds the original token, the four derived forms and the
        number of distinct forms among them. The DataFrame is empty when all
        methods agree on every token.
        """
        res = self.stem_and_lemmatize_comparison(text)
        diffs = []
        rows = zip(res['original_tokens'], res['porter_stems'],
                   res['snowball_stems'], res['lancaster_stems'], res['lemmas'])
        for word, porter, snowball, lancaster, lemma in rows:
            distinct = len({porter, snowball, lancaster, lemma})
            if distinct > 1:
                diffs.append({
                    'Original': word,
                    'Porter': porter,
                    'Snowball': snowball,
                    'Lancaster': lancaster,
                    'Lemma': lemma,
                    'Unique_Forms': distinct
                })
        return pd.DataFrame(diffs)

    def show_comparison_table(self, text: str):
        """Print a fixed-width table with one row per token of *text*."""
        res = self.stem_and_lemmatize_comparison(text)
        print(f"\n{'Original':<15}{'Porter':<15}{'Snowball':<15}{'Lancaster':<15}{'Lemma':<15}")
        print("-" * 75)
        rows = zip(res['original_tokens'], res['porter_stems'],
                   res['snowball_stems'], res['lancaster_stems'], res['lemmas'])
        for original, porter, snowball, lancaster, lemma in rows:
            print(f"{original:<15}{porter:<15}"
                  f"{snowball:<15}{lancaster:<15}"
                  f"{lemma:<15}")

def demonstrate_stemmer_lemmatizer_characteristics():
    """Print how each stemmer and the lemmatizer treat a fixed list of sample words."""
    processor = AdvancedTextProcessor()
    test_words = [
        'scoring', 'dribbling', 'passes', 'teams', 'players', 'better', 'good',
        'strikers', 'defending', 'played', 'running', 'feet', 'goals', 'won', 'losses'
    ]
    column_titles = ('Word', 'Porter', 'Snowball', 'Lancaster', 'Lemma')

    print("=== STEMMER vs LEMMATIZER CHARACTERISTICS ===\n")
    print("".join(f"{title:<15}" for title in column_titles))
    print("-" * 75)

    for word in test_words:
        cells = [
            word,
            processor.porter.stem(word),
            processor.snowball.stem(word),
            processor.lancaster.stem(word),
        ]
        # Without a lemmatizer the word itself stands in for its lemma.
        if processor.lemmatizer:
            cells.append(processor.lemmatizer.lemmatize(word, processor.get_wordnet_pos(word)))
        else:
            cells.append(word)
        print("".join(f"{cell:<15}" for cell in cells))

def main():
    """Run the stemming/lemmatization demo on a short football paragraph.

    Prints the per-word characteristics table, summary statistics, the full
    comparison table, the tokens on which the methods disagree and the five
    most frequent forms produced by each method.
    """
    processor = AdvancedTextProcessor()
    demonstrate_stemmer_lemmatizer_characteristics()

    football_text = """
    Football is one of the most followed sports globally. Clubs like Real Madrid,
    Barcelona, and Liverpool have millions of fans and rich histories. Players train
    for hours to master passing, shooting, and dribbling. The Champions League is
    considered the most prestigious club competition in Europe. Rivalries like El
    Clasico between Madrid and Barca create intense excitement. Young talents like
    Jude Bellingham and Ansu Fati are rising stars in the football world.
    """

    rule = "=" * 80

    print("\n" + rule)
    print("FOOTBALL TEXT ANALYSIS".center(80))
    print(rule)

    analysis = processor.detailed_analysis(football_text)

    print("\nSTATISTICS:")
    for name, value in analysis['statistics'].items():
        label = name.replace('_', ' ').title()
        # Reduction figures are percentages; everything else is a raw count.
        rendered = f"{value:.2f}%" if "reduction" in name else f"{value}"
        print(f"{label}: {rendered}")

    print("\nCOMPARISON TABLE:")
    processor.show_comparison_table(football_text)

    diff = processor.find_differences(football_text)
    if diff.empty:
        print("\nNo significant differences found.")
    else:
        print("\nWORDS WITH DIFFERENT FORMS:")
        print(diff.to_string(index=False))

    print("\nMOST FREQUENT FORMS:")
    raw_results = analysis['raw_results']
    for method in ('porter_stems', 'snowball_stems', 'lancaster_stems', 'lemmas'):
        top_five = Counter(raw_results[method]).most_common(5)
        print(f"{method.replace('_', ' ').title()}: {top_five}")

    print("\n" + rule)
    print("RECOMMENDATION: Use lemmatization for better semantic accuracy!")
    print(rule)

    print("\n\n" + rule)
    print("Developed by Harsh Rawte | Roll No: 22101A0047".center(80))
    print(rule)


# Run the demo only when executed as a script or notebook cell, not on import.
if __name__ == "__main__":
    main()


In [6]:
import re
import unicodedata
from typing import List, Dict, Tuple, Set
from collections import defaultdict
import time

class IndicTokenizer:
    """Custom rule-based tokenizer for Hindi and Marathi (Devanagari) text."""

    # Single-character tokens that survive the final length filter; every
    # other one-character token (including ASCII punctuation and single
    # digits) is discarded.
    _MEANINGFUL_SINGLE_CHARS = frozenset({
        '।', '॥', 'व', 'न', 'म', 'क', 'र', 'स', 'त', 'द', 'प', 'ब', 'य',
        'ल', 'ह', 'ज', 'ग', 'च', 'श', 'ष', 'थ', 'ध', 'भ', 'फ', 'ख', 'घ',
        'छ', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'ि', '्'
    })

    def __init__(self):
        # Define Devanagari Unicode ranges
        self.devanagari_range = r'\u0900-\u097F'
        self.punctuation = r'[।॥॰\.\,\;\:\!\?\"\'\(\)\[\]\{\}\-\—\–\…\'\'\"\"]'
        self.numbers = r'[०-९0-9]+'
        self.english_words = r'[a-zA-Z]+'

        # Common Hindi/Marathi conjuncts and special characters
        self.conjuncts = ['क्ष', 'त्र', 'ज्ञ', 'श्र', 'द्व', 'द्य', 'त्त', 'न्न', 'म्म', 'ल्ल']

        # Compound word separators
        self.compound_separators = ['-', '–', '—', '/', '+']

    def tokenize(self, text: str, language: str = 'hi') -> List[str]:
        """
        Tokenize text using Indic-specific rules
        Args:
            text: Input text
            language: 'hi' for Hindi, 'mr' for Marathi (currently not used
                by the rules; both languages are tokenized the same way)
        Returns:
            List of tokens. Punctuation characters become their own tokens,
            and one-character tokens are kept only if they are in
            ``_MEANINGFUL_SINGLE_CHARS``.
        """
        if not text:
            return []

        # Normalize text
        text = unicodedata.normalize('NFC', text)

        # Merge separator-joined Devanagari compounds into single words
        text = self._handle_compound_words(text)

        # Compile once per call instead of matching the raw pattern per character
        is_punctuation = re.compile(self.punctuation).match

        # Split on whitespace and punctuation
        tokens = []
        current_token = ""

        for char in text:
            if char.isspace():
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
            elif is_punctuation(char):
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
                tokens.append(char)
            else:
                current_token += char

        if current_token:
            tokens.append(current_token)

        # Filter out single characters (except meaningful ones)
        keep_single = self._MEANINGFUL_SINGLE_CHARS
        return [
            token for token in tokens
            if len(token) > 1 or token in keep_single
        ]

    def _handle_compound_words(self, text: str) -> str:
        """Join compounds by deleting separators flanked by Devanagari characters.

        Lookarounds are used so the neighbouring characters are not consumed;
        this lets every separator in a chain such as ``क-ख-ग`` be removed,
        not just the first one.
        """
        devanagari_char = f'[{self.devanagari_range}]'
        for sep in self.compound_separators:
            text = re.sub(
                f'(?<={devanagari_char}){re.escape(sep)}(?={devanagari_char})',
                '', text)
        return text

class ManualStemmer:
    """Manual rule-based suffix stripper for Hindi and Marathi.

    Each rule is a ``(regex, replacement)`` pair anchored at the end of the
    word. Rules are tried in list order and the first one that changes the
    word (without emptying it) wins, so a rule that appears later in a list
    with the same pattern as an earlier one can never fire.
    """

    def __init__(self):
        self.hindi_rules = self._create_hindi_rules()
        self.marathi_rules = self._create_marathi_rules()

    def _create_hindi_rules(self) -> List[Tuple[str, str]]:
        """Create manual stemming rules for Hindi.

        NOTE(review): the example after each rule illustrates the intended
        suffix; several examples are approximate and have not been verified
        as exact outputs of ``stem``.
        """
        rules = [
            # Plural suffixes
            ('ियाँ$', 'ी'),  # टीमियाँ -> टीमी
            ('ियों$', 'ी'),  # टीमियों -> टीमी
            ('ों$', ''),    # खिलाड़ियों -> खिलाड़ी
            ('ें$', ''),    # मैचें -> मैच
            ('ियां$', 'ी'), # टीमियां -> टीमी
            ('ाओं$', 'ा'),  # क्लबों -> क्लब
            ('ाएँ$', 'ा'),  # गोलाएँ -> गोला

            # Verb forms
            ('ता$', ''),   # खेलता -> खेल
            ('ते$', ''),   # खेलते -> खेल
            ('ती$', ''),   # खेलती -> खेल
            ('ना$', ''),   # खेलना -> खेल
            ('नी$', ''),   # खेलनी -> खेल
            ('ने$', ''),   # खेलने -> खेल
            ('या$', ''),    # किया -> कि
            ('ये$', ''),    # किये -> कि
            ('याँ$', ''),   # कियाँ -> कि
            # Future tense written with the independent vowel ए (after a
            # vowel-final stem)
            ('एगा$', ''),   # जाएगा -> जा
            ('एगी$', ''),   # जाएगी -> जा
            ('एंगे$', ''),  # जाएंगे -> जा
            # Future tense written with the matra े (after a consonant-final
            # stem); the independent-vowel rules above do not match these
            ('ेगा$', ''),   # खेलेगा -> खेल
            ('ेगी$', ''),   # खेलेगी -> खेल
            ('ेंगे$', ''),  # खेलेंगे -> खेल
            ('ेंगी$', ''),  # खेलेंगी -> खेल
            ('ोगे$', ''),   # खेलोगे -> खेल
            ('ोगी$', ''),   # खेलोगी -> खेल

            # Adjective forms
            ('वान$', ''),  # प्रतिभावान -> प्रतिभा
            ('मान$', ''),  # कुशलमान -> कुशल
            ('दार$', ''),  # जिम्मेदार -> जिम्मे
            ('कार$', ''),  # प्रशिक्षक -> प्रशिक्ष
            ('हार$', ''),  # गोलहार -> गोल
            ('वाला$', ''),  # खेलनेवाला -> खेलने
            ('वाले$', ''),  # खेलनेवाले -> खेलने
            ('वाली$', ''),  # खेलनेवाली -> खेलने

            # Case markers
            ('से$', ''),   # मैदान से -> मैदान
            ('में$', ''),   # मैदान में -> मैदान
            ('पर$', ''),   # मैदान पर -> मैदान
            ('का$', ''),   # टीम का -> टीम
            ('के$', ''),   # टीम के -> टीम
            ('की$', ''),   # टीम की -> टीम
            ('को$', ''),   # टीम को -> टीम
            ('पे$', ''),   # मैदान पे -> मैदान
            ('तक$', ''),  # अंत तक -> अंत

            # Diminutive and augmentative
            ('जी$', ''),   # कोच जी -> कोच
            ('साहब$', ''),  # मैनेजरसाहब -> मैनेजर
            ('बाबू$', ''),  # रेफरीबाबू -> रेफरी
            ('जान$', ''),  # कप्तानजान -> कप्तान

            # Abstract noun suffixes
            ('ता$', ''),   # गतिशीलता -> गतिशील (same pattern as the verb rule above)
            ('त्व$', ''),   # खिलाड़ित्व -> खिलाड़ी
            ('पन$', ''),   # कप्तानपन -> कप्तान
            ('आहट$', ''),  # जीताहट -> जीता
            ('आवट$', ''),  # सजावट -> सजा
            ('इयत$', ''),  # प्रतियोगिता -> प्रतियोगि

            # Tense markers
            ('था$', ''),   # खेला था -> खेला
            ('थे$', ''),   # खेले थे -> खेले
            ('थी$', ''),   # खेली थी -> खेली
            ('हूँ$', ''),   # खेलता हूँ -> खेलता
            ('हैं$', ''),   # खेलते हैं -> खेलते
            ('है$', ''),   # खेलता है -> खेलता
        ]
        return rules

    def _create_marathi_rules(self) -> List[Tuple[str, str]]:
        """Create manual stemming rules for Marathi.

        NOTE(review): the example after each rule illustrates the intended
        suffix; several examples are approximate and have not been verified
        as exact outputs of ``stem``.
        """
        rules = [
            # Plural suffixes
            ('ेणे$', ''),  # सामनेणे -> सामना
            ('ांना$', ''),  # खेळाडूंना -> खेळाडू
            ('ांचा$', ''),  # क्लबांचा -> क्लब
            ('ांची$', ''),  # क्लबांची -> क्लब
            ('ांच्या$', ''),  # क्लबांच्या -> क्लब
            ('ांमध्ये$', ''),  # क्लबांमध्ये -> क्लब
            ('ांसाठी$', ''),  # क्लबांसाठी -> क्लब

            # Verb forms
            ('तो$', ''),   # खेळतो -> खेळ
            ('ते$', ''),   # खेळते -> खेळ
            ('ता$', ''),   # खेळता -> खेळ
            ('तात$', ''),  # खेळतात -> खेळ
            ('ला$', ''),   # जिंकला -> जिंक
            ('ली$', ''),   # जिंकली -> जिंक
            ('ले$', ''),   # जिंकले -> जिंक
            ('लो$', ''),   # जिंकलो -> जिंक
            ('ील$', ''),   # खेळील -> खेळ
            ('ाल$', ''),   # खेळाल -> खेळ
            ('ायचे$', ''),  # खेळायचे -> खेळ
            ('ायचा$', ''),  # खेळायचा -> खेळ
            ('ायची$', ''),  # खेळायची -> खेळ
            ('ायच्या$', ''),  # खेळायच्या -> खेळ

            # Case markers
            ('ला$', ''),   # मैदानला -> मैदान (same pattern as the verb rule above)
            ('चा$', ''),   # टीमचा -> टीम
            ('ची$', ''),   # टीमची -> टीम
            ('च्या$', ''),  # टीमच्या -> टीम
            ('मध्ये$', ''),  # मैदानमध्ये -> मैदान
            ('वर$', ''),   # मैदानवर -> मैदान
            ('खाली$', ''),  # मैदानखाली -> मैदान
            ('जवळ$', ''),  # मैदानजवळ -> मैदान
            ('पुढे$', ''),  # मैदानपुढे -> मैदान
            ('मागे$', ''),  # मैदानमागे -> मैदान
            ('शी$', ''),   # कोचशी -> कोच
            ('कडे$', ''),  # कोचकडे -> कोच
            ('साठी$', ''),  # कोचसाठी -> कोच

            # Adjective forms
            ('वान$', ''),  # प्रतिभावान -> प्रतिभा
            ('मान$', ''),  # कुशलमान -> कुशल
            ('दार$', ''),  # जबाबदार -> जबाब
            ('कार$', ''),  # प्रशिक्षक -> प्रशिक्ष
            ('णारा$', ''),  # खेळणारा -> खेळ
            ('णारे$', ''),  # खेळणारे -> खेळ
            ('णारी$', ''),  # खेळणारी -> खेळ

            # Abstract noun suffixes
            ('ता$', ''),   # गतिशीलता -> गतिशील (same pattern as the verb rule above)
            ('त्व$', ''),   # खेळाडूत्व -> खेळाडू
            ('पणा$', ''),  # कप्तानपणा -> कप्तान
            ('ाई$', ''),  # ज्येष्ठाई -> ज्येष्ठ
            ('ीक$', ''),  # स्पर्धात्मक -> स्पर्धा

            # Honorific suffixes
            ('जी$', ''),   # कोचजी -> कोच
            ('साहेब$', ''),  # मैनेजरसाहेब -> मैनेजर
            ('राव$', ''),  # रेफरीराव -> रेफरी
            ('ाजी$', ''),  # कप्तानाजी -> कप्तान (shadowed by the 'जी$' rule above)
            ('काका$', ''),  # रेफरीकाका -> रेफरी

            # Tense markers
            ('होतो$', ''),  # खेळत होतो -> खेळत
            ('होते$', ''),  # खेळत होते -> खेळत
            ('होती$', ''),  # खेळत होती -> खेळत
            ('आहे$', ''),  # खेळत आहे -> खेळत
            ('आहेत$', ''),  # खेळत आहेत -> खेळत
        ]
        return rules

    def stem(self, word: str, language: str) -> str:
        """
        Apply manual stemming rules
        Args:
            word: Input word
            language: 'hi' selects the Hindi rules; any other value selects
                the Marathi rules
        Returns:
            The word with the first applicable suffix rule applied, or the
            word unchanged when no rule applies. A rule that would reduce
            the word to an empty string is skipped.
        """
        if not word:
            return word

        rules = self.hindi_rules if language == 'hi' else self.marathi_rules

        # Apply rules in order; the first rule that yields a non-empty,
        # different word wins.
        for pattern, replacement in rules:
            stemmed, hits = re.subn(pattern, replacement, word)
            if hits and stemmed and stemmed != word:
                return stemmed

        return word

class PretrainedStemmer:
    """Dictionary-backed stand-in for pretrained Hindi/Marathi stemming models.

    The "models" are fixed word -> stem lookup tables; words that are not
    listed are returned unchanged.
    """

    # Each entry maps a stem to every surface form that should reduce to it.
    _HINDI_FORMS = {
        'फुटबॉल': ('फुटबॉल',),
        'खिलाड़': ('खिलाड़ी', 'खिलाड़ियों'),
        'टीम': ('टीम', 'टीमों'),
        'मैच': ('मैच', 'मैचों'),
        'गोल': ('गोल', 'गोलों'),
        'कप्तान': ('कप्तान', 'कप्तानों'),
        'प्रशिक्षक': ('प्रशिक्षक', 'प्रशिक्षकों'),
        'बार्सिलोना': ('बार्सिलोना',),
        'रियलमैड्रिड': ('रियलमैड्रिड',),
        'लिवरपूल': ('लिवरपूल',),
        'प्रीमियरलीग': ('प्रीमियरलीग',),
        'लालिगा': ('लालिगा',),
        'चैंपियंसलीग': ('चैंपियंसलीग',),
        'जीत': ('जीत', 'जीतें'),
        'हार': ('हार', 'हारें'),
        'खेल': ('खेल', 'खेलना', 'खेलते'),
        'मैदान': ('मैदान',),
        'प्रशंसक': ('प्रशंसक', 'प्रशंसकों'),
        'स्टेडियम': ('स्टेडियम',),
        'स्पर्धा': ('स्पर्धा',),
        'प्रतियोगिता': ('प्रतियोगिता',),
        'क्लब': ('क्लब',),
        'लीडरबोर्ड': ('लीडरबोर्ड',),
        'पेनाल्टी': ('पेनाल्टी',),
        'फाउल': ('फाउल',),
        'रेफरी': ('रेफरी',),
        'अंपायर': ('अंपायर',),
        'ट्रॉफी': ('ट्रॉफी',),
        'सीजन': ('सीजन',),
    }

    _MARATHI_FORMS = {
        'फुटबॉल': ('फुटबॉल',),
        'खेळाडू': ('खेळाडू', 'खेळाडूंना'),
        'संघ': ('संघ', 'संघांना'),
        'सामना': ('सामना', 'सामने'),
        'गोल': ('गोल', 'गोलं'),
        'कर्णधार': ('कर्णधार', 'कर्णधारांना'),
        'प्रशिक्षक': ('प्रशिक्षक', 'प्रशिक्षकांना'),
        'बार्सिलोना': ('बार्सिलोना',),
        'रियलमैड्रिड': ('रियलमैड्रिड',),
        'लिवरपूल': ('लिवरपूल',),
        'प्रीमियरलीग': ('प्रीमियरलीग',),
        'ला लीगा': ('ला लीगा',),
        'चॅम्पियन्सलीग': ('चॅम्पियन्सलीग',),
        'विजय': ('विजय', 'विजयांना'),
        'पराभव': ('पराभव', 'पराभवांना'),
        'खेळ': ('खेळ', 'खेळायचे', 'खेळतात'),
        'मैदान': ('मैदान',),
        'प्रेक्षक': ('प्रेक्षक', 'प्रेक्षकांना'),
        'स्टेडियम': ('स्टेडियम',),
        'स्पर्धा': ('स्पर्धा', 'स्पर्धांना'),
        'क्लब': ('क्लब',),
        'लीडरबोर्ड': ('लीडरबोर्ड',),
        'पेनाल्टी': ('पेनाल्टी',),
        'फाउल': ('फाउल',),
        'रेफरी': ('रेफरी',),
        'अंपायर': ('अंपायर',),
        'ट्रॉफी': ('ट्रॉफी',),
        'हंगाम': ('हंगाम',),
    }

    def __init__(self):
        # Simulated pretrained models (in practice, these would be loaded from files)
        self.hindi_model = self._load_hindi_model()
        self.marathi_model = self._load_marathi_model()

    @staticmethod
    def _expand(forms_by_stem) -> Dict[str, str]:
        """Flatten a stem -> forms table into a fresh word -> stem dict."""
        lookup = {}
        for stem, forms in forms_by_stem.items():
            for form in forms:
                lookup[form] = stem
        return lookup

    def _load_hindi_model(self) -> Dict[str, str]:
        """Return the simulated Hindi model as a word -> stem dict."""
        return self._expand(self._HINDI_FORMS)

    def _load_marathi_model(self) -> Dict[str, str]:
        """Return the simulated Marathi model as a word -> stem dict."""
        return self._expand(self._MARATHI_FORMS)

    def stem(self, word: str, language: str) -> str:
        """
        Look *word* up in the simulated model for *language*
        Args:
            word: Input word
            language: 'hi' selects the Hindi table; any other value selects
                the Marathi table
        Returns:
            The stored stem, or the word itself when it is empty or unknown
        """
        if not word:
            return word

        if language == 'hi':
            table = self.hindi_model
        else:
            table = self.marathi_model
        return table.get(word, word)

class MultilingualProcessor:
    """Main processor combining tokenization with manual and pretrained stemming."""

    def __init__(self):
        self.tokenizer = IndicTokenizer()
        self.manual_stemmer = ManualStemmer()
        self.pretrained_stemmer = PretrainedStemmer()

    def process_text(self, text: str, language: str) -> Dict:
        """
        Process text through tokenization and both stemming approaches
        Args:
            text: Input text
            language: 'hi' for Hindi, 'mr' for Marathi
        Returns:
            Dictionary with the original text, language, tokens, token
            count, both stem lists (parallel to the tokens) and the elapsed
            time in seconds of each of the three stages
        """
        # perf_counter is monotonic and high-resolution, so it is better
        # suited to timing these very short stages than wall-clock time.

        # Tokenization
        start_time = time.perf_counter()
        tokens = self.tokenizer.tokenize(text, language)
        tokenization_time = time.perf_counter() - start_time

        # Manual stemming
        start_time = time.perf_counter()
        manual_stems = [self.manual_stemmer.stem(token, language) for token in tokens]
        manual_stemming_time = time.perf_counter() - start_time

        # Pretrained stemming
        start_time = time.perf_counter()
        pretrained_stems = [self.pretrained_stemmer.stem(token, language) for token in tokens]
        pretrained_stemming_time = time.perf_counter() - start_time

        return {
            'original_text': text,
            'language': language,
            'tokens': tokens,
            'token_count': len(tokens),
            'manual_stems': manual_stems,
            'pretrained_stems': pretrained_stems,
            'tokenization_time': tokenization_time,
            'manual_stemming_time': manual_stemming_time,
            'pretrained_stemming_time': pretrained_stemming_time
        }

    def compare_stemmers(self, test_words: List[str], language: str) -> Dict:
        """
        Compare manual and pretrained stemming approaches
        Args:
            test_words: List of words to test
            language: 'hi' for Hindi, 'mr' for Marathi
        Returns:
            Dictionary with 'detailed_results' (parallel per-word lists),
            'agreement_rate' (fraction of words on which both stemmers
            agree) and the average number of characters each stemmer
            removed. All three statistics are 0.0 for an empty word list.
        """
        results = {
            'word': [],
            'manual_stem': [],
            'pretrained_stem': [],
            'agreement': [],
            'manual_reduction': [],
            'pretrained_reduction': []
        }

        for word in test_words:
            manual_stem = self.manual_stemmer.stem(word, language)
            pretrained_stem = self.pretrained_stemmer.stem(word, language)

            results['word'].append(word)
            results['manual_stem'].append(manual_stem)
            results['pretrained_stem'].append(pretrained_stem)
            results['agreement'].append(manual_stem == pretrained_stem)
            results['manual_reduction'].append(len(word) - len(manual_stem))
            results['pretrained_reduction'].append(len(word) - len(pretrained_stem))

        # Calculate statistics; an empty word list would otherwise divide by zero
        total = len(results['word'])
        if total:
            agreement_rate = sum(results['agreement']) / total
            avg_manual_reduction = sum(results['manual_reduction']) / total
            avg_pretrained_reduction = sum(results['pretrained_reduction']) / total
        else:
            agreement_rate = 0.0
            avg_manual_reduction = 0.0
            avg_pretrained_reduction = 0.0

        return {
            'detailed_results': results,
            'agreement_rate': agreement_rate,
            'avg_manual_reduction': avg_manual_reduction,
            'avg_pretrained_reduction': avg_pretrained_reduction
        }

def _process_and_report(processor, header, text, language):
    """Print ``header``, run ``processor.process_text`` and print its summary.

    Returns the result dict produced by ``process_text`` so later report
    sections can reuse it.
    """
    print(header)
    results = processor.process_text(text, language)
    print(f"Original text length: {len(results['original_text'])} characters")
    print(f"Token count: {results['token_count']}")
    for label, key in (
        ("tokens", 'tokens'),
        ("manual stems", 'manual_stems'),
        ("pretrained stems", 'pretrained_stems'),
    ):
        print(f"First 10 {label}: {results[key][:10]}")
    for label, key in (
        ("Tokenization", 'tokenization_time'),
        ("Manual stemming", 'manual_stemming_time'),
        ("Pretrained stemming", 'pretrained_stemming_time'),
    ):
        print(f"{label} time: {results[key]:.4f}s")
    return results


def _print_vocabulary_analysis(header, results):
    """Print unique token/stem counts and the resulting vocabulary reduction."""
    print(header)
    unique_tokens = len(set(results['tokens']))
    unique_manual = len(set(results['manual_stems']))
    unique_pretrained = len(set(results['pretrained_stems']))

    print(f"Unique tokens: {unique_tokens}")
    print(f"Unique manual stems: {unique_manual}")
    print(f"Unique pretrained stems: {unique_pretrained}")
    print(f"Vocabulary reduction (manual): {unique_tokens - unique_manual}")
    print(f"Vocabulary reduction (pretrained): {unique_tokens - unique_pretrained}")


def _print_sample_stems(header, results, limit=15):
    """Print a token / manual stem / pretrained stem table for the first ``limit`` tokens."""
    print(header)
    print(f"{'Token':<20} {'Manual':<20} {'Pretrained':<20} {'Reduction':<10}")
    print("-" * 75)
    rows = zip(
        results['tokens'][:limit],
        results['manual_stems'][:limit],
        results['pretrained_stems'][:limit],
    )
    for token, manual, pretrained in rows:
        # The reduction column reflects the manual stemmer only.
        reduction = len(token) - len(manual)
        print(f"{token:<20} {manual:<20} {pretrained:<20} {reduction:<10}")


def _compare_and_report(processor, header, words, language):
    """Print ``header``, run ``processor.compare_stemmers`` and print its aggregates.

    Returns the comparison dict so the detailed table and the final analysis
    can reuse it.
    """
    print(header)
    comparison = processor.compare_stemmers(words, language)
    print(f"Agreement rate: {comparison['agreement_rate']:.2%}")
    print(f"Average manual reduction: {comparison['avg_manual_reduction']:.1f} characters")
    print(f"Average pretrained reduction: {comparison['avg_pretrained_reduction']:.1f} characters")
    return comparison


def _print_detailed_comparison(header, words, comparison):
    """Print one table row per test word from a ``compare_stemmers`` result."""
    print(header)
    print(f"{'Word':<20} {'Manual':<20} {'Pretrained':<20} {'Agreement':<12} {'Reduction':<10}")
    print("-" * 85)
    details = comparison['detailed_results']
    rows = zip(
        words,
        details['manual_stem'],
        details['pretrained_stem'],
        details['agreement'],
        details['manual_reduction'],
    )
    for word, manual, pretrained, agreed, reduction in rows:
        agreement = "✓" if agreed else "✗"
        print(f"{word:<20} {manual:<20} {pretrained:<20} {agreement:<12} {reduction:<10}")


# Example usage and testing
def main():
    """Run the Hindi/Marathi stemming demo and print a comparison report.

    Processes one sample text per language, prints token/stem samples and
    vocabulary statistics, compares the manual and pretrained stemmers on a
    fixed list of football terms, and finishes with a recommendation.
    """
    processor = MultilingualProcessor()

    # Test Hindi text - Football match description
    hindi_text = """बार्सिलोना और रियल मैड्रिड के बीच एल क्लासिको मैच रविवार को होगा।
    दोनों टीमें ला लीगा के शीर्ष स्थान के लिए प्रतिस्पर्धा कर रही हैं।
    मैच में मेसी और रोनाल्डो के प्रदर्शन पर सभी की नजर होगी।
    कैंप नोउ स्टेडियम में होने वाले इस मैच में 90000 से अधिक प्रशंसक उपस्थित रहेंगे।"""

    # Test Marathi text - Football match description
    marathi_text = """बार्सिलोना आणि रियल मैड्रिड यांच्यातील एल क्लासिको सामना रविवारी होणार आहे.
    दोन्ही संघ ला लीगा मध्ये अव्वल स्थानासाठी स्पर्धा करत आहेत.
    सामन्यात मेस्सी आणि रोनाल्डो यांच्या कामगिरीवर सर्वांचे लक्ष असेल.
    कॅम्प नोउ स्टेडियम मध्ये होणाऱ्या या सामन्यात 90000 हून अधिक प्रेक्षक उपस्थित असतील."""
    print("Harsh Rawte 22101A0047")
    print("\n")

    hindi_results = _process_and_report(
        processor, "=== HINDI TEXT PROCESSING (Football Match) ===", hindi_text, 'hi'
    )
    marathi_results = _process_and_report(
        processor, "\n=== MARATHI TEXT PROCESSING (Football Match) ===", marathi_text, 'mr'
    )

    # Analysis of unique tokens vs stems
    _print_vocabulary_analysis("\n=== HINDI VOCABULARY ANALYSIS ===", hindi_results)
    _print_vocabulary_analysis("\n=== MARATHI VOCABULARY ANALYSIS ===", marathi_results)

    # Sample comparison of stems
    _print_sample_stems("\n=== SAMPLE STEM COMPARISON (Hindi) ===", hindi_results)
    _print_sample_stems("\n=== SAMPLE STEM COMPARISON (Marathi) ===", marathi_results)

    # Enhanced word comparison with football-specific terms
    hindi_test_words = [
        'बार्सिलोना', 'रियलमैड्रिड', 'लिवरपूल', 'प्रीमियरलीग', 'लालिगा',
        'चैंपियंसलीग', 'गोल', 'गोलकीपर', 'डिफेंडर', 'मिडफील्डर',
        'फॉरवर्ड', 'कप्तान', 'प्रशिक्षक', 'रेफरी', 'पेनाल्टी',
        'फाउल', 'कॉर्नर', 'फ्रीकिक', 'स्टेडियम', 'प्रशंसक',
        'ट्रॉफी', 'लीडरबोर्ड', 'स्पर्धा', 'प्रतियोगिता', 'सीजन'
    ]
    marathi_test_words = [
        'बार्सिलोना', 'रियलमैड्रिड', 'लिवरपूल', 'प्रीमियरलीग', 'लालिगा',
        'चॅम्पियन्सलीग', 'गोल', 'गोलरक्षक', 'डिफेंडर', 'मिडफील्डर',
        'फॉरवर्ड', 'कर्णधार', 'प्रशिक्षक', 'रेफरी', 'पेनाल्टी',
        'फाउल', 'कॉर्नर', 'फ्रीकिक', 'स्टेडियम', 'प्रेक्षक',
        'ट्रॉफी', 'लीडरबोर्ड', 'स्पर्धा', 'स्पर्धात्मक', 'हंगाम'
    ]

    hindi_comparison = _compare_and_report(
        processor,
        "\n=== ENHANCED HINDI STEMMER COMPARISON (Football Terms) ===",
        hindi_test_words,
        'hi',
    )
    marathi_comparison = _compare_and_report(
        processor,
        "\n=== ENHANCED MARATHI STEMMER COMPARISON (Football Terms) ===",
        marathi_test_words,
        'mr',
    )

    # Detailed comparison table with football test words
    _print_detailed_comparison(
        "\n=== DETAILED HINDI COMPARISON (Football Terms) ===",
        hindi_test_words,
        hindi_comparison,
    )
    _print_detailed_comparison(
        "\n=== DETAILED MARATHI COMPARISON (Football Terms) ===",
        marathi_test_words,
        marathi_comparison,
    )

    # Performance and effectiveness analysis
    # The pros/cons lines are fixed text; only the "Performance" lines are
    # derived from the comparison results.
    comparisons = (("Hindi", hindi_comparison), ("Marathi", marathi_comparison))

    print("\n=== PERFORMANCE AND EFFECTIVENESS ANALYSIS ===")
    print("Manual Rule-Based Stemmer:")
    print("  Pros: Fast execution, predictable results, language-specific rules")
    print("  Cons: Limited coverage, may over-stem or under-stem")
    for name, comparison in comparisons:
        print(f"  {name} Performance: {comparison['agreement_rate']:.1%} agreement, {comparison['avg_manual_reduction']:.1f} avg reduction")

    print("\nPretrained Model Stemmer:")
    print("  Pros: Better context understanding, handles exceptions well")
    print("  Cons: Slower execution, requires training data, may not cover new terms")
    for name, comparison in comparisons:
        print(f"  {name} Performance: {comparison['avg_pretrained_reduction']:.1f} avg reduction")

    # Recommendation
    # NOTE(review): the verdict is driven by the Hindi agreement rate only;
    # the Marathi rate is not consulted here.
    print("\n=== RECOMMENDATION ===")
    hindi_agreement = hindi_comparison['agreement_rate']
    if hindi_agreement > 0.7:
        print("HIGH AGREEMENT: Both approaches show similar results, manual rules are sufficient")
    elif hindi_agreement > 0.5:
        print("MODERATE AGREEMENT: Consider hybrid approach combining both methods")
    else:
        print("LOW AGREEMENT: Pretrained model likely more accurate for complex cases")

    print("For sports news platform: Use manual rules for speed, pretrained for accuracy")
    print("Recommended approach: Hybrid system with manual rules as baseline + pretrained for player/team names")
# Run the demo only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Harsh Rawte 22101A0047


=== HINDI TEXT PROCESSING (Football Match) ===
Original text length: 285 characters
Token count: 56
First 10 tokens: ['बार्सिलोना', 'और', 'रियल', 'मैड्रिड', 'के', 'बीच', 'एल', 'क्लासिको', 'मैच', 'रविवार']
First 10 manual stems: ['बार्सिलो', 'और', 'रियल', 'मैड्रिड', 'के', 'बीच', 'एल', 'क्लासि', 'मैच', 'रविवार']
First 10 pretrained stems: ['बार्सिलोना', 'और', 'रियल', 'मैड्रिड', 'के', 'बीच', 'एल', 'क्लासिको', 'मैच', 'रविवार']
Tokenization time: 0.0005s
Manual stemming time: 0.0027s
Pretrained stemming time: 0.0000s

=== MARATHI TEXT PROCESSING (Football Match) ===
Original text length: 302 characters
Token count: 43
First 10 tokens: ['बार्सिलोना', 'आणि', 'रियल', 'मैड्रिड', 'यांच्यातील', 'एल', 'क्लासिको', 'सामना', 'रविवारी', 'होणार']
First 10 manual stems: ['बार्सिलोना', 'आणि', 'रियल', 'मैड्रिड', 'यांच्यात', 'एल', 'क्लासिको', 'सामना', 'रविवारी', 'होणार']
First 10 pretrained stems: ['बार्सिलोना', 'आणि', 'रियल', 'मैड्रिड', 'यांच्यातील', 'एल', 'क्लासिको', 'सामना', 'रव