In [1]:
# Building a Custom Text Processing Pipeline

### 1. Custom Tokenizer
import re

class SimpleTokenizer:
    """Regex-based tokenizer that expands a few English contraction suffixes.

    Attributes:
        contractions: maps a contraction suffix (e.g. ``"n't"``) to the word
            that replaces it (e.g. ``"not"``).
    """

    def __init__(self):
        self.contractions = {"n't": "not", "'s": "is", "'re": "are"}  # etc.

    def tokenize(self, text):
        """Split *text* into word and punctuation tokens.

        Every contraction suffix is replaced by a space followed by its
        expansion, so "it's" yields the tokens "it" and "is" and "can't"
        yields "ca" and "not". A suffix is expanded only when it ends a word,
        i.e. it directly follows a word character and is not followed by
        another one. This keeps an opening quote in front of a word that
        starts with "s" (as in 'said') from being rewritten by the "'s" rule.

        Every word-final "'s" is expanded to "is", possessives included.

        Args:
            text: the string to tokenize.

        Returns:
            A list of strings: runs of word characters (optionally with one
            internal apostrophe, e.g. "o'clock") and single non-whitespace
            characters for everything else.
        """
        # Handle contractions
        for cont, expanded in self.contractions.items():
            # Anchor the suffix to the end of a word instead of matching it
            # anywhere in the text.
            pattern = r"(?<=\w)" + re.escape(cont) + r"(?!\w)"
            # A callable replacement inserts the expansion literally, so
            # backslashes in a custom expansion are not read as escapes.
            text = re.sub(pattern, lambda _m, word=expanded: f" {word}", text)

        # Basic word tokenization with regex
        return re.findall(r"\w+(?:'\w+)?|\S", text)

# Test the tokenizer
# sample_text is also reused by the pipeline comparison further down.
sample_text = "I can't believe it's working! Let's test-drive it."
tokenizer = SimpleTokenizer()
print(tokenizer.tokenize(sample_text))

### 2. Custom Stemmer
class SimpleStemmer:
    """Naive stemmer that strips at most one known suffix from a word."""

    # Candidate suffixes, tried in this order; the first one that applies wins.
    SUFFIXES = ('ing', 'ly', 'ed', 'ious', 'ies', 's')

    # A suffix is stripped only if at least this many characters remain, so
    # short words ("is", "red") and bare suffixes ("s", "ing") are returned
    # unchanged instead of collapsing to a single letter or an empty string.
    MIN_STEM_LENGTH = 3

    def stem(self, word):
        """Return *word* with its first applicable suffix removed.

        Args:
            word: the token to stem; matching is case-sensitive.

        Returns:
            The word minus the first suffix in ``SUFFIXES`` that it ends with
            and that leaves at least ``MIN_STEM_LENGTH`` characters, or the
            word unchanged when no suffix qualifies.
        """
        for suffix in self.SUFFIXES:
            stem_length = len(word) - len(suffix)
            if word.endswith(suffix) and stem_length >= self.MIN_STEM_LENGTH:
                return word[:stem_length]
        return word

# Test the stemmer
words = ["running", "happily", "jumped", "curious", "parties"]
stemmer = SimpleStemmer()
print(list(map(stemmer.stem, words)))

### 3. Custom Lemmatizer
class SimpleLemmatizer:
    """Lemmatizer backed by a small hand-written lookup table.

    Attributes:
        lemma_dict: maps a lowercase word form to its lemma.
    """

    def __init__(self):
        # Would normally load from a larger dictionary
        self.lemma_dict = dict(running='run', better='good', went='go')

    def lemmatize(self, word, pos='n'):
        """Return the lemma for *word*, or *word* itself when it is unknown.

        The lookup uses the lowercased word; an unknown word is returned
        with its original casing. ``pos`` is accepted but not used by the
        lookup.
        """
        key = word.lower()
        if key in self.lemma_dict:
            return self.lemma_dict[key]
        return word

# Test the lemmatizer
lemmatizer = SimpleLemmatizer()
print(
    lemmatizer.lemmatize("running"),
    lemmatizer.lemmatize("better", pos='a'),
    sep="\n",
)

### 4. Complete Pipeline
class TextPreprocessor:
    """Chains the tokenizer with optional stemming and lemmatization."""

    def __init__(self):
        self.tokenizer = SimpleTokenizer()
        self.stemmer = SimpleStemmer()
        self.lemmatizer = SimpleLemmatizer()

    def preprocess(self, text, stem=False, lemma=True):
        """Tokenize *text* and apply the requested per-token transforms.

        Args:
            text: the string to process.
            stem: when true, pass every token through the stemmer.
            lemma: when true, pass every token through the lemmatizer.

        Returns:
            The list of processed tokens. When both flags are set, stemming
            runs before lemmatization.
        """
        result = self.tokenizer.tokenize(text)

        # Collect the enabled transforms in application order.
        transforms = []
        if stem:
            transforms.append(self.stemmer.stem)
        if lemma:
            transforms.append(self.lemmatizer.lemmatize)

        for transform in transforms:
            result = [transform(token) for token in result]
        return result

# Compare with spaCy
# First run the custom pipeline with its defaults (stem=False, lemma=True)
# on the sample sentence defined above.
preprocessor = TextPreprocessor()
print("Custom:", preprocessor.preprocess(sample_text))

# Then print spaCy's lemma for each token of the same sentence.
# NOTE(review): assumes the "en_core_web_sm" model is installed — confirm.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_text)
print("spaCy:", [token.lemma_ for token in doc])

['I', 'ca', 'not', 'believe', 'it', 'is', 'working', '!', 'Let', 'is', 'test', '-', 'drive', 'it', '.']
['runn', 'happi', 'jump', 'cur', 'part']
run
good
Custom: ['I', 'ca', 'not', 'believe', 'it', 'is', 'working', '!', 'Let', 'is', 'test', '-', 'drive', 'it', '.']
spaCy: ['I', 'can', 'not', 'believe', 'it', 'be', 'work', '!', 'let', 'us', 'test', '-', 'drive', 'it', '.']
