# Week 6: NLP & Transformers (Solution)

Complete solution implementations for the Week 6 exercises.

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional
import re
import string
import logging

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fetch the NLTK resources used by TextPreprocessor below.
# 'punkt_tab' is included because recent NLTK releases load the Punkt sentence
# tokenizer from that resource instead of the pickled 'punkt' models; without
# it word_tokenize() raises LookupError. Both are requested so the notebook
# runs on old and new NLTK versions alike.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

# Module-level logger, configured at INFO.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## Part 1: Text Preprocessing - SOLUTION

In [None]:
class TextPreprocessor:
    """Configurable clean -> tokenize -> filter -> lemmatize text pipeline.

    Each stage after cleaning can be switched off through the constructor
    flags. The flags are plain attributes and are re-read on every call to
    ``preprocess``.

    Args:
        lowercase: Lower-case the cleaned text before tokenizing.
        remove_punctuation: Drop tokens made up only of ASCII punctuation.
        remove_stopwords: Drop NLTK English stop words.
        lemmatize: Lemmatize each remaining token with WordNet.
    """

    # Patterns applied, in this order, by ``clean_text``.
    _URL_RE = re.compile(r'http\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _DIGITS_RE = re.compile(r'\d+')
    _WHITESPACE_RE = re.compile(r'\s+')
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self, lowercase=True, remove_punctuation=True, remove_stopwords=True, lemmatize=True):
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.stopwords = set(stopwords.words('english'))
        # Always created so that switching ``lemmatize`` on after construction
        # does not fail with AttributeError.
        self.lemmatizer = WordNetLemmatizer()

    @classmethod
    def _is_punctuation(cls, token: str) -> bool:
        """Return True when every character of *token* is ASCII punctuation.

        A per-character test is used because ``token in string.punctuation``
        is a substring test and therefore misses tokens such as ``...``.
        An empty token also counts as punctuation so that it is dropped.
        """
        return all(ch in cls._PUNCTUATION for ch in token)

    def clean_text(self, text: str) -> str:
        """Strip URLs, e-mail addresses and digits, then normalize whitespace.

        Args:
            text: Raw input text.

        Returns:
            The text with those spans removed, runs of whitespace collapsed
            to a single space, and leading/trailing whitespace stripped.
        """
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        text = self._DIGITS_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def preprocess(self, text: str) -> List[str]:
        """Run the full pipeline on a single document.

        Args:
            text: Raw input text.

        Returns:
            The list of tokens that survive the enabled stages.
        """
        text = self.clean_text(text)
        if self.lowercase:
            text = text.lower()
        tokens = self.tokenize(text)
        if self.remove_punctuation:
            tokens = [t for t in tokens if not self._is_punctuation(t)]
        if self.remove_stopwords:
            # The NLTK list is lower-case; compare case-insensitively so the
            # filter still works when ``lowercase`` is disabled.
            tokens = [t for t in tokens if t.lower() not in self.stopwords]
        if self.lemmatize:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]
        return tokens

    def preprocess_documents(self, documents: List[str]) -> List[List[str]]:
        """Apply ``preprocess`` to every document, preserving order."""
        return [self.preprocess(doc) for doc in documents]

# Smoke test: run the default pipeline (all stages enabled) on a sentence that
# contains a URL, an e-mail address, upper-case text and repeated punctuation.
preprocessor = TextPreprocessor()
test_text = "Check out https://example.com! Email: test@email.com. This is GREAT!!!"
print(f"Original: {test_text}")
print(f"Processed: {preprocessor.preprocess(test_text)}")

## Part 2: Text Vectorization - SOLUTION

In [None]:
class TextVectorizer:
    """Thin wrapper around scikit-learn's ``TfidfVectorizer`` returning dense arrays.

    The underlying vectorizer is configured with ``ngram_range=(1, 2)`` and
    keeps at most ``max_features`` features.

    Args:
        max_features: Upper bound on the vocabulary size.
    """

    def __init__(self, max_features: int = 5000):
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))

    def fit_transform(self, documents: List[str]) -> np.ndarray:
        """Fit the vocabulary on *documents* and return their dense TF-IDF matrix."""
        return self.vectorizer.fit_transform(documents).toarray()

    def transform(self, documents: List[str]) -> np.ndarray:
        """Return the dense TF-IDF matrix of *documents* using the fitted vocabulary."""
        return self.vectorizer.transform(documents).toarray()

    def get_feature_names(self) -> List[str]:
        """Return the fitted feature names as a plain list, in column order."""
        return self.vectorizer.get_feature_names_out().tolist()

    def get_top_features(self, document_vector: np.ndarray, top_n: int = 10) -> List[Tuple[str, float]]:
        """Return the highest-weighted features of one document.

        Args:
            document_vector: One row of the TF-IDF matrix (1-D).
            top_n: Maximum number of features to return.

        Returns:
            ``(feature_name, weight)`` pairs sorted by descending weight, with
            weights as built-in ``float``. Empty when ``top_n <= 0``.
        """
        # A slice of ``[-0:]`` would select the whole array, so a request for
        # zero (or fewer) features must be answered explicitly.
        if top_n <= 0:
            return []
        feature_names = self.get_feature_names()
        weights = np.asarray(document_vector)
        top_indices = np.argsort(weights)[-top_n:][::-1]
        return [(feature_names[i], float(weights[i])) for i in top_indices]

## Part 3: Embeddings - SOLUTION

In [None]:
class EmbeddingHandler:
    """Sentence embeddings and cosine similarity from a pretrained transformer.

    Args:
        model_name: Hugging Face model identifier passed to ``AutoTokenizer``
            and ``AutoModel``.
    """

    def __init__(self, model_name: str = 'bert-base-uncased'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: put the model in evaluation mode.
        self.model.eval()

    @staticmethod
    def _cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
        """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
        denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
        if denom == 0.0:
            # Avoid a division by zero (NaN plus a RuntimeWarning).
            return 0.0
        return float(np.dot(vec_a, vec_b) / denom)

    def get_sentence_embedding(self, text: str) -> np.ndarray:
        """Embed *text* as the hidden state of the first token.

        The input is truncated to at most 512 tokens. For BERT-style models
        the first position is the ``[CLS]`` token.

        Args:
            text: The sentence or passage to embed.

        Returns:
            A 1-D NumPy array holding the first token's final hidden state.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Keep inputs on the model's device so the handler keeps working if
        # the model is moved to a GPU; a no-op while it stays on the CPU.
        inputs = inputs.to(self.model.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        first_token_state = outputs.last_hidden_state[:, 0, :].squeeze()
        # ``.cpu()`` is required before ``.numpy()`` for GPU tensors.
        return first_token_state.cpu().numpy()

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity between the embeddings of two texts."""
        emb1 = self.get_sentence_embedding(text1)
        emb2 = self.get_sentence_embedding(text2)
        return self._cosine_similarity(emb1, emb2)

## Part 4: Attention Mechanism - SOLUTION

In [None]:
class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are linear projections of the same input, each
    mapping ``hidden_size`` to ``hidden_size``.

    Args:
        hidden_size: Size of the last dimension of the input and output.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply self-attention to *x*.

        Args:
            x: Input tensor whose last dimension is ``hidden_size``; the
                second-to-last dimension is the one attended over.
            mask: Optional tensor broadcastable to the score matrix. Positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            A tensor with the same shape as *x*.
        """
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        # Scale by sqrt(hidden_size) to keep the logits in a range where
        # softmax is not saturated.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.hidden_size ** 0.5)
        if mask is not None:
            # Use the dtype's own minimum rather than a fixed -1e9, which is
            # not representable in float16. A finite value (not -inf) keeps
            # fully masked rows from producing NaN after softmax.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attention_weights = torch.softmax(scores, dim=-1)
        return torch.matmul(attention_weights, V)

## Part 5: NLP Evaluator - SOLUTION

In [None]:
class NLPEvaluator:
    """Stateless helpers for scoring and visualizing classifier predictions."""

    @staticmethod
    def evaluate_classification(y_true: List[int], y_pred: List[int], class_names: List[str] = None) -> Dict:
        """Compute accuracy and a per-class report, print the report, return both.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            class_names: Optional display names forwarded to
                ``classification_report`` as ``target_names``.

        Returns:
            A dict with keys ``'accuracy'`` and ``'report'``.
        """
        accuracy = accuracy_score(y_true, y_pred)
        report_text = classification_report(y_true, y_pred, target_names=class_names)
        print(report_text)
        return {'accuracy': accuracy, 'report': report_text}

    @staticmethod
    def plot_confusion_matrix(y_true: List[int], y_pred: List[int], class_names: List[str]):
        """Draw the confusion matrix as an annotated heatmap and show it.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            class_names: Tick labels used on both axes.
        """
        matrix = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
        )
        # Columns are predictions, rows are ground truth.
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.show()

## Part 6: Document Intelligence Engine - SOLUTION

In [None]:
class DocumentIntelligenceEngine:
    """Facade bundling sentiment, keywords, summarization, NER and similarity search.

    Construction loads an ``EmbeddingHandler`` plus three Hugging Face
    pipelines, so it is expensive; create one instance and reuse it.

    Args:
        model_name: Model identifier used for the embedding handler.
    """

    def __init__(self, model_name: str = 'distilbert-base-uncased'):
        self.preprocessor = TextPreprocessor()
        self.embedding_handler = EmbeddingHandler(model_name)
        # NOTE(review): the sentiment and NER pipelines rely on the library's
        # default checkpoints; consider pinning explicit model names.
        self.sentiment_analyzer = pipeline('sentiment-analysis')
        self.summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
        self.ner = pipeline('ner', aggregation_strategy='simple')

    def analyze_sentiment(self, text: str) -> Dict:
        """Return the sentiment pipeline's result for the first 512 characters of *text*."""
        return self.sentiment_analyzer(text[:512])[0]

    def extract_keywords(self, text: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Return up to *top_n* highest-weighted TF-IDF terms of *text*.

        The vectorizer is fitted on this single document only.

        Args:
            text: Document to analyze.
            top_n: Maximum number of keywords.

        Returns:
            ``(term, weight)`` pairs in descending weight order; empty when
            ``top_n <= 0`` or when the text yields no vocabulary.
        """
        if top_n <= 0 or not text or not text.strip():
            return []
        vectorizer = TextVectorizer(max_features=100)
        try:
            tfidf_matrix = vectorizer.fit_transform([text])
        except ValueError:
            # scikit-learn raises ValueError when no token survives
            # tokenization (empty vocabulary); treat that as "no keywords".
            return []
        return vectorizer.get_top_features(tfidf_matrix[0], top_n)

    def summarize(self, text: str, max_length: int = 150) -> str:
        """Summarize *text*, returning it unchanged when shorter than 50 words.

        Only the first 1024 characters are passed to the summarizer.

        Args:
            text: Document to summarize.
            max_length: Maximum summary length forwarded to the pipeline.

        Returns:
            The generated summary, or *text* itself for short inputs.
        """
        if len(text.split()) < 50:
            return text
        # Keep min_length <= max_length so small max_length values are valid.
        min_length = min(30, max_length)
        summary = self.summarizer(text[:1024], max_length=max_length, min_length=min_length, do_sample=False)
        return summary[0]['summary_text']

    def extract_entities(self, text: str) -> List[Dict]:
        """Return the NER pipeline's entities for the first 512 characters of *text*."""
        return self.ner(text[:512])

    def find_similar_documents(self, query: str, documents: List[str], top_k: int = 5) -> List[int]:
        """Rank *documents* by cosine similarity to *query*.

        Each document is cut to its first 512 characters before embedding.

        Args:
            query: Text to compare against.
            documents: Candidate documents.
            top_k: Maximum number of results.

        Returns:
            Indices into *documents*, most similar first. Empty when
            ``top_k <= 0`` or *documents* is empty.
        """
        # ``[-0:]`` would select every index, so handle non-positive top_k
        # (and the empty corpus) up front.
        if top_k <= 0 or not documents:
            return []
        query_emb = np.asarray(self.embedding_handler.get_sentence_embedding(query))
        doc_matrix = np.stack(
            [self.embedding_handler.get_sentence_embedding(doc[:512]) for doc in documents]
        )
        dots = doc_matrix @ query_emb
        denom = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_emb)
        # Zero-norm embeddings get similarity 0.0 instead of NaN.
        similarities = np.zeros(len(documents), dtype=float)
        valid = denom > 0
        similarities[valid] = dots[valid] / denom[valid]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return top_indices.tolist()

    def analyze_document(self, text: str) -> Dict:
        """Run sentiment, keyword, summary and entity extraction on *text*.

        Returns:
            A dict with keys ``'sentiment'``, ``'keywords'``, ``'summary'``
            and ``'entities'``.
        """
        return {
            'sentiment': self.analyze_sentiment(text),
            'keywords': self.extract_keywords(text),
            'summary': self.summarize(text),
            'entities': self.extract_entities(text)
        }

## Example Usage

In [None]:
# Initialize engine (the constructor loads the embedding model and the
# sentiment, summarization and NER pipelines)
engine = DocumentIntelligenceEngine()

# Sample document
sample_doc = """
Apple Inc. announced record quarterly earnings today, with CEO Tim Cook praising 
the strong performance of iPhone sales globally. The tech giant's stock surged 
5% in after-hours trading as investors reacted positively to the news.
"""

# Analyze: returns a dict with 'sentiment', 'keywords', 'summary' and 'entities'
results = engine.analyze_document(sample_doc)
# Print everything except the summary; only the first five keywords are shown.
print("Analysis Results:")
print(f"Sentiment: {results['sentiment']}")
print(f"Keywords: {results['keywords'][:5]}")
print(f"Entities: {results['entities']}")

## Summary

This solution demonstrates complete implementations of:
- Text preprocessing pipelines
- TF-IDF vectorization
- Transformer-based embeddings
- Attention mechanisms
- Document intelligence system
- Sentiment analysis, summarization, and NER

Use these as reference implementations when building production NLP systems.