**L6 Pre-Processing Pipeline -- Lecture Demonstration**

Run time: approximately 12 minutes. This notebook accompanies Section 6.4 of the Week 6 lecture. For the full pipeline implementation including N-grams, TF-IDF weighting, and workshop exercises, see L6\_preprocessing\_pipeline\_WORKSHOP.ipynb.

In [None]:
import re, warnings
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.colors as mc
import numpy as np
# Silence every warning category for the rest of the session (presumably to
# keep the demo output uncluttered). Note this also hides deprecation
# warnings raised by any library used below.
warnings.filterwarnings('ignore')

# -- NLTK --------------------------------------------------------------------
import nltk
from nltk.stem import PorterStemmer

# Ensure the tokenizer model and the stop-word corpus are available locally.
# Each resource is looked up first and downloaded only when the lookup fails.
# A failed download is ignored on purpose: the stop-word loader and tokenise()
# both carry their own fallbacks.
for _path, _pkg in (
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_path)
    except LookupError:
        pass
    else:
        continue  # resource already installed, nothing to fetch
    try:
        nltk.download(_pkg, quiet=True)
    except Exception:
        pass

# _NLTK_TOK records whether word_tokenize could be imported; tokenise()
# consults it before calling _wt.
try:
    from nltk.tokenize import word_tokenize as _wt
except ImportError:
    _NLTK_TOK = False
else:
    _NLTK_TOK = True

# Load NLTK's English stop-word list. If the corpus (or NLTK's corpus reader)
# is unavailable for any reason, fall back to a small built-in list so the
# notebook still runs offline.
try:
    from nltk.corpus import stopwords as _sw
    _NLTK_SW = set(_sw.words('english'))
except Exception:
    # Broad catch is deliberate: any failure here should degrade to the
    # built-in list rather than abort the setup cell.
    _NLTK_SW = {
        'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'him', 'his',
        'she', 'her', 'it', 'its', 'they', 'them', 'their', 'what', 'which',
        'who', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
        'did', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'as', 'of', 'at',
        'by', 'for', 'with', 'about', 'into', 'through', 'during', 'before',
        'after', 'to', 'from', 'in', 'out', 'on', 'off', 'then', 'here',
        'there', 'when', 'where', 'how', 'all', 'both', 'each', 'more',
        'other', 'some', 'than', 'so', 'also', 'any', 'now', 'per', 're',
        'just', 'very', 'still', 'back', 'even', 'well', 'way', 'much',
        'above',
        # Modal and negation entries that NLTK's English list carries.
        # Without them the fallback "standard" list and the finance-adjusted
        # list (standard minus protected modals/negations) would be
        # identical, and the Block 2 comparison would show no difference.
        'will', 'should', 'not', 'no', 'nor',
    }

# -- spaCy -------------------------------------------------------------------
# _SPACY is True only when both the spaCy import and the en_core_web_sm model
# load succeed; lemmatise() checks it before touching _nlp, which is left
# undefined on failure.
try:
    import spacy
    _nlp = spacy.load('en_core_web_sm')
except (ImportError, OSError):
    _SPACY = False
else:
    _SPACY = True

def lemmatise(tokens):
    """Return one lower-cased lemma per input token, in the same order.

    When the spaCy model is loaded (``_SPACY`` is true) the tokens are passed
    to the pipeline as an already-tokenised ``Doc``. Joining them into a
    string and re-parsing would let spaCy's own tokenizer split or merge
    words, so the result could be longer or shorter than ``tokens`` and no
    longer line up with it (callers zip the two lists together).

    Without spaCy a crude suffix-stripping heuristic is used instead; it is
    not a real lemmatiser and will mangle many words.

    Args:
        tokens: iterable of word strings.

    Returns:
        list[str] with exactly one entry per input token.
    """
    if _SPACY:
        from spacy.tokens import Doc

        words = list(tokens)
        if not words:
            return []
        # Build the Doc from our tokens, then run each pipeline component
        # (tagger, lemmatizer, ...) over it; the tokenizer is bypassed.
        doc = Doc(_nlp.vocab, words=words)
        for _name, component in _nlp.pipeline:
            doc = component(doc)
        return [t.lemma_.lower() for t in doc]
    return [_suffix_lemma(w) for w in tokens]


def _suffix_lemma(w):
    """Strip one common English suffix from *w* (fallback when spaCy is absent)."""
    if w.endswith('ies') and len(w) > 4:
        return w[:-3] + 'y'
    if w.endswith('ing') and len(w) > 5:
        return w[:-3]
    if w.endswith('ed') and len(w) > 4:
        return w[:-2]
    # Plural 's' (this also covers words ending in 'es'); keep 'ss' endings
    # such as 'business' intact.
    if w.endswith('s') and len(w) > 3 and not w.endswith('ss'):
        return w[:-1]
    return w

# -- Tokeniser ---------------------------------------------------------------
# Shared Porter stemmer: used to stem the LM dictionary and in the stemming
# demonstrations further down.
_stemmer = PorterStemmer()

def tokenise(text):
    """Lower-case *text* and return its purely alphabetic tokens.

    NLTK's word_tokenize is used when it was importable; if it is missing or
    raises (for example because its model data is absent) a plain ``[a-z]+``
    regex scan is used instead. Either way, any token that is not entirely
    alphabetic is discarded.
    """
    lowered = text.lower()
    candidates = None
    if _NLTK_TOK:
        try:
            candidates = _wt(lowered)
        except Exception:
            candidates = None
    if candidates is None:
        candidates = re.findall(r'[a-z]+', lowered)
    return list(filter(str.isalpha, candidates))

# -- Stop-word lists ---------------------------------------------------------
# NOTE(review): the stock NLTK English list appears to contain only 'will' and
# 'should' of these six modals, and only 'not', 'no', 'nor' of the negations --
# confirm with `PROTECTED & _NLTK_SW`. Protected words that are absent from
# _NLTK_SW are kept by BOTH lists below, so STD_SW and FIN_SW differ only on
# the protected words that _NLTK_SW actually contains.
MODALS    = {'will', 'may', 'could', 'should', 'would', 'must'}
NEGATIONS = {'not', 'no', 'nor', 'never', 'neither', 'without', 'cannot'}
PROTECTED = MODALS | NEGATIONS
STD_SW    = _NLTK_SW
FIN_SW    = _NLTK_SW - PROTECTED   # finance-adjusted: keeps modals and negation

# -- Loughran-McDonald word lists (representative subset) --------------------
# Hand-picked entries, grouped one word family per line. The membership has
# not been checked here against the published dictionary.
LM_NEG = {
    'loss', 'losses',
    'decline', 'decrease', 'decreased',
    'failure', 'fail', 'failed',
    'risk', 'risks',
    'uncertain', 'uncertainty', 'uncertainties',
    'adverse', 'adversely',
    'violation',
    'impair', 'impairment',
    'restate', 'restated',
    'litigation', 'default',
    'difficult', 'difficulties',
    'inability', 'unable',
    'weakness', 'weaknesses',
    'deteriorate', 'deterioration',
    'negative', 'negatively',
    'harm', 'harmful',
    'shortage',
    'concern', 'concerns',
    'volatile', 'volatility',
    'severe', 'insufficient',
    'challenge', 'challenges',
    'constrain', 'constraint', 'constraints',
    'persist', 'intervention', 'regulatory',
}
LM_POS = {
    'achieve', 'achieved', 'achievement',
    'growth', 'grew',
    'increase', 'increased', 'increasing',
    'improve', 'improved', 'improvement',
    'strong', 'strength',
    'excellent', 'outstanding',
    'positive', 'positively',
    'opportunity', 'opportunities',
    'benefit', 'benefits', 'beneficial',
    'confidence', 'confident',
    'efficient', 'effective',
    'success', 'successful',
    'generate', 'generates', 'generating',
    'elevated', 'substantially', 'significant',
    'favorable', 'favourable',
    'record',
}
# Combined dictionary, plus a Porter-stemmed copy so that stemmed tokens can
# be compared against stemmed dictionary entries.
LM_ALL   = LM_NEG | LM_POS
LM_STEMS = set(map(_stemmer.stem, LM_ALL))

# -- Dark theme --------------------------------------------------------------
BG   = '#1a1a2e'   # figure background
AXES = '#0f172a'   # axes (plot area) background
TEXT = '#e2e8f0'   # labels, ticks, titles and free text
GRID = '#2d3748'   # grid lines and axes edges

plt.rcParams.update({
    # Surfaces
    'figure.facecolor': BG,
    'axes.facecolor':   AXES,
    'axes.edgecolor':   GRID,
    # Every text-like element shares one foreground colour
    **dict.fromkeys(
        ('axes.labelcolor', 'xtick.color', 'ytick.color',
         'text.color', 'axes.titlecolor'),
        TEXT,
    ),
    # Thin grid lines along the x axis only
    'axes.grid':      True,
    'axes.grid.axis': 'x',
    'grid.color':     GRID,
    'grid.linewidth': 0.5,
    # Typography
    'font.family':    'sans-serif',
    'font.size':      9,
    'axes.titlesize': 10,
    # Hide all four spines
    **{f'axes.spines.{side}': False
       for side in ('top', 'right', 'left', 'bottom')},
})

def _colours(n):
    """Return *n* hex colours interpolated linearly from '#7c3aed' to '#1d4ed8'.

    With n == 1 the single colour is the start colour; n == 0 gives [].
    """
    start = mc.hex2color('#7c3aed')
    end = mc.hex2color('#1d4ed8')
    steps = max(n - 1, 1)  # avoids dividing by zero when n <= 1
    palette = []
    for i in range(n):
        rgb = [a + (b - a) * i / steps for a, b in zip(start, end)]
        palette.append(mc.to_hex(rgb))
    return palette

def hbar(ax, labels, values, title, xlabel='Count'):
    """Draw a horizontal bar chart on *ax* with the first label at the top.

    Args:
        ax: matplotlib Axes to draw on.
        labels: bar labels, one per bar.
        values: bar lengths, parallel to *labels*.
        title: bold axes title.
        xlabel: x-axis label.
    """
    count = len(labels)
    positions = range(count)
    ax.barh(positions, values, color=_colours(count), height=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels, fontsize=8)
    # barh places position 0 at the bottom; flip so the first entry is on top.
    ax.invert_yaxis()
    ax.set_xlabel(xlabel, fontsize=9)
    ax.set_title(title, fontsize=10, fontweight='bold', pad=8)

# Last statement of the setup cell: printed only if everything above ran.
print('Setup complete.')


The excerpt below is drawn from the Enron email corpus compiled by Klimt and Yang (2004), released into the public domain by the Federal Energy Regulatory Commission during the 2001 investigation into Enron Corporation. It is used in this module because it combines the hedged, forward-looking register of corporate financial communication with the informal directness of internal correspondence, making it a productive corpus for demonstrating why generic preprocessing pipelines are ill-suited to financial text analysis.

In [None]:
# The excerpt is written as three paragraphs and joined with a blank line
# between consecutive paragraphs.
EXCERPT = "\n\n".join((
    (
        "The California electricity markets continue to present significant opportunities "
        "for our wholesale trading operations. We believe that power prices will remain "
        "elevated through the first quarter and could increase further if supply constraints "
        "persist. Our risk management team has concluded that we should maintain current "
        "forward positions and not reduce our hedging exposure at this time."
    ),
    (
        "We are not in a position to forecast exact price movements with certainty, nor can "
        "we guarantee that our contractual obligations will be met under all market "
        "scenarios. Management believes the company is well positioned to navigate these "
        "uncertainties. Investors should be aware that forward-looking statements in this "
        "communication may differ materially from actual results."
    ),
    (
        "Revenue from gas and power trading operations increased substantially in the period "
        "under review. This increase reflects both higher commodity prices and the improved "
        "execution of our trading strategies. We must note that past performance should not "
        "be taken as indicative of future results. Strong growth in operating earnings "
        "reflects the underlying strength of our core business. We would encourage senior "
        "management to consider these results in the context of the current regulatory "
        "environment. The risk of further regulatory intervention remains significant, and "
        "market participants could face challenges if conditions deteriorate."
    ),
))

# Whitespace-delimited word count, a blank line, then the full text.
print(f'Corpus: {len(EXCERPT.split())} words.', '', EXCERPT, sep='\n')


**Block 1: Raw text and unprocessed frequency counts.**

In [None]:
# Baseline with no cleaning at all: split on whitespace only, so case and
# attached punctuation stay part of each token.
raw_tokens = EXCERPT.split()
word_counts = Counter(raw_tokens)
raw_freq = word_counts.most_common(20)
labels, values = zip(*raw_freq)

fig, ax = plt.subplots(figsize=(9, 5))
hbar(
    ax,
    list(labels),
    list(values),
    title='Top 20 Raw Tokens -- no cleaning applied',
)
plt.tight_layout()
plt.show()


This is what a word frequency model sees before any pre-processing. Consider whether this frequency distribution tells you anything meaningful about the document.

**Block 2: Stop-word removal -- standard list versus finance-adjusted list.**

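NLTK's default English stop-word list includes the modal verbs will and should and the negation terms not, no and nor; the other modals protected here (may, could, would, must) are not on NLTK's list, so they survive both filters. The finance-adjusted list removes every protected modal and negation term from the stop list, preserving them as tokens.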

In [None]:
tokens = tokenise(EXCERPT)

# Single-character tokens are dropped in both variants.
_multi_char = [t for t in tokens if len(t) > 1]

# Standard stop-word removal
std_tokens = [t for t in _multi_char if t not in STD_SW]
std_freq   = Counter(std_tokens).most_common(20)

# Finance-adjusted: protects modal verbs and negation
fin_tokens = [t for t in _multi_char if t not in FIN_SW]
fin_freq   = Counter(fin_tokens).most_common(20)

# Side-by-side top-20 charts: standard list on the left, adjusted on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
for _axis, _freq, _title in (
    (ax1, std_freq, 'Standard Stop-Word Removal'),
    (ax2, fin_freq, 'Finance-Adjusted (Modals + Negation Protected)'),
):
    _words = [w for w, _ in _freq]
    _counts = [c for _, c in _freq]
    hbar(_axis, _words, _counts, _title)
plt.tight_layout()
plt.show()


The left panel removes modal verbs. The right panel protects them. Which output would you run a sentiment analysis on?

**Block 3: Stemming versus lemmatisation, and the effect on dictionary matching.**

In [None]:
# Top content words after standard stop-word removal (3+ characters), with
# their Porter stems and lemmas.
tokens  = tokenise(EXCERPT)
content = [t for t in tokens if t not in STD_SW and len(t) >= 3]
top20   = [w for w, _ in Counter(content).most_common(20)]
stems   = [_stemmer.stem(w) for w in top20]
lemmas  = lemmatise(top20)
if len(lemmas) != len(top20):
    # The lemmatiser returned a different number of items than it was given
    # (it may re-tokenise its input). Lemmatise word by word so each table
    # row pairs a token with its own lemma.
    lemmas = [' '.join(lemmatise([w])) for w in top20]

# Label the lemma column by what actually produced it: the spaCy model, or
# the suffix-stripping fallback used when spaCy is unavailable.
lemma_label = 'SpaCy Lemma' if _SPACY else 'Fallback Lemma (no spaCy)'

# -- Table (matplotlib, dark theme) -----------------------------------------
fig_t, ax_t = plt.subplots(figsize=(9, 5))
ax_t.axis('off')
fig_t.patch.set_facecolor(BG)
ax_t.set_facecolor(BG)

tbl = ax_t.table(
    cellText=list(zip(top20, stems, lemmas)),
    colLabels=['Token', 'Porter Stem', lemma_label],
    loc='center', cellLoc='center'
)
tbl.auto_set_font_size(False)
tbl.set_fontsize(8.5)
tbl.scale(1, 1.35)
# Header row highlighted; body rows alternate between the two backgrounds.
for (row, col), cell in tbl.get_celld().items():
    if row == 0:
        cell.set_facecolor('#3730a3')
        cell.set_text_props(color='white', fontweight='bold')
    elif row % 2 == 0:
        cell.set_facecolor(AXES)
        cell.set_text_props(color=TEXT)
    else:
        cell.set_facecolor(BG)
        cell.set_text_props(color=TEXT)
    cell.set_edgecolor(GRID)

ax_t.set_title(f'Top 20 Content Words: Token vs Porter Stem vs {lemma_label}',
               fontsize=10, fontweight='bold', color=TEXT, pad=12)
plt.tight_layout()
plt.show()

# -- LM match rates ----------------------------------------------------------
# Lemmas are looked up in the raw dictionary; stems in the stemmed dictionary.
n             = len(top20)
lemma_matches = sum(1 for l in lemmas if l in LM_ALL)
stem_matches  = sum(1 for s in stems  if s in LM_STEMS)

fig_b, ax_b = plt.subplots(figsize=(8, 2.5))
fig_b.patch.set_facecolor(BG)
categories = ['Lemmatised', 'Stemmed (Porter)']
# Guard against an empty word list so the cell never divides by zero.
rates      = [m / n * 100 if n else 0.0 for m in (lemma_matches, stem_matches)]
ax_b.barh(categories, rates, color=['#7c3aed', '#1d4ed8'], height=0.45)
for i, (v, m) in enumerate(zip(rates, [lemma_matches, stem_matches])):
    ax_b.text(v + 0.5, i, f'{v:.0f}%  ({m}/{n})', va='center',
              fontsize=9, color=TEXT)
ax_b.set_xlabel('LM Dictionary Match Rate (%)', fontsize=9)
# Keep the original 0-60 window for low rates, but widen it when a bar would
# otherwise be clipped or leave no room for its text label.
ax_b.set_xlim(0, max(60, max(rates) + 15))
ax_b.set_title('Loughran-McDonald Match Rate: Lemmatised vs Stemmed',
               fontsize=10, fontweight='bold', pad=8)
plt.tight_layout()
plt.show()


The match rate difference is not a minor inconvenience. It is the difference between a model that measures sentiment and one that measures noise.

**Block 4: Full recommended pipeline -- before and after.**

In [None]:
# Before: lowercase only, no other processing
before_tokens = re.findall(r'[a-z]+', EXCERPT.lower())
before_freq   = Counter(before_tokens).most_common(20)

# Full pipeline: tokenise, finance-adjusted stop-word removal,
# lemmatise, prune tokens shorter than 3 characters
pipeline_tokens = tokenise(EXCERPT)
pipeline_tokens = [t for t in pipeline_tokens if t not in FIN_SW]
pipeline_tokens = lemmatise(pipeline_tokens)
pipeline_tokens = [t for t in pipeline_tokens if len(t) >= 3 and t.isalpha()]
after_freq      = Counter(pipeline_tokens).most_common(20)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
hbar(ax1, [w for w, _ in before_freq], [c for _, c in before_freq],
     'Before: Lowercase Only')
hbar(ax2, [w for w, _ in after_freq],  [c for _, c in after_freq],
     'After: Full Recommended Pipeline')
plt.tight_layout()
plt.show()


**Block 5: The diagnostic cell -- a deliberately misconfigured pipeline.**

In [None]:
# Deliberately misconfigured pipeline for the diagnostic exercise -- the
# choices below are intentional and must not be "corrected":
#   * standard stop-word list (no modal verb or negation protection)
#   * Porter stemmer applied
#   * no length pruning
tokens      = tokenise(EXCERPT)
bad_tokens  = [t for t in tokens if t not in STD_SW]
bad_stemmed = list(map(_stemmer.stem, bad_tokens))
bad_freq    = Counter(bad_stemmed).most_common(20)

fig, ax = plt.subplots(figsize=(9, 5))
hbar(
    ax,
    [w for w, _ in bad_freq],
    [c for _, c in bad_freq],
    'Misconfigured Pipeline: Porter Stem + Standard Stop-Words + No Pruning',
)
plt.tight_layout()
plt.show()


Identify what is wrong with this pipeline and what the consequences would be for a Loughran-McDonald sentiment analysis.

**References**

Grimmer, J., Roberts, M.E. and Stewart, B.M. (2022) *Text as Data: A New Framework for Machine Learning and the Social Sciences*. Princeton: Princeton University Press.

Klimt, B. and Yang, Y. (2004) 'The Enron Corpus: A New Dataset for Email Classification Research', in *Machine Learning: ECML 2004*, Lecture Notes in Computer Science, vol. 3201. Berlin: Springer, pp. 217--226. doi: 10.1007/978-3-540-30115-8\_22.

Loughran, T. and McDonald, B. (2011) 'When is a Liability not a Liability? Textual Analysis, Dictionaries, and 10-Ks', *Journal of Finance*, 66(1), pp. 35--65. doi: 10.1111/j.1540-6261.2010.01625.x.

Manning, C.D. and Schütze, H. (1999) *Foundations of Statistical Natural Language Processing*. Cambridge, MA: MIT Press.

Porter, M.F. (1980) 'An algorithm for suffix stripping', *Program*, 14(3), pp. 130--137. doi: 10.1108/eb046814.