In [2]:
# Import the necessary libraries
import nltk
import urllib3
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import spacy

# Load the necessary NLTK resources and Spacy model
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('punkt')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Internal packages for setting web proxy and BOE configuration, can skip if not using a Bank of England device
import boewebconnectpy
boewebconnectpy.set_boe_config()

Successfully set web config settings for urllib, urllib3, requests


In [4]:
r = urllib3.request("GET", "https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital", headers={"User-Agent": "Mozilla/5.0"})

r.status

200

In [21]:
def process_text(text, utf8=True):
    if utf8:
        text = text.decode('utf-8')
    text = text.replace('\n', ' ').replace('\r', '').replace('\t', '')
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'{[^>]+}', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.(\[a-zA-Z])', r'. \1', text)
    text  = sent_tokenize(text)
    return text

In [35]:
url_list = ["https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital",
            "https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based",
            "https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6",
            "https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system"]

all_data = pd.DataFrame(columns=['OriginalSentence', 'Source'])
for url in url_list:
    r = urllib3.request("GET", url, headers={"User-Agent": "Mozilla/5.0"})
    if r.status == 200:
        print(f"Successfully fetched data from {url}")
        sentences = process_text(r.data)
        sentences.pop(0)
        sentence_data = pd.DataFrame(sentences, columns=['OriginalSentence'])
        sentence_data['Source'] = url
        sentence_data['Line'] = sentence_data.index + 1
        all_data = pd.concat([all_data, sentence_data], ignore_index=True)
    else:
        print(f"Failed to fetch data from {url}, status code: {r.status}")

all_data = all_data.drop_duplicates().reset_index(drop=True)
# Remove last line if contains it is 'Skip to main content'
all_data = all_data[~all_data['OriginalSentence'].str.contains('Skip to main content', case=False, na=False)]


Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based
Successfully fetched data from https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system


In [7]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

print(stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [36]:
all_data["WordCount"] = all_data['OriginalSentence'].apply(lambda x: len(word_tokenize(x)))
all_data["NoStopwords"] = all_data['OriginalSentence'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stopwords]))

all_data = all_data[all_data['WordCount'] < 100]
all_data = all_data[all_data['OriginalSentence'] != 'Measuring the stability of the banking system: capital and liquidity at risk with solvency-liquidity...']

all_data.head()

Unnamed: 0,OriginalSentence,Source,Line,WordCount,NoStopwords
0,Stress testing is used by the Bank to determin...,https://www.bankofengland.co.uk/stress-testing...,1.0,29,Stress testing used Bank determine UK banking ...
1,By doing so the Bank aims to ensure banks can ...,https://www.bankofengland.co.uk/stress-testing...,2.0,82,Bank aims ensure banks absorb rather amplify s...
2,"Rather, like previous concurrent stress test s...",https://www.bankofengland.co.uk/stress-testing...,3.0,58,"Rather , like previous concurrent stress test ..."
3,"It is not a set of events that is expected, or...",https://www.bankofengland.co.uk/stress-testing...,4.0,17,"set events expected , likely , materialise ."
4,This tail risk scenario is used for the purpos...,https://www.bankofengland.co.uk/stress-testing...,5.0,23,tail risk scenario used purposes enhancing fin...


In [29]:
# Read in a file containing text to be tested against
test = open("Stress Test 2026.txt", "r")
test_data = test.read()
test.close()

test = process_text(test_data, utf8=False)
test_df = pd.DataFrame(test, columns=['OriginalSentence'])
test_df = test_df.head()

In [30]:
def check_similarity(sentence, all_data, min_threshold=0.75, perfect_threshold = 0.9, stopword = False):
    if stopword == False:
        sentence = sentence.join([word for word in word_tokenize(sentence) if word.lower() not in stopwords])
    doc1 = nlp(sentence)
    max_similarity = 0
    for idx, row in all_data.iterrows():
        if stopword == False:
            doc2 = nlp(row['NoStopwords'])
        else:
            doc2 = nlp(row['OriginalSentence'])
        similarity = doc1.similarity(doc2)
        if similarity > max_similarity:
            max_similarity = similarity
            if max_similarity >= perfect_threshold and max_similarity > min_threshold:
                Similar = row['OriginalSentence']
                SimilarSource = row['Source']
                SimilarLine = row['Line']
                SimilarityScore = max_similarity
                break
    if max_similarity > min_threshold:
        Similar = row['OriginalSentence']
        SimilarSource = row['Source']
        SimilarLine = row['Line']
        SimilarityScore = max_similarity
    else:
        Similar = "None found"
        SimilarSource = None
        SimilarLine = None
        SimilarityScore = None
    return Similar, SimilarSource, SimilarLine, SimilarityScore

In [31]:
test_df[["Similar", "SimilarSource", "SimilarLine", "SimilarityScore"]] = test_df['OriginalSentence'].apply(lambda x: check_similarity(x, all_data, min_threshold=0.8, perfect_threshold=0.98)).apply(pd.Series)

In [None]:
# Extract out numbers
test_df['OriginalNumeric'] = test_df['OriginalSentence'].apply(lambda x: nltk.pos_tag(word_tokenize(x))).apply(lambda x: ' '.join([word for word, pos in x if pos in ['CD']]))
test_df['SimilarNumeric'] = test_df['Similar'].apply(lambda x: nltk.pos_tag(word_tokenize(x))).apply(lambda x: ' '.join([word for word, pos in x if pos in ['CD']]))

In [None]:
nltk.pos_tag(word_tokenize("The bank has £50 billion in assets and 20 million customers."))


[('The', 'DT'),
 ('bank', 'NN'),
 ('has', 'VBZ'),
 ('£50', 'VBN'),
 ('billion', 'CD'),
 ('in', 'IN'),
 ('assets', 'NNS'),
 ('and', 'CC'),
 ('20', 'CD'),
 ('million', 'CD'),
 ('customers', 'NNS'),
 ('.', '.')]