In [None]:
# Import the necessary libraries
import nltk
import urllib3
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import spacy

# Load the necessary NLTK resources and Spacy model
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('punkt')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Internal packages for setting web proxy and BOE configuration, can skip if not using a Bank of England device
import boewebconnectpy
boewebconnectpy.set_boe_config()

Successfully set web config settings for urllib, urllib3, requests


In [10]:
r = urllib3.request("GET", "https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital", headers={"User-Agent": "Mozilla/5.0"})

r.status

200

In [42]:
def process_text(text, utf8=True):
    if utf8:
        text = text.decode('utf-8')
    text = text.replace('\n', ' ').replace('\r', '').replace('\t', '')
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'{[^>]+}', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.(\w)', r'. \1', text)
    text  = sent_tokenize(text)
    return text

In [43]:
url_list = ["https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital",
            "https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based",
            "https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6",
            "https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system"]

all_data = pd.DataFrame(columns=['OriginalSentence', 'Source'])
for url in url_list:
    r = urllib3.request("GET", url, headers={"User-Agent": "Mozilla/5.0"})
    if r.status == 200:
        print(f"Successfully fetched data from {url}")
        sentences = process_text(r.data)
        sentences.pop(0)
        sentence_data = pd.DataFrame(sentences, columns=['OriginalSentence'])
        sentence_data['Source'] = url
        sentence_data['Line'] = sentence_data.index + 1
        all_data = pd.concat([all_data, sentence_data], ignore_index=True)
    else:
        print(f"Failed to fetch data from {url}, status code: {r.status}")

all_data = all_data.drop_duplicates().reset_index(drop=True)
# Remove last line if contains it is 'Skip to main content'
all_data = all_data[~all_data['OriginalSentence'].str.contains('Skip to main content', case=False, na=False)]


Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based
Successfully fetched data from https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system


In [14]:
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

print(stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [45]:
all_data["WordCount"] = all_data['OriginalSentence'].apply(lambda x: len(word_tokenize(x)))
all_data["NoStopwords"] = all_data['OriginalSentence'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stopwords]))

all_data = all_data[all_data['WordCount'] < 100]

all_data.head()

Unnamed: 0,OriginalSentence,Source,Line,WordCount,NoStopwords
0,"length>0)if(a&&""performance""in a&&a.",https://www.bankofengland.co.uk/stress-testing...,1.0,18,length > 0 ) ( & & '' performance '' & & .
1,"performance&&""function""==typeof a. performance.",https://www.bankofengland.co.uk/stress-testing...,2.0,10,performance & & '' function '' ==typeof a. per...
2,setResourceTimingBufferSize)a. performance.,https://www.bankofengland.co.uk/stress-testing...,3.0,5,setResourceTimingBufferSize ) a. performance .
3,setResourceTimingBufferSize();!function() Publ...,https://www.bankofengland.co.uk/stress-testing...,4.0,40,setResourceTimingBufferSize ( ) ; ! function (...
4,Stress testing is used by the Bank to determin...,https://www.bankofengland.co.uk/stress-testing...,5.0,29,Stress testing used Bank determine UK banking ...


In [33]:
# Read in a file containing text to be tested against
test = open("Stress Test 2026.txt", "r")
test_data = test.read()
test.close()

test = process_text(test_data, utf8=False)
test_df = pd.DataFrame(test, columns=['OriginalSentence'])
test_df.head()

Unnamed: 0,OriginalSentence
0,Hypothetical Stress Test Scenario: Severe Glob...
1,Bank of England 2026 stress test scenario Over...
2,"The scenario is not a forecast, but a coherent..."
3,Scenario Narrative In this hypothetical scenar...
4,"Equity markets fall by 45%, corporate bond spr..."


In [None]:
def check_similarity(sentence, all_data, threshold=0.75):
    doc1 = nlp(sentence)
    results = []
    max_similarity = 0
    for idx, row in all_data.iterrows():
        doc2 = nlp(row['OriginalSentence'])
        similarity = doc1.similarity(doc2)
        if similarity > max_similarity:
            max_similarity = similarity
    if max_similarity >= threshold:
        results.append((row['OriginalSentence'], row['Source'], row['Line'], similarity))
    else:
        results.append(("No similar sentence found. Check other sources.", None, None, None))
    return results

In [46]:
check_similarity("The Bank of England has announced new stress testing scenarios for 2025.", all_data, threshold=0.7)

[('Measuring the stability of the banking system: capital and liquidity at risk with solvency-liquidity...',
  'https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system',
  0.8102744221687317)]

In [None]:
test_df["SimilarityCheck"] = test_df['OriginalSentence'].apply(lambda x: check_similarity(x, all_data, threshold=0.8))