In [54]:
# Import the necessary libraries
import nltk
import urllib3
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import spacy

# Load the necessary NLTK resources and Spacy model
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')
nltk.download('punkt')
nlp = spacy.load("en_core_web_lg")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\339755\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
# Internal packages for setting web proxy and BOE configuration, can skip if not using a Bank of England device
import boewebconnectpy
boewebconnectpy.set_boe_config()

Successfully set web config settings for urllib, urllib3, requests


In [None]:
# Define a function to process the text
def process_text(text, utf8=True):
    '''
    Process the input text by cleaning and tokenizing it into sentences.
    Parameters:
        text (str): The input text to be processed.
        utf8 (bool): Flag indicating if the text is in UTF-8 encoding and needs decoding.
    '''
    if utf8:
        text = text.decode('utf-8')
    text = text.replace('\n', ' ').replace('\r', '').replace('\t', '')
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'{[^>]+}', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\.(\[a-zA-Z])', r'. \1', text)
    text  = sent_tokenize(text)
    return text

In [None]:
# Create a list of relevant URLs to scrape based on the publication
url_list = ["https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital",
            "https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based",
            "https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6",
            "https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system"]

# Initialise the dataframe to hold all sentences
all_data = pd.DataFrame(columns=['OriginalSentence', 'Source', 'Line'])
for url in url_list:
    # Fetch the webpage content
    r = urllib3.request("GET", url, headers={"User-Agent": "Mozilla/5.0"})
    # Check if the request was successful
    if r.status == 200:
        # Print a success message
        print(f"Successfully fetched data from {url}")
        # Process the text
        sentences = process_text(r.data)
        # Remove the first line as contains the webpage information
        sentences.pop(0)
        # Create a dataframe for the sentences
        sentence_data = pd.DataFrame(sentences, columns=['OriginalSentence'])
        # Add source URL and line number
        sentence_data['Source'] = url
        sentence_data['Line'] = sentence_data.index + 1
        # Append to the main dataframe
        all_data = pd.concat([all_data, sentence_data], ignore_index=True)
    else:
        # Print an error message with code for diagnostics
        print(f"Failed to fetch data from {url}, status code: {r.status}")

# Remove any duplicate sentences
all_data = all_data.drop_duplicates().reset_index(drop=True)
# Remove last line if contains it is 'Skip to main content' as is the bottom page navigation
all_data = all_data[~all_data['OriginalSentence'].str.contains('Skip to main content')]


Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2025/key-elements-bank-capital
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/stress-testing-uk-banking-system-scenarios-2024-desk-based
Successfully fetched data from https://www.bankofengland.co.uk/financial-stability-report/2024/november-2024#section6
Successfully fetched data from https://www.bankofengland.co.uk/stress-testing/2024/boes-approach-to-stress-testing-the-uk-banking-system


In [None]:
# Load in the English stopwords from NLTK
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

198

In [None]:
# Count the number of the words in the original sentence
all_data["WordCount"] = all_data['OriginalSentence'].apply(lambda x: len(word_tokenize(x)))
# Remove the stop words from the sentence to reduce noise
all_data["NoStopwords"] = all_data['OriginalSentence'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stopwords]))
# Use pos tag to remove all numeric values including those in text form
all_data['Numeric'] = all_data['OriginalSentence'].apply(lambda x: nltk.pos_tag(word_tokenize(x))).apply(lambda x: ' '.join([word for word, pos in x if pos in ['CD']]))

# Remove very long sentences and any known outliers
all_data = all_data[all_data['WordCount'] < 100]
all_data = all_data[all_data['OriginalSentence'] != 'Measuring the stability of the banking system: capital and liquidity at risk with solvency-liquidity...']

all_data.head()

Unnamed: 0,OriginalSentence,Source,Line,WordCount,NoStopwords,Numeric
0,Stress testing is used by the Bank to determin...,https://www.bankofengland.co.uk/stress-testing...,1.0,29,Stress testing used Bank determine UK banking ...,
1,By doing so the Bank aims to ensure banks can ...,https://www.bankofengland.co.uk/stress-testing...,2.0,82,Bank aims ensure banks absorb rather amplify s...,2025.0
2,"Rather, like previous concurrent stress test s...",https://www.bankofengland.co.uk/stress-testing...,3.0,58,"Rather , like previous concurrent stress test ...",
3,"It is not a set of events that is expected, or...",https://www.bankofengland.co.uk/stress-testing...,4.0,17,"set events expected , likely , materialise .",
4,This tail risk scenario is used for the purpos...,https://www.bankofengland.co.uk/stress-testing...,5.0,23,tail risk scenario used purposes enhancing fin...,


In [None]:
# Read in a file containing text to be tested against
test = open("Stress Test 2026.txt", "r")
test_data = test.read()
test.close()

# Process the test text and save to a dataframe
test = process_text(test_data, utf8=False)
test_df = pd.DataFrame(test, columns=['OriginalSentence'])

# Extract out numbers
test_df['OriginalNumeric'] = test_df['OriginalSentence'].apply(lambda x: nltk.pos_tag(word_tokenize(x))).apply(lambda x: ' '.join([word for word, pos in x if pos in ['CD']]))
# Create a dataframe with only numeric values
numeric_df = test_df.copy()
numeric_df = numeric_df[numeric_df['OriginalNumeric'] != '']

In [None]:
def check_similarity(sentence, all_data, min_threshold=0.75, perfect_threshold = 0.9, stopword = False):
    '''
    Check the sentence against each sentence in the all_data dataframe for similarity using Spacy's NLP model.
    Parameters:
        sentence (str): The input sentence to be checked
        all_data (DataFrame): The dataframe containing sentences to compare against
        min_threshold (float): The minimum similarity threshold to consider a match
        perfect_threshold (float): The similarity threshold to consider a perfect match and stop searching for more
        stopword (bool): Flag indicating whether to include stopwords in the comparison
    Returns:
        Similar (str): The most similar sentence found
        SimilarSource (str): The source URL of the most similar sentence
        SimilarLine (int): The line number of the most similar sentence
        SimilarityScore (float): The similarity score of the most similar sentence
    '''
    # If stopwords are to be removed, filter them out from the input sentence
    if stopword == False:
        sentence = sentence.join([word for word in word_tokenize(sentence) if word.lower() not in stopwords])
    # Convert the input sentence to a Spacy document
    doc1 = nlp(sentence)
    # Initialise the maximum similarity score
    max_similarity = 0
    # Iterate through each row in the all_data dataframe to compute similarity
    for idx, row in all_data.iterrows():
        # Convert the current sentence to a Spacy document, with or without stopwords based on the flag
        if stopword == False:
            doc2 = nlp(row['NoStopwords'])
        else:
            doc2 = nlp(row['OriginalSentence'])
        # Calculate the similarity score between the two documents
        similarity = doc1.similarity(doc2)
        # Update the maximum similarity score and store relevant information if a new maximum is found
        if similarity > max_similarity:
            max_similarity = similarity
            if max_similarity >= perfect_threshold and max_similarity > min_threshold:
                # If a perfect match is found, break the loop early and return the result
                Similar = row['OriginalSentence']
                SimilarSource = row['Source']
                SimilarLine = row['Line']
                SimilarityScore = max_similarity
                break
    # After checking all sentences, return the result if above the minimum threshold, otherwise indicate no match found
    if max_similarity > min_threshold:
        Similar = row['OriginalSentence']
        SimilarSource = row['Source']
        SimilarLine = row['Line']
        SimilarityScore = max_similarity
    else:
        Similar = "None found"
        SimilarSource = None
        SimilarLine = None
        SimilarityScore = None
    return Similar, SimilarSource, SimilarLine, SimilarityScore

In [None]:
# Apply the similarity check function to each sentence in the test dataframe and store the results
test_df[["Similar", "SimilarSource", "SimilarLine", "SimilarityScore"]] = test_df['OriginalSentence'].apply(lambda x: check_similarity(x, all_data, min_threshold=0.8, perfect_threshold=0.98)).apply(pd.Series)

In [None]:
def check_numeric_match(row, all_data):
    '''
    Check the sentence against each sentence in the all_data dataframe for similarity using Spacy's NLP model.
    Parameters:
        row (Series): The input row containing the sentence and its numeric values
        all_data (DataFrame): The dataframe containing sentences to compare against
    Returns:
        Match (float): The numeric match found
        Similar (str): The most similar sentence found
        SimilarSource (str): The source URL of the most similar sentence
        SimilarLine (int): The line number of the most similar sentence
        SimilarityScore (float): The similarity score of the most similar sentence
    '''
    # Filter the all_data dataframe to only include rows with matching numeric values
    all_data['NumericMatch'] = all_data['Numeric'].apply(lambda x: re.search(row['OriginalNumeric'], x) is not None if x else False)
    # Create a copy for numeric matches
    numeric_matches = all_data.copy()
    # Filter only for numeric matches
    numeric_matches = numeric_matches[numeric_matches['NumericMatch'] == True]
    # If none found return none
    if numeric_matches.empty:
        return 'None found', None, None, None, None
    # Calculate similarity for the numeric matches
    doc1 = nlp(row['OriginalSentence'])
    numeric_matches['Similarity'] = numeric_matches['OriginalSentence'].apply(lambda x: nlp(x).similarity(doc1))
    # Get the best match based on similarity
    best_match = numeric_matches.loc[numeric_matches['Similarity'].idxmax()]
    # Save the results
    Match = best_match['Numeric']
    Similar = best_match['OriginalSentence']
    SimilarSource = best_match['Source']
    SimilarLine = best_match['Line']
    SimilarityScore = best_match['Similarity']
    return Match, Similar, SimilarSource, SimilarLine, SimilarityScore

In [None]:
# Find the numeric matches
numeric_df[["NumericMatch", "SimilarNumeric", "NumericSimilarSource", "NumericSimilarLine", "NumericSimilarityScore"]] = numeric_df.apply(lambda x: check_numeric_match(x, all_data), axis=1).apply(pd.Series)

In [None]:
# Save results to a CSV file
test_df[['AnalystNotes', 'AnalystSignOff', 'SignOffDate', 'IssuesObserved', 'AlternativeSourceUsed']] = ""
numeric_df[['AnalystNotes', 'AnalystSignOff', 'SignOffDate', 'IssuesObserved', 'AlternativeSourceUsed']] = ""
# Open the file to write to multiple sheets
with pd.ExcelWriter('StressTest2026.xlsx') as excel:
    test_df.to_excel(excel, sheet_name='Text', index= False)
    numeric_df.to_excel(excel, sheet_name='Numeric', index = False)