In [None]:
# check python version - this script works with 3.11.5, 3.9.7

!python --version

In [None]:
# IF USING GPU: check GPU version

!nvidia-smi

In [None]:
# Installations if required

#!python -m pip install -U setuptools pip
#!pip install spacy
#!pip install -U 'spacy[cuda12x]' # for GPU
#!pip install cupy-cuda12x # for GPU
#!pip install cupy # for GPU

# Numpy version 1.22.0 works best with this script

# !pip install numpy==1.22.0

In [None]:
# IF USING GPU: This should return True

# spacy is imported here as well as in the main import cell further down:
# this cell runs first on a fresh kernel (Restart & Run All) and would
# otherwise fail with "NameError: name 'spacy' is not defined".
import spacy

print(spacy.prefer_gpu())

In [None]:
# import other required packages

import spacy
import pandas as pd
import json
import random
import re
import string
import numpy as np
 
from spacy.tokens import Doc
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
from spacy.pipeline.spancat import SpanCategorizer
from spacy.pipeline import SpanCategorizer
from spacy import displacy

from tqdm import tqdm

from sklearn.model_selection import train_test_split

In [None]:
# check spacy version - this script works with version 3.7.0, 3.7.4, 3.7.2
# (prints the version string of the spacy package imported above)

print(spacy.__version__)

In [None]:
# model
# Identifier of the model/data variant. It is concatenated into the input CSV
# names and into every output file name written later in this notebook.

model = "m6"

In [None]:
# import train and test dataset
# The file names are built from `model`, e.g. pt_level_train_set_m6.csv when model == "m6".

df_train = pd.read_csv('pt_level_train_set_'+model+'.csv')
df_test = pd.read_csv('pt_level_test_set_'+model+'.csv')

In [None]:
# check the dataframe - show the first two rows of the training set

df_train.head(2)

In [None]:
# check the length of the dataframe (number of rows in the training set)

len(df_train)

In [None]:
# list the column names; later cells read 'TextContent', 'Text' and 'label'
list(df_train.columns)

In [None]:
# Convert column to string type - to avoid errors saying float type does not have x function
# NOTE(review): astype(str) presumably also turns missing values (NaN) into the literal
# text 'nan', which later steps would treat as real text - confirm there are no missing values.

df_train['TextContent'] = df_train['TextContent'].astype(str)
df_train['Text'] = df_train['Text'].astype(str)


In [None]:
# Character length of every document (TextContent or extracted_sentence)
df_train['document_length'] = df_train['TextContent'].apply(len)

# Summary statistics of the document lengths
doc_lengths = df_train['document_length']
min_length, max_length = doc_lengths.min(), doc_lengths.max()
median_length, mean_length = doc_lengths.median(), doc_lengths.mean()

# Report the statistics, one per line
for caption, value in (
    ("Minimum Length:", min_length),
    ("Maximum Length:", max_length),
    ("Median Length:", median_length),
    ("Mean Length:", mean_length),
):
    print(caption, value)


In [None]:
# check label distribution (number of training rows per label)

df_train['label'].value_counts()

In [None]:
#1. Common text preprocessing
text = " Â  This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs   80/120  .  "
 
#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    """Normalise a piece of text before span matching.

    Steps, in order: lowercase, strip, remove HTML tags, replace ASCII
    punctuation with a space, drop any remaining non-word/non-space symbol,
    drop the mojibake character 'â', collapse whitespace runs to one space
    and strip again.

    Args:
        text: The text to clean. Non-string values are converted with str().

    Returns:
        The cleaned, lowercased text with single spaces between words.
    """
    text = str(text).lower().strip()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags/markups
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # Replace punctuation with space
    text = re.sub(r'[^\w\s]', '', text)  # Drop symbols that are not in string.punctuation
    # After lowercasing, a mis-encoded 'Â' has become 'â', so only 'â' needs removing.
    # It is removed BEFORE the whitespace collapse so it cannot leave a double space behind.
    text = text.replace('â', '')
    text = re.sub(r'\s+', ' ', text)  # Collapse spaces, tabs and newlines (raw string: no invalid escape)
    return text.strip()
 
# Run the cleaner on the sample string defined above and show the result
text=preprocess(text)
print(text)  #text is a string

In [None]:
# add or remove columns that require pre-processing
# Both columns go through the same preprocess() so the span text ('Text') can still be
# located inside the document text ('TextContent') with str.find() in the cells below.

df_train['Text'] = df_train['Text'].apply(lambda x: preprocess(x))
df_train['TextContent'] = df_train['TextContent'].apply(lambda x: preprocess(x))


In [None]:
# Define a function to extract context around span - this is relevant if using context around span approach - if not, ignore

def extract_context_around_span(text, span, context_size=200):#works well with 200
    """Return the first occurrence of `span` in `text` plus surrounding characters.

    Args:
        text: The document to search in.
        span: The substring to look for.
        context_size: Number of characters to keep on each side of the span
            (fewer when the span is close to the start or end of `text`).

    Returns:
        The slice of `text` around the span, or None when `span` is not found.
    """
    position = text.find(span)
    if position == -1:
        # Span is not in the document at all
        return None

    # Clamp the window to the bounds of the document
    left = max(0, position - context_size)
    right = min(len(text), position + len(span) + context_size)
    return text[left:right]

In [None]:
# Apply the function to create a new 'ContextAroundSpan' column - ignore if you are not using the context approach
# Rows whose span is not found in the document get None (see extract_context_around_span).

df_train['ContextAroundSpan'] = df_train.apply(lambda row: extract_context_around_span(row['TextContent'], row['Text']), axis=1) # Text or NewText

In [None]:
# Convert column to string type - to avoid errors saying float type does not have x function
# Rows where the span was not found hold None. They are replaced by an empty string first:
# a plain astype(str) would turn them into the text 'None', which can then be searched
# (and matched, e.g. by the span 'one') as if it were real document text.

df_train['ContextAroundSpan'] = df_train['ContextAroundSpan'].fillna('').astype(str)

In [None]:
# check the dataframe - the new 'ContextAroundSpan' column should now be present

df_train.head(2)

In [None]:
# Character length of every context window - ignore if you are not using the context approach
df_train['context_length'] = df_train['ContextAroundSpan'].apply(len)

# Summary statistics of the context lengths
context_lengths = df_train['context_length']
min_length, max_length = context_lengths.min(), context_lengths.max()
median_length, mean_length = context_lengths.median(), context_lengths.mean()

# Report the statistics, one per line
for caption, value in (
    ("Minimum Length:", min_length),
    ("Maximum Length:", max_length),
    ("Median Length:", median_length),
    ("Mean Length:", mean_length),
):
    print(caption, value)

In [None]:
# Character length of every span
df_train['text_length'] = df_train['Text'].apply(len)

# Summary statistics of the span lengths
span_lengths = df_train['text_length']
min_length, max_length = span_lengths.min(), span_lengths.max()
median_length, mean_length = span_lengths.median(), span_lengths.mean()

# Report the statistics, one per line
for caption, value in (
    ("Minimum Length:", min_length),
    ("Maximum Length:", max_length),
    ("Median Length:", median_length),
    ("Mean Length:", mean_length),
):
    print(caption, value)

In [None]:
# Function to find start and end indices of 'Text' within 'TextContent' or 'NewText' within 'ContextAroundSpan' or 'Text' within 'extracted_sentence'

# if you are using the context apprach then use the columns ContextAroundSpan and NewText.
# if you are using the extracted sentence approach then use the columns extracted_sentence and Text
# else, use TextContent and Text

def find_indices(row, text_col='ContextAroundSpan', span_col='Text'):
    """Locate the span text inside the document text of one dataframe row.

    Args:
        row: A mapping / pandas row holding both columns.
        text_col: Column with the text to search in
            (TextContent / ContextAroundSpan / extracted_sentence).
        span_col: Column with the span to look for (Text / NewText).

    Returns:
        (start_index, end_index) as character offsets of the first occurrence,
        with end_index exclusive, or (-1, -1) when the span does not occur.

    Other column pairs can be used without editing this function, e.g.
    df.apply(find_indices, axis=1, result_type='expand', text_col='TextContent').
    """
    start_index = row[text_col].find(row[span_col])
    if start_index == -1:
        # Not found: return a consistent sentinel instead of an end index
        # computed from -1, which would look like a valid offset.
        return -1, -1
    return start_index, start_index + len(row[span_col])

In [None]:
# check some samples - full (preprocessed) document text of the entry at index 6

df_train['TextContent'][6]

In [None]:
# span text of the same entry (index 6)
df_train['Text'][6]

In [None]:
# context window extracted around the span for the same entry (index 6)
df_train['ContextAroundSpan'][6] # ignore if you are not using the context approach

In [None]:
# Apply the function to create new columns for start and end indices - make sure you have updated the find_indices function accordingly for the text and span columns
# result_type='expand' spreads the returned (start, end) pair over the two new columns.

df_train[['start_index', 'end_index']] = df_train.apply(find_indices, axis=1, result_type='expand')
df_train.head(2)

In [None]:
# distribution of labels as fractions of all rows (normalize=True) - 'Domain ' or 'label'

df_train['label'].value_counts(normalize=True)

In [None]:
# distribution of labels - absolute number of rows per label

df_train['label'].value_counts()

In [None]:
# check the find_indices function worked: recompute the start offset by hand for the entry at index 13

start_index = df_train['ContextAroundSpan'][13].find(df_train['Text'][13]) # 'TextContent' or 'ContextAroundSpan' or 'extracted_sentence'
start_index

In [None]:
# end offset = start offset + length of the span text
end_index = start_index + len(df_train['Text'][13])
end_index

In [None]:
# stored value from find_indices - compare with the start_index computed by hand above
df_train['start_index'][13]

In [None]:
# Load the base SpaCy model - Here, we load a base SpaCy English model without pre-trained word vectors
# This pipeline object is used below to tokenize the training texts (nlp.make_doc).

nlp = spacy.blank("en")

# define your span key name
span_key = "sc"

TRAINING SET PREPARATION


In [None]:
# there should be no errors here - convert the dataframe to spacy Example format

train_data = []

for index, row in df_train.iterrows():# UPDATE - df for context approach or train_df for sentence approach
    sentence = row['ContextAroundSpan'] # ContextAroundSpan or TextContent or extracted_sentence
    label = row['label']  # 'Domain ' or 'label'
    start = row['start_index']
    end = row['end_index']

    # Skip rows whose span was not found within the sentence
    if start == -1 or end > len(sentence):
        continue

    # Tokenize once. The earlier version also called offsets_to_biluo_tags() on a
    # second make_doc(sentence); its result was never used, so it is dropped.
    doc = nlp.make_doc(sentence)

    # Entity tuple, such as entities = [(3389, 3412, "ADL")]
    entities = [(start, end, label)]

    # "entities" gives the reference doc BILUO tags; "spans" stores the same span under
    # span_key, the key the spancat component is configured with (spans_key).
    # Using only {'spans': {span_key: [...]}} doesn't give BIO tags.
    example = Example.from_dict(doc, {"entities": entities, "spans": {span_key: [(start, end, label)]}})
    train_data.append(example)

In [None]:
# check the length in Example format is the same as the length of the dataframe
# (it is smaller when rows were skipped because their span was not found)

len(train_data)

In [None]:
# check the start index again to make sure it is the same as in the previous check

df_train['start_index'][13]

In [None]:
# look at one of them to make sure entities has the BILUO tags and spans has the values start, end, label

train_data[13]

INITIATE SPANCAT

https://medium.com/data-analytics-at-nesta/a-deep-dive-into-spacys-span-categorisation-model-992024d047c2

In [None]:
# Load the base SpaCy model - Here, we load a base SpaCy English model without pre-trained word vectors
# This replaces the `nlp` object that was used to build train_data with a fresh blank
# pipeline, to which the spancat component is added in the next cell.

nlp = spacy.blank("en")

# define your span key name - we can also just leave it at entities as will be seen in the prep of train_data?
span_key = "sc"

In [None]:
#spancat config
config = {
    #this refers to the minimum probability to consider a prediction positive
    "threshold": 0.5,
    #the span key refers to the key in doc.spans
    "spans_key": span_key,
    #this refers to the maximum number of labels to consider positive per span
    "max_positive": None,
    #a model instance that is given a list of documents with start end indices representing the labelled spans
    "model": DEFAULT_SPANCAT_MODEL, # it was DEFAULT_SPANCAT_MODEL, try spacy.SpanCategorizer.v1 didnt work-try DEFAULT_SPANCAT_SINGLELABEL_MODEL-weird preds
    #A function that suggests candidate spans: every n-gram of at least min_size and
    #at most max_size tokens (both inclusive), i.e. 1 to 30 tokens here.
    "suggester": {"@misc": "spacy.ngram_range_suggester.v1", "min_size":1, "max_size":30},
}
#add spancat component to nlp object
nlp.add_pipe("spancat", config=config)
#get spancat component
span=nlp.get_pipe('spancat')

#Add labels to spancat component.
#sorted() fixes the label order: iterating a bare set of strings can give a
#different order in every kernel session, which makes runs hard to compare.
for label in sorted(set(df_train['label'])): # 'Domain ' or 'label'
    span.add_label(label)

# Background (https://spacy.io/api/spancategorizer):
# This component comes in two forms: spancat and spancat_singlelabel (added in spaCy v3.5.1).
# When you need to perform multi-label classification on your spans, use spancat.
# The spancat component uses a Logistic layer where the output class probabilities are
# independent for each class. However, if you need to predict at most one true class for a
# span, then use spancat_singlelabel. It uses a Softmax layer and treats the task as a
# multi-class problem.

In [None]:
#get pipe you want to train on
pipe_exceptions = ["spancat"]
# every other pipe is disabled while training
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Fix the random seeds before the weights are initialised so that initialisation and
# the per-epoch shuffling give the same result on every Restart & Run All.
spacy.util.fix_random_seed(0)

# initialise spacy object
nlp.initialize()
sgd = nlp.create_optimizer()

In [None]:
# Get date and time just before start of training

# NOTE(review): this binds the name `datetime` to the module; a later cell rebinds the
# same name to the datetime class via `from datetime import datetime`.
import datetime

# Get the current date and time
now = datetime.datetime.now()

# Display a message indicating what is being printed
print("Current date and time : ")

# Print the current date and time in a specific format
print(now.strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
#start training the spancat component

N_ITERATIONS = 150  # passes over train_data (10, 20, 50 were tried; losses don't change much after ~20)
DROPOUT = 0.1       # dropout rate used for every update

all_losses = []
# select_pipes(disable=...) is the spaCy v3 replacement for the deprecated nlp.disable_pipes(*names)
with nlp.select_pipes(disable=unaffected_pipes):
    for iteration in tqdm(range(N_ITERATIONS)): # tqdm displays a progress bar over the iterations
        # shuffling examples before every iteration
        random.shuffle(train_data)
        losses = {}
        # minibatch generates the batches; compounding yields batch sizes that grow
        # from 4 towards 32 by a factor of 1.001 per batch (restarted every iteration)
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(list(batch), losses=losses, drop=DROPOUT, sgd=sgd)
        print("epoch: {} Losses: {}".format(iteration, str(losses)))
        # keep one spancat loss value per iteration for the overfitting check
        all_losses.append(losses['spancat'])

In [None]:
# Save the losses as csv to examine for overfit
# all_losses holds one spancat loss value per training iteration.
# Note: `losses` is rebound here from the per-epoch dict to a DataFrame.

losses = pd.DataFrame(all_losses)
losses.to_csv('all_losses_'+model+'.csv')

In [None]:
# Save the trained model to a directory named after the `model` identifier

nlp.to_disk("trained_spancat_model_"+model+"_bypt")

In [None]:
# save a txt file with the date and time when the training ends

# Import the module (not `from datetime import datetime`): an earlier cell uses
# `datetime.datetime.now()`, and rebinding the name to the class would break it
# when cells are re-run out of order.
import datetime

# get current date and time
current_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
print("Current date & time : ", current_datetime)

# strftime already returns a string; kept for clarity
str_current_datetime = str(current_datetime)

# create an empty marker file whose name carries the completion time and the model id
file_name = "timecompletion"+str_current_datetime+model+".txt"  # UPDATE MODEL NUMBER
# the context manager closes the file even if printing fails
with open(file_name, 'w') as file:
    print("File created : ", file.name)

In [None]:
# Load the trained model back from the directory written by nlp.to_disk above

nlp = spacy.load("trained_spancat_model_"+model+"_bypt")

In [None]:
# Load the test data
# (the same file was already read into df_test near the top of the notebook;
# reading it again discards any changes made to df_test since then)

df_test = pd.read_csv('pt_level_test_set_'+model+'.csv')
df_test.head(2)

In [None]:
# number of rows in the test set
len(df_test)

In [None]:
# Define a function to extract context around span - this is relevant if using context around span approach - if not, ignore

def extract_context_around_span(text, span, context_size=200):#works well with 200
    """Return the first occurrence of `span` in `text` plus surrounding characters.

    This repeats the definition from the training section of the notebook.

    Args:
        text: The document to search in.
        span: The substring to look for.
        context_size: Number of characters to keep on each side of the span
            (fewer when the span is close to the start or end of `text`).

    Returns:
        The slice of `text` around the span, or None when `span` is not found.
    """
    position = text.find(span)
    if position == -1:
        # Span is not in the document at all
        return None

    # Clamp the window to the bounds of the document
    left = max(0, position - context_size)
    right = min(len(text), position + len(span) + context_size)
    return text[left:right]

In [None]:
# Convert column to string type - to avoid errors saying float type does not have x function
# NOTE(review): astype(str) presumably also turns missing values (NaN) into the literal
# text 'nan', which later steps would treat as real text - confirm there are no missing values.

df_test['TextContent'] = df_test['TextContent'].astype(str)
df_test['Text'] = df_test['Text'].astype(str)

In [None]:
# Character length of every test document
df_test['document_length'] = df_test['TextContent'].apply(len)

# Summary statistics of the document lengths
doc_lengths = df_test['document_length']
min_length, max_length = doc_lengths.min(), doc_lengths.max()
median_length, mean_length = doc_lengths.median(), doc_lengths.mean()

# Report the statistics, one per line
for caption, value in (
    ("Minimum Length:", min_length),
    ("Maximum Length:", max_length),
    ("Median Length:", median_length),
    ("Mean Length:", mean_length),
):
    print(caption, value)

In [None]:
# number of test rows per label
df_test['label'].value_counts()

In [None]:
# share of test rows per label (normalize=True gives fractions instead of counts)
df_test['label'].value_counts(normalize=True)

In [None]:
#1. Common text preprocessing
text = " Â  This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs   80/120  .  "
 
#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    """Normalise a piece of text before span matching.

    This repeats the cleaning function from the training section so that test
    and training text are normalised the same way.

    Steps, in order: lowercase, strip, remove HTML tags, replace ASCII
    punctuation with a space, drop any remaining non-word/non-space symbol,
    drop the mojibake character 'â', collapse whitespace runs to one space
    and strip again.

    Args:
        text: The text to clean. Non-string values are converted with str().

    Returns:
        The cleaned, lowercased text with single spaces between words.
    """
    text = str(text).lower().strip()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags/markups
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # Replace punctuation with space
    text = re.sub(r'[^\w\s]', '', text)  # Drop symbols that are not in string.punctuation
    # After lowercasing, a mis-encoded 'Â' has become 'â', so only 'â' needs removing.
    # It is removed BEFORE the whitespace collapse so it cannot leave a double space behind.
    text = text.replace('â', '')
    text = re.sub(r'\s+', ' ', text)  # Collapse spaces, tabs and newlines (raw string: no invalid escape)
    return text.strip()
 
# Run the cleaner on the sample string defined above and show the result
text=preprocess(text)
print(text)  #text is a string

In [None]:
# Clean the span and document columns of the test set with preprocess()
df_test['Text'] = df_test['Text'].apply(lambda x: preprocess(x))
df_test['TextContent'] = df_test['TextContent'].apply(lambda x: preprocess(x))


In [None]:
# skip if not doing context approach - or if v2 of test set file
# Rows whose span is not found in the document get None (see extract_context_around_span).

df_test['ContextAroundSpan'] = df_test.apply(lambda row: extract_context_around_span(row['TextContent'], row['Text']), axis=1)

In [None]:
# Rows where the span was not found hold None. Replace them by an empty string before
# the string cast: a plain astype(str) would turn them into the text 'None', which the
# model would then be asked to label as if it were real document text.
df_test['ContextAroundSpan'] = df_test['ContextAroundSpan'].fillna('').astype(str)

In [None]:
# check the test dataframe - the 'ContextAroundSpan' column should now be present
df_test.head(2)

In [None]:
#df_test.to_csv('test_error.csv')

In [None]:
#df_test['ContextAroundSpan'].isnull().sum()

In [None]:
#df_test = df_test.dropna(subset=['ContextAroundSpan'], inplace=True)

In [None]:
# Span offsets for the test rows, using the find_indices function defined in the training section
df_test[['start_index', 'end_index']] = df_test.apply(find_indices, axis=1, result_type='expand')
df_test.head(2)

In [None]:
# check the find_index function worked - all indexes don't work when test_df is split from spacy sentences file at the beginning
# The hand check searches the same column that find_indices searches ('ContextAroundSpan'),
# so the result is directly comparable with df_test['start_index'][13].

start_index = df_test['ContextAroundSpan'][13].find(df_test['Text'][13])
#start_index = df['ContextAroundSpan'][1].find(df['NewText'][1])
start_index

In [None]:
# end offset = start offset + length of the span text
end_index = start_index + len(df_test['Text'][13])
#end_index = start_index + len(df['NewText'][1])
end_index

In [None]:
def get_spancat_predictions(document):
    """Run the loaded pipeline on one text and collect its predicted spans.

    Args:
        document: The text to analyse with the module-level `nlp` pipeline.

    Returns:
        A list of (span text, span label) tuples read from doc.spans['sc'].
    """
    predictions = []
    for candidate in nlp(document).spans['sc']:
        predictions.append((candidate.text, candidate.label_))
    return predictions


In [None]:
# Apply the function to the 'TextContent' or 'ContextAroundSpan' or 'extracted_sentence' column of the DataFrame and store the predictions in a new column
# Each cell of the new column holds a list of (span text, label) tuples.
df_test['SpanCat_Predictions'] = df_test['ContextAroundSpan'].apply(get_spancat_predictions)

In [None]:
# check the first two rows, including the new 'SpanCat_Predictions' column
df_test.head(2)

In [None]:
# write the test set with its predictions to a CSV named after the `model` identifier
df_test.to_csv('test_data_w_'+model+'.csv')

Or run a quick test on a few examples

In [None]:
# First five context windows of the test set - tried both TextContent (documents) and NewText (spans)
new_sentences = list(df_test['ContextAroundSpan'][0:5])

# Run the trained model on each of them and show what it predicts
for sentence in new_sentences:
    doc = nlp(sentence)

    # SpanCat predictions are stored under the 'sc' key of doc.spans
    spancat_predictions = []
    for predicted in doc.spans['sc']:
        spancat_predictions.append((predicted.text, predicted.label_))

    print(f"Sentence: '{sentence}'")
    print("SpanCat Predictions:", spancat_predictions)
    print()