# SEC Filings

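The data is provided as a zip archive of text files. Unzip the archive and place the extracted 'SEC-filings' directory inside the './data' directory, so that the loader below can find './data/SEC-filings/train/FIN5.txt' and './data/SEC-filings/test/FIN3.txt'.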

A related dataset in a similar format is the CoNLL-2003 dataset, available from [HuggingFace](https://huggingface.co/datasets/conll2003). It could be used as a source of additional training data, but note that it contains a different kind of text (a different 'domain').

In [9]:
import re
import numpy as np

from sklearn.model_selection import train_test_split

import joblib

In [4]:
def read_sec_filings(split):
    """Load tokenised sentences and their NER tags from the SEC filings files.

    Each non-blank line is split on single spaces; the first column is taken
    as the token and the fourth column as the NER tag (the second column
    holds POS tags, which are not returned). Blank or whitespace-only lines
    mark the end of a sentence. The first two lines of the file are skipped
    (presumably a document header and the blank line after it -- TODO confirm
    against the data files).

    Args:
        split: 'train' loads './data/SEC-filings/train/FIN5.txt'. Any other
            value loads './data/SEC-filings/test/FIN3.txt'.

    Returns:
        A tuple ``(sentences, labels)`` of two parallel lists. ``sentences[k]``
        is the list of tokens of the k-th sentence and ``labels[k]`` is the
        list of NER tags for those tokens. Sentences with fewer than two
        tokens are discarded.

    Raises:
        IndexError: if a data line has fewer than four space-separated columns.
    """
    if split == 'train':
        path = './data/SEC-filings/train/FIN5.txt'
    else:
        path = './data/SEC-filings/test/FIN3.txt'

    with open(path) as fp:
        lines = fp.readlines()

    # store the tokens and labels for all sentences
    sentences = []
    labels = []

    # the tokens and labels for the current sentence
    current_sen = []
    current_labels = []

    for line in lines[2:]:
        # A data line has more than one character and is not pure whitespace.
        # Whitespace-only lines are treated as sentence boundaries instead of
        # being parsed (they have no fourth column to read).
        has_data = len(line) > 1 and not line.isspace()

        if has_data:
            data = line.split(' ')  # tokenise the line
            current_sen.append(data[0])  # append the token

            # data[1] contains POS tags -- you can also use these in your model.

            current_labels.append(data[3].strip())  # append the NER tag
            continue

        # End of a sentence: keep it only if it has at least two tokens.
        if len(current_sen) > 1:
            sentences.append(current_sen)
            labels.append(current_labels)

        # Always reset, so that a discarded one-token sentence cannot leak
        # its token and tag into the sentence that follows it.
        current_sen = []
        current_labels = []

    if len(current_sen) > 1:  # save the last sentence
        sentences.append(current_sen)
        labels.append(current_labels)

    # np.concatenate raises on an empty list, so fall back to an empty array
    # when no sentences were loaded.
    if labels:
        unique_labels = np.unique(np.concatenate(labels))
    else:
        unique_labels = np.array([], dtype=str)

    print(f'Number of sentences loaded = {len(sentences)}')
    print(f'Number of unique labels: {unique_labels}')

    return sentences, labels

In [5]:
# Read the training file; the loader prints its own summary of what it found.
print('Loading the original training set: ')
(sentences_ner, labels_ner) = read_sec_filings(split='train')

# Read the held-out test file in the same way.
print('\nLoading the test set: ')
(test_sentences_ner, test_labels_ner) = read_sec_filings(split='test')

Loading the original training set: 
Number of sentences loaded = 1152
Number of unique labels: ['I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']

Loading the test set: 
Number of sentences loaded = 303
Number of unique labels: ['I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


As before, we could create a validation split from the training set:

In [6]:
# Hold out a fraction (test_size=0.2) of the loaded training sentences as a
# validation set, using a fixed random_state so the split is repeatable.
# No `stratify=labels_ner`: there are too few examples of some classes to stratify.
(
    train_sentences_ner,
    val_sentences_ner,
    train_labels_ner,
    val_labels_ner,
) = train_test_split(sentences_ner, labels_ner, test_size=0.2, random_state=73)

# Report the size of each part of the split.
print(f'Number of training sentences = {len(train_sentences_ner)}')
print(f'Number of validation sentences = {len(val_sentences_ner)}')

Number of training sentences = 921
Number of validation sentences = 231


### save files

In [12]:
# Write each split to its own .pkl file under ./data with joblib.dump,
# keeping tokens and tags in separate files.

# Training portion of the train/validation split.
joblib.dump(train_sentences_ner, './data/train_sentences_ner.pkl')
joblib.dump(train_labels_ner, './data/train_labels_ner.pkl')

# Test set as loaded from the test file.
joblib.dump(test_sentences_ner, './data/test_sentences_ner.pkl')
joblib.dump(test_labels_ner, './data/test_labels_ner.pkl')

# Validation portion of the train/validation split.
joblib.dump(val_sentences_ner, './data/val_sentences_ner.pkl')
joblib.dump(val_labels_ner, './data/val_labels_ner.pkl')

['./data/val_labels_ner.pkl']