In [1]:
import pandas as pd

In [2]:
import os
from os import path
datafolder = "../data_0/"
assert path.isdir(datafolder)
html_folder = path.join(datafolder, "html")
assert path.isdir(html_folder)
pdf_folder = path.join(datafolder, "ocr")
assert path.isdir(pdf_folder)

In [57]:
labels = pd.read_csv( datafolder + "labels.csv", header=0, sep=",", quoting=1, thousands=",")
cities = pd.read_csv( datafolder + "cities.csv", header=0, sep=",", quoting=1)

#later edit: actually, we need the duplicates when we get the ISIN from the file name
#remove duplicate isins; the other column values should be identical for the same ISIN, so it's just duplicate data
#labels = labels.drop_duplicates('isin')

In [24]:
labels.head()

Unnamed: 0,fileId,isin,NOMINAL.CURR,MIN.TRAD.AMT,MULT.TRAD.AMT,ZERO.COUPN.FLAG,SEC.SUB.ID,FUNG.FL,OpCurrency,issuerName,issuerCity,issuerCountry,guarantorName,guarantorCity,guarantorCountry
0,0900045c83095a2a,XS1166358447,USD,0,10000,N,USUB,N,USD,GOLDMAN SACHS INTERNATIONAL,LONDON,UNITED KINGDOM,GOLDMAN SACHS (ASIA) LLC,NEW YORK,UNITED STATES
1,0900045c838362f4,XS1281367034,JPY,0,100000000,N,USUB,N,JPY,UBS AG,LONDON,SWITZERLAND,,,
2,0900045c83a280ec,XS1051339288,USD,0,100000,N,USUB,N,USD,"DEUTSCHE BANK AG, GREAT WINCHE",LONDON,GERMANY,,,
5,0900045c83a3755e,XS1051339106,JPY,0,25000000,N,USUB,N,JPY,"DEUTSCHE BANK AG, GREAT WINCHE",LONDON,GERMANY,,,
8,0900045c83a37568,XS1051337589,JPY,0,500000,N,USUB,N,JPY,"DEUTSCHE BANK AG, GREAT WINCHE",LONDON,GERMANY,,,


In [25]:
labels.describe()

Unnamed: 0,MIN.TRAD.AMT,MULT.TRAD.AMT
count,1200.0,1200.0
mean,4523.333333,10230850.0
std,45974.404043,28176580.0
min,0.0,1.0
25%,0.0,1000.0
50%,0.0,200000.0
75%,0.0,10000000.0
max,1000000.0,500300000.0


In [6]:
cities.head()

Unnamed: 0,fileId,isin,City.Id,City.Name
0,0900045c83095a2a,XS1166358447,LONDO,LONDON
1,0900045c83095a2a,XS1166358447,NEW Y,NEW YORK
2,0900045c838362f4,XS1281367034,LONDO,LONDON
3,0900045c838362f4,XS1281367034,TOKYO,TOKYO
4,0900045c83a280ec,XS1051339288,LONDO,LONDON


In [118]:
# Import various modules for string cleaning
import re
import nltk
from nltk.corpus import stopwords
import pickle

def clean_html(raw_html):
    """
    This method with regexp worked better for these html files than libraries like html2text or BeautifulSoup
    """
    #we only look inside the body of the html, the head contains style code
    body = re.findall(r'<body[^>]*?>(.*?)</body>', raw_html)
    
    #there are some problems with split words when replacing the span with space
    #so we remove the span tag
    cleantext = re.sub('<\/?span[^>]*>', '', body[0])
    
    #remove all the tags
    cleantext = re.sub('<.*?>', ' ', cleantext)
    
    #remove dots with a space after them
    cleantext = re.sub('\. ', ' ', cleantext)
    
    #remove commas with a space after them
    cleantext = re.sub('\, ', ' ', cleantext)
    
    #remove shlases with a space after them
    cleantext = re.sub('/ ', ' ', cleantext)

    #remove html quotes, ampersands, white spaces
    cleantext = re.sub('&amp;|&quot;|&nbsp;', ' ', cleantext)
    
    #clean minus signs with spaces after (I want to keep composed words in the text if there are any)
    cleantext = re.sub(' – | - ', ' ', cleantext)
    
    #remove carriage returns
    cleantext = cleantext.replace('\n',' ')
    
    #remove special characters
    cleantext = re.sub('[\#\'\"\(\)*&%!\+=;:\]\[@\“\”]', ' ', cleantext)
    
    return cleantext 

def document_to_wordlist( docName, remove_stopwords=False ):
    with open(path.join(html_folder, docName), 'r') as html_file:
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        
        raw_text = html_file.read()
        
        doc_text = clean_html(raw_text)
        
        # Convert words to lower case and split them
        words = nltk.word_tokenize(doc_text.lower())
        
        #do we actually need part of speech tagging?
        #nltk.pos_tag(words)
        
        # Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]

        # Return a list of words
        return(words)

def get_isin_for_file(labels, file_name, printResult = False):
    #function to get the isin from the labels, given a file name
    file_id = file_name.split('_')[0]
    idx = labels['fileId'] == file_id
    isin = labels['isin'][idx].values[0]
    
    if(printResult):
        print("%s - %s" % (file_name, isin))
    return isin

def get_roc_for_isin(cities, isin):
    #function to get the roc from the cities, given an isin
    idx = cities['isin'] == isin
    return cities['City.Name'][idx].values

def get_label_value_for_isin(labels, isin, attr_name):
    #function to return the value of the label for a given isin
    idx = labels['isin'] == isin
    return labels[attr_name][idx].values[0]

def save_content_grouped_by_isin(html_folder, labels):
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(labels, file_name)

            if(isin):

                #found the isin to associate the document with
                file_content = document_to_wordlist(file_name)

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
    pickle_file = 'contents_by_isin.pickle'
    
    print('\nSaved file contents indexed by ISIN to:', pickle_file)
    
    pickle.dump(contents_by_isin, open(pickle_file, 'wb'))   

In [119]:
words = document_to_wordlist("0900045c80c920a2_ixs0628152059_f_pc_n_ocr.html")
print(words)

['these', 'securities', 'do', 'not', 'constitute', 'collective', 'investment', 'schemes', 'in', 'the', 'meaning', 'of', 'the', 'swiss', 'federal', 'act', 'on', 'collective', 'investment', 'schemes', 'cisa', 'accordingly', 'holders', 'of', 'the', 'securities', 'do', 'not', 'benefit', 'from', 'protection', 'under', 'cisa', 'or', 'supervision', 'by', 'the', 'swiss', 'financial', 'market', 'supervisory', 'authority', 'finma', 'final', 'terms', 'barclays', 'bank', 'plc', 'incorporated', 'with', 'limited', 'liability', 'in', 'england', 'and', 'wales', 'barclays', 'capital', 'cayman', 'limited', 'incorporated', 'with', 'limited', 'liability', 'in', 'the', 'cayman', 'islands', 'global', 'structured', 'securities', 'programme', 'for', 'the', 'issue', 'of', 'securities', 'barclays', 'bank', 'plc', 'eur', '20,000,000', 'fixed', 'rate', 'notes', 'due', 'july', '2017', 'the', 'notes', 'series', 'nx00053246', 'under', 'the', 'global', 'structured', 'securities', 'programme', 'issue', 'price', '100',

In [121]:
save_content_grouped_by_isin(html_folder, labels)

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [122]:
#check that we can read the data
data = pickle.load(open('contents_by_isin.pickle', 'rb'))
first_pairs = {k : data[k] for k in list(data.keys())[:1]}
print(first_pairs)

{'XS1169811491': ['pricing', 'supplement', 'dated', '27', 'january', '2015', 'credit', 'suisse', 'international', '4-year', '2-month', 'usd', 'notes', 'due', 'march', '2019', 'linked', 'to', 'the', 'credit', 'suisse', 'spear', 'dynamic', 'asia', 'index', 'total', 'return', 'usd', 'series', 'x', 'the', 'securities', 'issued', 'pursuant', 'to', 'the', 'structured', 'products', 'programme', 'for', 'the', 'issuance', 'of', 'notes', 'certificates', 'and', 'warrants', 'part', 'a', 'contractual', 'terms', 'terms', 'used', 'herein', 'shall', 'be', 'deemed', 'to', 'be', 'defined', 'as', 'such', 'for', 'the', 'purposes', 'of', 'the', 'programme', 'memorandum', 'dated', '10', 'july', '2013', 'this', 'document', 'constitutes', 'the', 'pricing', 'supplement', 'of', 'the', 'securities', 'described', 'herein', 'copies', 'of', 'the', 'programme', 'memorandum', 'may', 'be', 'obtained', 'from', 'the', 'registered', 'office', 'of', 'the', 'issuer', 'and', 'the', 'offices', 'of', 'the', 'agents', 'specifi