In [42]:
import os
from os import path

datafolder = "../data_0/"
assert path.isdir(datafolder)

data1folder = '../data_1'
assert path.isdir(data1folder)

html1_folder = path.join(data1folder, "html")
assert path.isdir(html1_folder)

html_folder = path.join(datafolder, "html")
assert path.isdir(html_folder)

In [54]:
fileids_isins = []
ISIN_REGEXP = r'[a-zA-Z]{2}[0-9]{10}'
FILEID_REGEXP = r'[0-9a-zA-Z]{16}'

for file_name in os.listdir(html1_folder):
    if file_name.endswith(".html"):
        
        #get the file id
        file_ids = re.findall(FILEID_REGEXP, file_name)
        if len(file_ids) >= 1:
            file_id = file_ids[0]
        
            #get the ISIN
            isins = re.findall(ISIN_REGEXP, file_name)

            if len(isins) > 1 :
                raise ValueError('Multpiple isin in file name %s' % file_name)

            elif len(isins) == 1:
                fileids_isins.append([file_id, isins[0].upper()])

            else:
                with open(html1_folder + '/' + file_name, 'r', encoding='utf8') as html_file:
                    full_text = html_file.read()
                    isins = re.findall(ISIN_REGEXP, full_text)
                    if len(isins) > 1:
                        fileids_isins.append([file_id, isins[0].upper()])
                    elif len(isins) == 1:
                        fileids_isins.append([file_id, isins[0].upper()])
                    else:
                        print('Can\'t find isin for file %s' % file_name)
                
print('total files in folder: %d' % len(os.listdir(html1_folder)))            
print('total files with isin found: %d' % len(fileids_isins))

print(fileids_isins)

Can't find isin for file 0900045c839b32a0_205683_Termsheet_AP_GLOBAL__Airbag_AAPL_UA_3_SMBCN_JAPAN_AAPL_ocr.html
Can't find isin for file 0900045c83b04046_206661_Termsheet_AP_GLOBAL__VMN_SX5E_NKY_28Dec18c_NKY__2_15.1_ocr.html
total files in folder: 277
total files with isin found: 275
[['0900045c8112c744', 'XS0614670254'], ['0900045c8120819c', 'XS0806150867'], ['0900045c81283c6c', 'XS0843588517'], ['0900045c826d7db3', 'XS1096307050'], ['0900045c8270a007', 'XS1098409714'], ['0900045c827de7ac', 'XS1107430925'], ['0900045c82be85ff', 'XS1149445212'], ['0900045c82cb4d13', 'XS1169810923'], ['0900045c82cd0a3b', 'XS1173870111'], ['0900045c82cd0ca7', 'XS1173869295'], ['0900045c82cd14a2', 'XS1153462293'], ['0900045c82cd9a30', 'XS1144665905'], ['0900045c82d6cdf3', 'XS1188093865'], ['0900045c82d9c9d4', 'XS1194001803'], ['0900045c82dadcb8', 'XS1188118282'], ['0900045c82dbc0cf', 'XS1161871089'], ['0900045c82de7dea', 'XS1061485386'], ['0900045c82e98f63', 'XS1194986516'], ['0900045c82eb8206', 'XS11825

In [2]:
import pandas as pd

labels = pd.read_csv( datafolder + "labels.csv", header=0, sep=",", quoting=1, thousands=",")
cities = pd.read_csv( datafolder + "cities.csv", header=0, sep=",", quoting=1)

In [3]:
import re
import nltk
from nltk.corpus import stopwords
import pickle
from bs4 import BeautifulSoup

def clean_html(raw_html):
    """
    This method with regexp worked better for these html files than libraries like html2text or BeautifulSoup
    """
    #we only look inside the body of the html, the head contains style code
    body = re.findall('<body[^>]*?>(.*?)</body>', raw_html)
    
    #there are some problems with split words when replacing the span with space
    #so we remove the span tag
    cleantext = re.sub('<\/?span[^>]*>', '', body[0])
    
    #remove all the tags
    cleantext = re.sub('<.*?>', ' ', cleantext)
    
    #remove shlases with a space after them
    cleantext = re.sub('/ ', ' ', cleantext)

    #remove html quotes, ampersands, white spaces
    cleantext = re.sub('&amp;|&quot;|&nbsp;', ' ', cleantext)
    
    #clean minus signs with spaces after (I want to keep composed words in the text if there are any)
    cleantext = re.sub(' – | - ', ' ', cleantext)
    
    #remove carriage returns
    cleantext = cleantext.replace('\n',' ')
    
    #remove special characters
    cleantext = re.sub('[–\#\'\"\(\)*&%!\+=;:\]\[@\“\”-]', ' ', cleantext)
    
    return cleantext

def document_to_words(doc_path):
    with open(doc_path, 'r', encoding='utf8') as html_file:
        full_text = html_file.read()
        body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
        #clean_text = re.sub('<\/?span[^>]*>', '', body[0])
        #clean_text = BeautifulSoup(body[0], 'html.parser').get_text()
        clean_text = re.sub('<.*?>', ' ', body[0])
        clean_text = re.sub('\s+', ' ', clean_text)
        clean_text = clean_text.lower()
        clean_text = re.sub('won\'t', 'will not', clean_text)
        clean_text = re.sub('can\'t', 'cannot', clean_text)
        clean_text = re.sub('[^a-z]', ' ', clean_text)
        words = nltk.word_tokenize(clean_text)
        #stops = set(stopwords.words("english"))
        #relevant_words = [w for w in words if w not in stops and w != '' and w != ' ']
        relevant_words = words
        return relevant_words

def get_isin_for_file(labels, file_name, print_result = False):
    #function to get the isin from the labels, given a file name
    file_id = file_name.split('_')[0]
    idx = labels['fileId'] == file_id
    isin = labels['isin'][idx].values[0]
    
    if(print_result):
        print("%s - %s" % (file_name, isin))
    return isin

def get_roc_for_isin(cities, isin):
    #function to get the roc from the cities, given an isin
    idx = cities['isin'] == isin
    return cities['City.Name'][idx].values

def get_labels_for_isin(labels, isin):
    idx = labels['isin'] == isin
    return labels[:][idx].values

def get_label_value_for_isin(labels, isin, attr_name):
    #function to return the value of the label for a given isin
    idx = labels['isin'] == isin
    return labels[attr_name][idx].values[0]

def group_content_by_isin(html_folder, labels, pickle_results=False):
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(labels, file_name)

            if(isin):

                #found the isin to associate the document with
                file_content = document_to_words(path.join(html_folder, file_name))

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
    print('\nFinished grouping file contents indexed by ISIN')
    
    if pickle_results:
        pickle_file = 'contents_by_isin.pickle'
        pickle.dump(contents_by_isin, open(pickle_file, 'wb'))
        
        print('Saved file contents indexed by ISIN to:', pickle_file)
        
    return contents_by_isin

In [4]:
#check how the tokenized sentences look like
sentences = document_to_words("../data_0/html/0900045c81137ab3_ixs0774414683_f_pc_n_ocr.html")
print(sentences)

['final', 'terms', 'dated', 'july', 'bnp', 'paribas', 'arbitrage', 'issuance', 'b', 'v', 'incorporated', 'in', 'the', 'netherlands', 'as', 'issuer', 'bnp', 'paribas', 'incorporated', 'in', 'france', 'as', 'guarantor', 'warrant', 'and', 'certificate', 'programme', 'usd', 'quot', 'call', 'spread', 'quot', 'certificates', 'relating', 'to', 'gold', 'due', 'july', 'isin', 'code', 'xs', 'bnp', 'paribas', 'arbitrage', 's', 'n', 'c', 'as', 'manager', 'the', 'base', 'prospectus', 'referred', 'to', 'below', 'as', 'completed', 'by', 'these', 'final', 'terms', 'has', 'been', 'prepared', 'on', 'the', 'basis', 'that', 'any', 'offer', 'of', 'securities', 'in', 'any', 'member', 'state', 'of', 'the', 'european', 'economic', 'area', 'which', 'has', 'implemented', 'the', 'prospectus', 'directive', 'each', 'a', 'quot', 'relevant', 'member', 'state', 'quot', 'will', 'be', 'made', 'pursuant', 'to', 'an', 'exemption', 'under', 'the', 'prospectus', 'directive', 'as', 'implemented', 'in', 'that', 'relevant', '

In [5]:
group_content_by_isin(html_folder, labels, pickle_results=True)
print("OK!")

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [4]:
#check that we can read the data
data = pickle.load(open('contents_by_isin.pickle', 'rb'))
first_pair = {k : data[k] for k in list(data.keys())[:1]}

print(first_pair)

{'XS1051351143': ['offering', 'circular', 'deutsche', 'bank', 'ag', 'london', 'up', 'to', 'jpy', 'notes', 'relating', 'to', 'next', 'funds', 'nikkei', 'leveraged', 'index', 'exchange', 'traded', 'fund', 'due', 'june', 'issued', 'under', 'its', 'programme', 'issue', 'price', 'of', 'the', 'nominal', 'amount', 'isin', 'xs', 'deutsche', 'bank', 'ag', 'london', 'the', 'issuer', 'is', 'the', 'london', 'branch', 'of', 'deutsche', 'bank', 'ag', 'which', 'is', 'incorporated', 'under', 'the', 'laws', 'of', 'germany', 'deutsche', 'bank', 'ag', 'london', 'is', 'registered', 'as', 'a', 'foreign', 'company', 'in', 'england', 'and', 'wales', 'under', 'its', 'x', 'markets', 'programme', 'the', 'programme', 'the', 'issuer', 'may', 'issue', 'certain', 'securities', 'relating', 'to', 'shares', 'and', 'or', 'indices', 'and', 'or', 'debt', 'securities', 'and', 'or', 'fund', 'units', 'and', 'or', 'commodities', 'and', 'or', 'currencies', 'the', 'issuer', 'has', 'determined', 'to', 'issue', 'up', 'to', 'jpy'

In [5]:
def contains(small, big):
    for i in range(len(big) - len(small) + 1):
        for j in range(len(small)):
            if big[i + j] != small[j]:
                break
        else:
            return i, i + len(small)
    return False

def compute_zero_coupon_flags(data, isin=None, debug=True):
    #how many words to keep around the found keyword to the left and right
    look_around = 4
    
    keywords_context = ['interest', 'coupon']
    positive_interest_context = [['interest', 'applicable'], \
                                 #['interest', 'bearing'], \
                                 ['bears', 'interest'], \
                                 ['interest', 'payment', 'date']]
    zero_interest_context = [['non', 'interest', 'bearing'], \
                                 ['zero', 'coupon', 'note', 'provisions', 'not', 'applicable'], \
                                 #['interest', 'non', 'applicable'],\
                                 ['zero', 'coupon', 'applicable']]

    #here we store the calculated zero coupon flags for each isin
    isin_zero_coupon_flags = {}

    for isin_v, content in data.items():
        zero_coupon_flag = 'N'
        
        pos_occ = 0
        neg_occ = 0
        
        if (isin and isin == isin_v) or not isin:
            
            for i in range(len(content)):
                word = content[i]
            
                if word in keywords_context:
        
                    #retain look_around words aroun the keyword
                    context = content[i - look_around : i + look_around]
                   
                    if context:
                        
                        for pos_seq in positive_interest_context:
                            if contains(pos_seq, context):
                                pos_occ += 1
                                if debug:
                                    print('N: %s' % context)
                                
                        for neg_seq in zero_interest_context:
                            
                            if contains(neg_seq, context):
                                neg_occ += 3
                                if debug:
                                    print('Y: %s' % context)
            
            if neg_occ > pos_occ:
                zero_coupon_flag = 'Y'
                
            if debug:
                print('%s | %s | neg_occ = %d | pos_occ = %d' % (isin_v, zero_coupon_flag, neg_occ, pos_occ))    
            
            isin_zero_coupon_flags[isin_v] = zero_coupon_flag    
        
    zc_flags = [[k,v] for k, v in isin_zero_coupon_flags.items()]

    return zc_flags

In [8]:
compute_zero_coupon_flags(data, 'XS1342870125')

N: ['interest', 'period', 's', 'specified', 'interest', 'payment', 'date', 's']
XS1342870125 | N | neg_occ = 0 | pos_occ = 1


[['XS1342870125', 'N']]

In [6]:
data = pickle.load(open('contents_by_isin.pickle', 'rb'))
zc_flags = compute_zero_coupon_flags(data, debug=False)
zc_calculated_df = pd.DataFrame(zc_flags, columns = ['isin','ZERO.COUPN.FLAG'])
zc_calculated_df.sort_values('isin', axis=0, inplace=True)
zc_calculated_df.describe()

Unnamed: 0,isin,ZERO.COUPN.FLAG
count,1200,1200
unique,1200,2
top,XS1326562078,N
freq,1,1133


In [7]:
zc_labeled_df = pd.DataFrame(labels[['isin','ZERO.COUPN.FLAG']])
zc_labeled_df.set_index(['isin'])
zc_labeled_df = zc_labeled_df.drop_duplicates()
zc_labeled_df.sort_values('isin', axis=0, inplace=True)

zc_labeled_df.describe()

Unnamed: 0,isin,ZERO.COUPN.FLAG
count,1200,1200
unique,1200,2
top,XS1326562078,N
freq,1,880


In [11]:
import numpy as np

#convert the dataframes to numpy because I get an error when comparing them directly
zc_calculated = zc_calculated_df.as_matrix()
zc_labeled = zc_labeled_df.as_matrix()

errors = np.sum(zc_labeled[:, 1] != zc_calculated[:, 1])
print('Total errors: ', errors)

all_isins = zc_labeled.shape[0]
print('Accuracy: %.2f %%' % (((all_isins - errors) / all_isins)* 100))

Total errors:  273
Accuracy: 77.25 %


In [12]:
#view the isins which were not correctly calculated
isins_in_error = np.where(zc_labeled[:, 1] != zc_calculated[:, 1]) 
print(np.append(zc_calculated[isins_in_error], zc_labeled[isins_in_error], axis = 1))

[['XS0774414683' 'N' 'XS0774414683' 'Y']
 ['XS0877216399' 'N' 'XS0877216399' 'Y']
 ['XS1051329560' 'N' 'XS1051329560' 'Y']
 ..., 
 ['XS1340537718' 'N' 'XS1340537718' 'Y']
 ['XS1349021326' 'N' 'XS1349021326' 'Y']
 ['XS1349022563' 'N' 'XS1349022563' 'Y']]


In [13]:
def display_context(data, isin, words):
    look_around = 5

    for isin_v, content in data.items():
        if isin == isin_v:
            for i in range(len(content)):
                word_v = content[i]
                if word_v in words:
                    context = content[i - look_around : i + look_around]
                    if context:
                        print('%s | %s | %s' % (isin_v, word_v, context))

In [14]:
display_context(data, 'XS0774414683', ['coupon', 'interest'])

XS0774414683 | interest | ['are', 'not', 'partly', 'paid', 'certificates', 'interest', 'not', 'applicable', 'fixed', 'rate']
XS0774414683 | interest | ['rate', 'provisions', 'not', 'applicable', 'linked', 'interest', 'certificates', 'not', 'applicable', 'payment']
XS0774414683 | interest | ['s', 'not', 'applicable', 'index', 'linked', 'interest', 'certificates', 'not', 'applicable', 'share']
XS0774414683 | interest | ['certificates', 'not', 'applicable', 'share', 'linked', 'interest', 'certificates', 'not', 'applicable', 'eti']
XS0774414683 | interest | ['certificates', 'not', 'applicable', 'eti', 'linked', 'interest', 'certificates', 'not', 'applicable', 'debt']
XS0774414683 | interest | ['certificates', 'not', 'applicable', 'debt', 'linked', 'interest', 'certificates', 'not', 'applicable', 'commodity']
XS0774414683 | interest | ['certificates', 'not', 'applicable', 'commodity', 'linked', 'interest', 'certificates', 'not', 'applicable', 'inflation']
XS0774414683 | interest | ['not', '

In [15]:
print(data['XS0774414683'])

['final', 'terms', 'dated', 'july', 'bnp', 'paribas', 'arbitrage', 'issuance', 'b', 'v', 'incorporated', 'in', 'the', 'netherlands', 'as', 'issuer', 'bnp', 'paribas', 'incorporated', 'in', 'france', 'as', 'guarantor', 'warrant', 'and', 'certificate', 'programme', 'usd', 'quot', 'call', 'spread', 'quot', 'certificates', 'relating', 'to', 'gold', 'due', 'july', 'isin', 'code', 'xs', 'bnp', 'paribas', 'arbitrage', 's', 'n', 'c', 'as', 'manager', 'the', 'base', 'prospectus', 'referred', 'to', 'below', 'as', 'completed', 'by', 'these', 'final', 'terms', 'has', 'been', 'prepared', 'on', 'the', 'basis', 'that', 'any', 'offer', 'of', 'securities', 'in', 'any', 'member', 'state', 'of', 'the', 'european', 'economic', 'area', 'which', 'has', 'implemented', 'the', 'prospectus', 'directive', 'each', 'a', 'quot', 'relevant', 'member', 'state', 'quot', 'will', 'be', 'made', 'pursuant', 'to', 'an', 'exemption', 'under', 'the', 'prospectus', 'directive', 'as', 'implemented', 'in', 'that', 'relevant', '

In [98]:
def scan_folder(folder):
    dictionary = []
    
    for file_name in os.listdir(folder):
        print('.', end='')
        if file_name.endswith(".html"):
            with open(path.join(folder, file_name), 'r', encoding='utf8') as html_file:
                full_text = html_file.read()
                body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
                clean_text = re.sub('<\/?span[^>]*>', ' ', body[0])
                clean_text = re.sub('<.*?>', ' ', clean_text)
                clean_text = clean_text.lower()
                words = nltk.word_tokenize(clean_text)
                words = [re.sub('[^a-z]', '', w) for w in words]
                stops = set(stopwords.words("english"))
                relevant_words = [w for w in words if w not in stops and w != '']
                dictionary += relevant_words
                
    return dictionary

def compute_words_dictionary():
    print('Computing full dictionary')
    
    dictionary = []
    
    dictionary += scan_folder(html_folder)
    dictionary += scan_folder(html1_folder)
    
    all_words = sorted(set(dictionary))
    
    print('\nDictionary has %d words' % len(all_words))
  
    pickle_file = 'all_words.pickle'
    pickle.dump(all_words, open(pickle_file, 'wb'))
        
    print('Saved dictionary to:', pickle_file)
    

In [99]:
compute_words_dictionary()
print('OK!')

Computing full dictionary
..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [5]:
dictionary = pickle.load(open('all_words.pickle', 'rb'))
print(list(dictionary)[:100])

['aa', 'aaa', 'aab', 'aachen', 'aacr', 'aaencv', 'aaj', 'aapl', 'aaploq', 'aazn', 'ab', 'abban', 'abbl', 'abbott', 'abbreviation', 'abbv', 'abbvie', 'abcmart', 'abe', 'aber', 'abetted', 'abetting', 'abga', 'abgaben', 'abgeb', 'abgendert', 'abgesichertes', 'abgezogen', 'abha', 'abhang', 'abhngig', 'abhngigkeit', 'abhngt', 'abi', 'abilis', 'abilising', 'abilities', 'ability', 'abkommen', 'abl', 'ablauf', 'able', 'ably', 'abn', 'abnahme', 'abo', 'abolish', 'abolished', 'abolishment', 'abolition', 'aboveabove', 'abovementioned', 'abovenoted', 'abovereferenced', 'abp', 'abqewicke', 'abrechnungs', 'abrechnungskurs', 'abrechnungskus', 'abrechnungstermin', 'abroad', 'abrogation', 'abrolutedi', 'abrufbar', 'abs', 'absatz', 'abschlag', 'abschliessend', 'abschlsse', 'abschlu', 'abschluss', 'abschnitt', 'abschnitten', 'abschre', 'abschreibungen', 'absehbarer', 'absehen', 'absence', 'absent', 'absicheru', 'absicherung', 'absicherungs', 'absicherungstransaktionen', 'absicht', 'absichtlich', 'abso', 

In [9]:
import numpy as np
dictionary = np.array(list(pickle.load(open('all_words.pickle', 'rb'))))

def convert_word_to_vector(word, dictionary):
    size = len(dictionary)
    word_vec = np.zeros(size)
    word_vec[np.where(dictionary == word)] = 1
    return word_vec

In [64]:
print(convert_word_to_vector('aa',dictionary))

[ 1.  0.  0. ...,  0.  0.  0.]


In [93]:
np.where(x == 0.7)

(array([], dtype=int64),)

In [104]:
def load_and_sort_data():
    data = pickle.load(open('contents_by_isin.pickle', 'rb'))
    data_by_isin = [[k, v] for k, v in data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)
    
    dictionary = np.array(list(pickle.load(open('all_words.pickle', 'rb'))))
    
    dataset = data_by_isin_df.as_matrix(columns=['content'])
    ds = []
    for row in dataset[:, :]:
        for line in row:
            ds.append(line)

    dataset = np.array(ds)
    
    data_labeled_df = pd.DataFrame(labels[['isin','ZERO.COUPN.FLAG']])
    data_labeled_df.set_index(['isin'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('isin', axis=0, inplace=True)
    labelset = data_labeled_df.as_matrix(columns=['ZERO.COUPN.FLAG'])
    labelset = labelset.flatten()
    
    return dataset, labelset


def compute_zero_coupon_flags_with_ml(data, labels):
    pass

In [105]:
isin_data, label_data = load_and_sort_data()
print(isin_data.shape)
print(label_data.shape)

(1200,)
(1200,)


In [16]:
def document_to_text(doc_path):
    with open(doc_path, 'r', encoding='utf8') as html_file:
        full_text = html_file.read()
        body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
        clean_text = re.sub('<.*?>', ' ', body[0])
        clean_text = re.sub('\s+', ' ', clean_text)
        clean_text = clean_text.lower()
        clean_text = re.sub('won\'t', 'will not', clean_text)
        clean_text = re.sub('can\'t', 'cannot', clean_text)
        clean_text = re.sub('[^a-z]', ' ', clean_text)
        clean_text = re.sub('\s+', ' ', clean_text)
        return clean_text
    
def group_docs_by_isin(html_folder, labels, pickle_results=False):
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(labels, file_name)

            if(isin):

                #found the isin to associate the document with
                file_content = document_to_text(path.join(html_folder, file_name))

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
    print('\nFinished grouping file contents indexed by ISIN')
    
    if pickle_results:
        pickle_file = 'docs_by_isin.pickle'
        pickle.dump(contents_by_isin, open(pickle_file, 'wb'))
        
        print('Saved file contents indexed by ISIN to:', pickle_file)
        
    return contents_by_isin

def load_and_sort_text_data():
    data = pickle.load(open('docs_by_isin.pickle', 'rb'))
    data_by_isin = [[k, v] for k, v in data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
    dataset = data_by_isin_df.as_matrix(columns=['content'])
    
    data_labeled_df = pd.DataFrame(labels[['isin','ZERO.COUPN.FLAG']])
    data_labeled_df.set_index(['isin'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('isin', axis=0, inplace=True)
    labelset = data_labeled_df.as_matrix(columns=['ZERO.COUPN.FLAG'])
    labelset = labelset.flatten()
    
    return dataset, labelset

In [13]:
group_docs_by_isin(html_folder, labels, pickle_results=True)
print('OK!')

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [14]:
#check that we can read the data
data = pickle.load(open('docs_by_isin.pickle', 'rb'))
first_pair = {k : data[k] for k in list(data.keys())[:1]}

print(first_pair)

{'XS1051351143': ' offering circular deutsche bank ag london up to jpy notes relating to next funds nikkei leveraged index exchange traded fund due june issued under its programme issue price of the nominal amount isin xs deutsche bank ag london the issuer is the london branch of deutsche bank ag which is incorporated under the laws of germany deutsche bank ag london is registered as a foreign company in england and wales under its x markets programme the programme the issuer may issue certain securities relating to shares and or indices and or debt securities and or fund units and or commodities and or currencies the issuer has determined to issue up to jpy notes relating to next funds nikkei leveraged index exchange traded fund due june the securities upon the product terms and conditions set out in section i of this document the product conditions and the general terms and conditions set out in section ii of this document the general conditions which together with the product condit

In [17]:
dataset, labelset = load_and_sort_text_data()
print(dataset[0])
print(labelset[0])

N


In [18]:
import sklearn
import numpy as np

trainset_size = int(round(len(dataset) * 0.70))
print(trainset_size)

840


In [39]:
x_train = np.array([''.join(el) for el in dataset[0:trainset_size]])
y_train = np.array([''.join(el) for el in labelset[0:trainset_size]])

x_test = np.array([''.join(el) for el in dataset[trainset_size + 1 : -1]])
y_test = np.array([''.join(el) for el in labelset[trainset_size + 1 : -1]])

In [20]:
print(x_train[0])
print(y_train[0])

N


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (1,2), strip_accents = 'unicode', stop_words = 'english')

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(x_train, y_train)
y_nb_predicted = clf.predict(x_test)

cm = confusion_matrix(y_test, y_nb_predicted, labels=['N', 'Y'])
print(cm)

print(classification_report(y_test, y_nb_predicted, target_names=['N', 'Y']))

[[291   0]
 [ 65   2]]
             precision    recall  f1-score   support

          N       0.82      1.00      0.90       291
          Y       1.00      0.03      0.06        67

avg / total       0.85      0.82      0.74       358

