In [40]:
import os
from os import path

datafolder = "../train/"
assert path.isdir(datafolder)

data1folder = '../final_test'
assert path.isdir(data1folder)

html1_folder = path.join(data1folder, "html")
assert path.isdir(html1_folder)

html_folder = path.join(datafolder, "html")
assert path.isdir(html_folder)

In [41]:
import pandas as pd

labels = pd.read_csv( datafolder + "outcome/ISIN_train.csv", header=0, sep=",", quoting=1, thousands=",")
files_isins = pd.read_csv( datafolder + "docID/docid_train.csv", header=0, sep=",", quoting=1, thousands=",")
labels.head()

Unnamed: 0,ISIN,ISSUER.NAME,ZCP.FL,MIN.TRAD.AMT,MLT.TRAD.AMT,OPS.CURR
0,ARCBAS031621,CIUDAD DE BUENOS AIRES,N,0,1000,ARS
1,AT0000248448,UNICREDIT BANK AUSTRIA AG,N,0,100000,EUR
2,AT0000A0MPB1,BAWAG PSK BANK FUR ARBEIT UND WIRTSCHAFT UND O...,N,0,100,EUR
3,AT0000A17HH9,RAIFFEISEN CENTROBANK AG,N,0,1000,USD
4,AT0000A1HE76,RAIFFEISEN CENTROBANK AG,Y,0,1000,CZK


## Utility functions

In [42]:
import re
import pickle
import nltk
from nltk.corpus import stopwords

keywords_contexts = {'interest' : [['payment', 'date'], \
                                   ['applicable'], \
                                   ['bearing'], \
                                   ['bears']
                                  ], \
                     'coupon' : [['zero'], \
                                 ['provisions']
                                ]
                    }

def contains(small_list, big_list):
    for word in small_list:
        if word not in big_list:
            return False
    return True

def keywords_in_context(keywords, context):
    for keywords_list in keywords:
        if contains(keywords_list, context):
            return True
        
    return False

def sort_contexts(contexts):
    c = sorted(contexts, key=len)
    c.reverse()
    return c

# keywords_contexts = sort_contexts(keywords_contexts)

def extract_relevant_info(words, keywords_contexts, window_before, window_after):
    relevant_info = []

    for i in range(len(words)):
        word = words[i]

        if word in keywords_contexts.keys() and i + window_after < len(words):
            
            #retain look_around words around the keyword
            window = words[i - window_before : i + window_after]
            
#             print('%s - %s' % (word, window))
            
#             keywords_groups = keywords_contexts[word]
#             for group in keywords_groups:
#                 keywords_present = keywords_in_context(group, window)
#                 if keywords_present:
#                     relevant_info += window
#                     break
                    
            relevant_info += window
            
    return relevant_info

def get_isin_for_file(labels, file_name, print_result = False):
    isin = None
    #function to get the isin from the labels, given a file name
    file_id = file_name.split('_')[0]
    idx = labels['DOCID'] == file_id
    isins = labels['ISIN'][idx]
    if isins.values: 
        isin = isins.values[0]
    
    if(print_result):
        print("%s - %s" % (file_name, isin))
    return isin

def get_labels_for_isin(labels, isin):
    idx = labels['isin'] == isin
    return labels[:][idx].values

def document_to_text(doc_path):
    with open(doc_path, 'r', encoding='utf8') as html_file:
        full_text = html_file.read()
        body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
        cleantext = re.sub('<\/?span[^>]*>', '', body[0])
        clean_text = re.sub('<.*?>', ' ', cleantext)
        clean_text = clean_text.lower()
        clean_text = re.sub('won\'t', 'will not', clean_text)
        clean_text = re.sub('can\'t', 'cannot', clean_text)
        clean_text = re.sub('[^a-z]', ' ', clean_text)
        clean_text = re.sub('\s+', ' ', clean_text)
        words = nltk.word_tokenize(clean_text)
        stops = set(stopwords.words('english'))
        relevant_words = [w for w in words if w not in stops and w != '' and w != ' ']
        return relevant_words
        
        return clean_text
    
def group_docs_by_isin(html_folder, files_isins, pickle_results=False, pickle_file='docs_by_isin.pickle'):
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(files_isins, file_name)
            
            if(isin is not None):

                #found the isin to associate the document with
                file_content = document_to_text(path.join(html_folder, file_name))
                window_before = 5
                window_after = 5
                
                file_content = extract_relevant_info(file_content, keywords_contexts, window_before, window_after)
#                 print(file_content)
                file_content = ' '.join(file_content)
#                 print(file_content)

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
#         break
    print('\nFinished grouping file contents indexed by ISIN')
    
    if pickle_results:
        pickle.dump(contents_by_isin, open(pickle_file, 'wb'))
        
        print('Saved file contents indexed by ISIN to:', pickle_file)
        
    return contents_by_isin

def load_and_sort_training_data(file_name):
    data = pickle.load(open(file_name, 'rb'))
    data_by_isin = [[k, v] for k, v in data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
    dataset = data_by_isin_df.as_matrix(columns=['content'])[:,0]

    labels = pd.read_csv( datafolder + 'outcome/ISIN_train.csv', header=0, sep=',', quoting=1, thousands=',')
    data_labeled_df = pd.DataFrame(labels[['ISIN','ZCP.FL']])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)
    labelset = data_labeled_df.as_matrix(columns=['ZCP.FL'])
    labelset = labelset.flatten()
    
    return dataset, labelset

def find_isins_in_folder(html_folder):
    print('Extracting list of ISIN and file ids from folder %s' % html_folder)
    
    fileids_isins = []
    ISIN_REGEXP = r'[a-zA-Z]{2}[0-9]{10}'
    FILEID_REGEXP = r'[0-9a-zA-Z]{16}'

    for file_name in os.listdir(html_folder):
        if file_name.endswith(".html"):

            #get the file id
            file_ids = re.findall(FILEID_REGEXP, file_name)
            if len(file_ids) >= 1:
                file_id = file_ids[0]

                #get the ISIN
                isins = re.findall(ISIN_REGEXP, file_name)

                if len(isins) > 1 :
                    raise ValueError('Multpiple isin in file name %s' % file_name)

                elif len(isins) == 1:
                    fileids_isins.append([file_id, isins[0].upper()])

                else:
                    with open(html1_folder + '/' + file_name, 'r', encoding='utf8') as html_file:
                        full_text = html_file.read()
                        isins = re.findall(ISIN_REGEXP, full_text)
                        if len(isins) > 1:
                            fileids_isins.append([file_id, isins[0].upper()])
                        elif len(isins) == 1:
                            fileids_isins.append([file_id, isins[0].upper()])
                        else:
                            print('Can\'t find isin for file %s' % file_name)
        
    print('Total files in folder: %d' % len(os.listdir(html_folder)))            
    print('Total files with isin found: %d' % len(fileids_isins))

    return pd.DataFrame(fileids_isins, columns = ['fileId','isin'])

def load_isins_files(doc_isin_path):
    all_labels_df = pd.read_csv(doc_isin_path, header=0, sep=",", quoting=1, thousands=",")
    all_labels_df.set_index(['ISIN'])
    all_labels_df.sort_values('ISIN', axis=0, inplace=True)
    all_labels = pd.DataFrame(all_labels_df[['ISIN', 'DOCID']])
    return all_labels

## Read the text from files, clean the text, group the file contents by ISIN, save the results

In [23]:
group_docs_by_isin(html_folder, files_isins, pickle_results=True, pickle_file='zero_coupon_docs_by_isin.pickle')
print('OK!')

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Load the cleaned data and the labels and align them by ISIN

In [24]:
import pickle
import pandas as pd

dataset, labelset = load_and_sort_training_data('zero_coupon_docs_by_isin.pickle')

In [25]:
print(dataset.shape)

(5456,)


In [7]:
t = dataset[2][0]
extract_relevant_info(t, keywords_contexts, 5, 5)

[]

## Split the dataset into train and validation

In [26]:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

## Find the best hyperparameters for a pipeline that classifies the text

In [None]:
from pprint import pprint
from time import time
import logging
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import cross_validation

dataset, labelset = load_and_sort_training_data('zero_coupon_docs_by_isin.pickle')

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

pipeline = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier())])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.0001, 0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Train the pipeline with the hyperparameters that were found

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier

from sklearn import svm

text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
#                      ('clf', svm.SVC())
                     ('clf', SGDClassifier())
#                      ('clf', RandomForestClassifier())
                    ])


text_clf = text_clf.set_params(\
                               clf__alpha = 1e-05, \
                               clf__n_iter = 50, \
                               clf__penalty = 'l2', \
                               clf__n_jobs = '-1', \
#                                clf__decision_function_shape = 'ovo', \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 1.0, \
                               vect__ngram_range = (1, 2), \
                               vect__stop_words = 'english')

"""
text_clf = text_clf.set_params(clf__criterion = 'gini', \
                               clf__max_features = 'auto', \
                               clf__n_estimators = 50, \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = False, \
                               vect__max_df = 1, \
                               vect__max_features = None, \
                               vect__ngram_range = (1, 2)
                              )
"""
text_clf = text_clf.fit(X_train.ravel(), y_train.ravel())

y_train_pred = text_clf.predict(X_train.ravel())

In [28]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

print(np.mean(y_train == y_train_pred))

cm = confusion_matrix(y_train, y_train_pred, labels=['N', 'Y'])
print(cm)

print(classification_report(y_train, y_train_pred, target_names=['N', 'Y']))

0.983241686305
[[2939    1]
 [  63  816]]
             precision    recall  f1-score   support

          N       0.98      1.00      0.99      2940
          Y       1.00      0.93      0.96       879

avg / total       0.98      0.98      0.98      3819



## Predict the values on the test data

In [29]:
y_test_pred = text_clf.predict(X_test.ravel())

## Analyze the prediction results

In [30]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(np.mean(y_test_pred == y_test))

cm = confusion_matrix(y_test, y_test_pred, labels=['N', 'Y'])
print(cm)

print(classification_report(y_test, y_test_pred, target_names=['N', 'Y']))

0.973121563836
[[1244    8]
 [  36  349]]
             precision    recall  f1-score   support

          N       0.97      0.99      0.98      1252
          Y       0.98      0.91      0.94       385

avg / total       0.97      0.97      0.97      1637



In [None]:
def compile_mismatched_data_summary(y_test, y_test_pred, labels_path, doc_isin_path, label_name):
    labels = pd.read_csv(labels_path, header=0, sep=",", quoting=1, thousands=",")
    data_labeled_df = pd.DataFrame(labels[['ISIN', label_name]])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)

    idx = np.where(y_test != y_test_pred)
    mismatched_isins = data_labeled_df.as_matrix()[idx]

    all_labels_df = pd.read_csv(doc_isin_path, header=0, sep=",", quoting=1, thousands=",")
    all_labels_df.set_index(['ISIN'])
    all_labels_df.sort_values('ISIN', axis=0, inplace=True)

    all_labels = all_labels_df.as_matrix(columns=['DOCID', 'ISIN'])

    file_ids = []
    for isin in mismatched_isins[:,0]:
        idxx = all_labels_df['ISIN'] == isin
        file_id = all_labels_df['DOCID'][idxx].values[0]
        file_ids.append(file_id)
        
    summary = np.c_[np.array(file_ids), mismatched_isins[:,0], y_test[idx], y_test_pred[idx]]
    
    summary_df = pd.DataFrame(summary, columns=['fileId', 'isin', 'actual', 'predicted'])
    
    return summary_df

compile_mismatched_data_summary(y_test, y_test_pred, datafolder + 'outcome/ISIN_train.csv', \
                                datafolder + 'docID/docid_train.csv', 'ZCP.FL')

In [None]:
from code.dataExtract import data
import code.dataExtract as de

d = data(folder='../train/')

def create_results_dict(y_test_pred, labels_path, d):
    labels = pd.read_csv(labels_path, header=0, sep=",", quoting=1, thousands=",")
    data_labeled_df = pd.DataFrame(labels[['ISIN']])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)
    isins = data_labeled_df.as_matrix()
    print(isins.shape)
    print(y_test_pred.shape)
    
create_results_dict(y_test_pred, datafolder + 'outcome/ISIN_train.csv', d)

## Train a vectorizer and classifier on all the available data and save them for future use

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

dataset, labelset = load_and_sort_training_data('zero_coupon_docs_by_isin.pickle')

text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier())])

text_clf = text_clf.set_params(clf__alpha = 1e-05, \
                               clf__n_iter = 50, \
                               clf__penalty = 'l2', \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 1.0, \
                               vect__ngram_range = (1, 2), \
                               vect__stop_words = 'english')

text_clf = text_clf.fit(dataset.ravel(), labelset)

joblib.dump(text_clf, 'zero_coupon_flag_pipeline.pkl') 

['zero_coupon_flag_pipeline.pkl',
 'zero_coupon_flag_pipeline.pkl_01.npy',
 'zero_coupon_flag_pipeline.pkl_02.npy',
 'zero_coupon_flag_pipeline.pkl_03.npy',
 'zero_coupon_flag_pipeline.pkl_04.npy',
 'zero_coupon_flag_pipeline.pkl_05.npy',
 'zero_coupon_flag_pipeline.pkl_06.npy']

## Load and process unlabeled data

In [32]:
unlabeled_isins = pd.read_csv('../final_test/docID/docid_final_test.csv', header=0, sep=",", quoting=1, thousands=",")

# unlabeled_isins = load_isins_files('../int_test/docID/docid_int_test.csv')

unlabeled_isins.head()

Unnamed: 0,DOCID,ISIN
0,0900045c8059e795,CH0025370906
1,0900045c8092c413,XS0545673914
2,0900045c80c5dd9f,US40430CLG23
3,0900045c80d7461e,US44328MAX20
4,0900045c80dd8470,XS0729081124


In [11]:
from code.dataExtract import data
import code.dataExtract as de
# Read the data
d = data(folder='../int_test/') 
print(d.docid)

{'XS1389956829': OrderedSet(['0900045c83f5976a']), 'XS1317268990': OrderedSet(['0900045c83faf4ff']), 'CH0306892180': OrderedSet(['0900045c83fb87c3']), 'XS1257750049': OrderedSet(['0900045c833fd701']), 'XS1135633466': OrderedSet(['0900045c832a7386']), 'XS1374651245': OrderedSet(['0900045c83dee5dd']), 'XS1289436922': OrderedSet(['0900045c83d87026']), 'XS1072479048': OrderedSet(['0900045c82510a85']), 'XS1381941597': OrderedSet(['0900045c83e90274']), 'XS1362987718': OrderedSet(['0900045c83d469e0']), 'XS1351712127': OrderedSet(['0900045c83c2447b']), 'XS1063913195': OrderedSet(['0900045c8233fe1c']), 'XS1400179906': OrderedSet(['0900045c83fa9734']), 'XS1349974193': OrderedSet(['0900045c83c859e5']), 'XS1279305467': OrderedSet(['0900045c83e7d985']), 'XS1317186598': OrderedSet(['0900045c83e8ce13']), 'XS1374208137': OrderedSet(['0900045c83e0b27b']), 'XS1295011461': OrderedSet(['0900045c83b42a61']), 'CH0308778791': OrderedSet(['0900045c83fe690d']), 'XS1307677457': OrderedSet(['0900045c83df6efd']),

In [33]:
# print(unlabeled_isins)
group_docs_by_isin('../final_test/html', unlabeled_isins, pickle_results=True, pickle_file='unlabeled_docs_by_isin.pickle')
print('OK!')
# group_docs_by_isin(html_folder, files_isins, pickle_results=True, pickle_file='zero_coupon_docs_by_isin.pickle')
# print('OK!')

Processing files
................................................................................................................................................................................................................................................................................................................................................................................................................................................
Finished grouping file contents indexed by ISIN
Saved file contents indexed by ISIN to: unlabeled_docs_by_isin.pickle
OK!


In [34]:
import numpy as np
import pandas as pd
import pickle

unlabeled_data = pickle.load(open('unlabeled_docs_by_isin.pickle', 'rb'))
data_by_isin = [[k, v] for k, v in unlabeled_data.items()]
data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
unlabeled_dataset = data_by_isin_df.as_matrix(columns=['content']).flatten()
isins = data_by_isin_df.as_matrix(columns=['isin']).flatten()

In [35]:
print(unlabeled_dataset.shape)
print(isins.shape)

(424,)
(424,)


In [36]:
#load the classifier
clf = joblib.load('zero_coupon_flag_pipeline.pkl')
zero_coupon_flags = clf.predict(unlabeled_dataset)

In [49]:
print(zero_coupon_flags.shape)
output_array = np.vstack((isins, zero_coupon_flags)).T
# print(output_array)

results_df = pd.DataFrame(output_array, columns=['ISIN', 'ZCP.FL'])
results_df.to_csv('zero_coupon_flags.csv', index=False)

(426,)


In [38]:
import csv
results_df = pd.DataFrame(np.vstack((isins, zero_coupon_flags)).T, columns=['ISIN', 'ZCP.FL'])

results_df.to_csv('final_zcp.csv', index=False, header=False, quoting=csv.QUOTE_ALL)

In [39]:
results_df.describe()

Unnamed: 0,ISIN,ZCP.FL
count,424,424
unique,424,2
top,XS1326148142,N
freq,1,338


In [67]:
import numpy as np
import pandas as pd
import pickle
import csv
from sklearn.externals import joblib

def predict_zero_coupon_flag(html_folder, unlabeled_isins, output_file, classifier = 'zero_coupon_flag_pipeline.pkl'):
    
    unlabeled_data = group_docs_by_isin(html_folder, unlabeled_isins)
    
    print('Processing document contents in correct format')
    data_by_isin = [[k, v] for k, v in unlabeled_data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)

    unlabeled_dataset = data_by_isin_df.as_matrix(columns=['content']).flatten()
    isins = data_by_isin_df.as_matrix(columns=['isin']).flatten()
    
    print('Loading the vectorizer and trained classifier')
    #load the classifier
    text_clf = joblib.load(classifier)
    zero_coupon_flags = text_clf.predict(unlabeled_dataset)
    
    print('Predicting zero coupon flags')
    
    results_df = pd.DataFrame(np.vstack((isins, zero_coupon_flags)).T, columns=['ISIN', 'ZCP.FL'])

    results_df.to_csv(output_file, index=False, header=False, quoting=csv.QUOTE_ALL)
    
    print('Saved predictions to file %s' % output_file)

In [68]:
html1_folder = '../int_test/html'
unlabeled_isins = pd.read_csv('../int_test/docID/docid_int_test.csv', header=0, sep=",", quoting=1, thousands=",")

predict_zero_coupon_flag(html1_folder, unlabeled_isins, 'zcp.csv', 'zero_coupon_flag_pipeline.pkl')

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................
Finished grouping file contents indexed by ISIN
Processing document contents in correct format
Loading the vectorizer and trained classifier
Predicting zero coupon flags
Saved predictions to file zcp.csv
