In [7]:
import os
from os import path

datafolder = "../train/"
assert path.isdir(datafolder)

data1folder = '../final_test'
assert path.isdir(data1folder)

html1_folder = path.join(data1folder, "html")
assert path.isdir(html1_folder)

html_folder = path.join(datafolder, "html")
assert path.isdir(html_folder)

In [2]:
import pandas as pd

labels = pd.read_csv( datafolder + "outcome/ISIN_train.csv", header=0, sep=",", quoting=1, thousands=",")
files_isins = pd.read_csv( datafolder + "docID/docid_train.csv", header=0, sep=",", quoting=1, thousands=",")
labels.head()

Unnamed: 0,ISIN,ISSUER.NAME,ZCP.FL,MIN.TRAD.AMT,MLT.TRAD.AMT,OPS.CURR
0,ARCBAS031621,CIUDAD DE BUENOS AIRES,N,0,1000,ARS
1,AT0000248448,UNICREDIT BANK AUSTRIA AG,N,0,100000,EUR
2,AT0000A0MPB1,BAWAG PSK BANK FUR ARBEIT UND WIRTSCHAFT UND O...,N,0,100,EUR
3,AT0000A17HH9,RAIFFEISEN CENTROBANK AG,N,0,1000,USD
4,AT0000A1HE76,RAIFFEISEN CENTROBANK AG,Y,0,1000,CZK


In [5]:
import re
import pickle
import nltk
from nltk.corpus import stopwords
from time import time

def get_isin_for_file(labels, file_name, print_result = False):
    isin = None
    #function to get the isin from the labels, given a file name
    file_id = file_name.split('_')[0]
    idx = labels['DOCID'] == file_id
    isins = labels['ISIN'][idx]
    if isins.values: 
        isin = isins.values[0]
    
    if(print_result):
        print("%s - %s" % (file_name, isin))
    return isin

def get_labels_for_isin(labels, isin):
    idx = labels['ISIN'] == isin
    return labels[:][idx].values

def get_label_value_for_isin(labels, isin, attr_name):
    #function to return the value of the label for a given isin
    idx = labels['ISIN'] == isin
    return labels[attr_name][idx].values[0]

def save_file(isin, text):
    f = open(isin + '.txt', 'w')
    f.write(text)
    f.close()

def document_to_words(doc_path):
    with open(doc_path, 'r', encoding='utf8') as html_file:
        full_text = html_file.read()
        body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
        cleantext = re.sub('<\/?span[^>]*>', ' ', body[0])
        clean_text = re.sub('<.*?>', ' ', cleantext)
        clean_text = clean_text.lower()
        clean_text = re.sub('0.01', '1', clean_text)
        clean_text = re.sub(',', '', clean_text)
        clean_text = re.sub(r'[^a-z0-9]', ' ', clean_text)
        clean_text = re.sub('\s+', ' ', clean_text)
        words = nltk.word_tokenize(clean_text)
        stops = set(stopwords.words('english'))
        relevant_words = [w for w in words if w not in stops and w != '' and w != ' ']
        return relevant_words

keywords_contexts = [['minimum', 'principal', 'amounts'], \
                     ['minimum', 'amounts'], \
                     ['minimum', 'amount'], \
#                      ['minimum']
                    ]

def contains(small_list, big_list):
    for word in small_list:
        if word not in big_list:
            return False
    return True

def keywords_in_context(keywords, context):
    for keywords_list in keywords:
        if contains(keywords_list, context):
            return True
        
    return False

def sort_contexts(contexts):
    c = sorted(contexts, key=len)
    c.reverse()
    return c

keywords_contexts = sort_contexts(keywords_contexts)

def extract_relevant_info(words, op_curr, window_before, window_after):
#     print('extracting info for : %s - %s', (op_curr, words))

    AMOUNT_REGEX = r'\d{1,12}'
    CURR_AMOUNT_REGEX = r'[a-z]{3}\d{1,12}'
    
    relevant_info = ['0']
    
    for i in range(len(words)):
        word = words[i]

        if word == op_curr and i + window_after < len(words):
            #retain look_around words around the keyword
            context_before = words[i - window_before : i]
            
            keywords_present = keywords_in_context(keywords_contexts, context_before)
            
            if keywords_present:
                context_after = words[i : i + window_after]

                for w in context_after:
                    if re.match(AMOUNT_REGEX, w):

                            context = context_before + context_after

                            relevant_info += context

                            break

        elif word.startswith(op_curr) and re.match(CURR_AMOUNT_REGEX, word):
            context_before = words[i - window_before : i]
            
            keywords_present = keywords_in_context(keywords_contexts, context_before)
            
            if keywords_present:
                context_after = words[i : i + window_after]

                matches = re.findall(AMOUNT_REGEX, word)

                context = context_before + [op_curr] + matches + context_after

                relevant_info += context
                
    return ' '.join(relevant_info)

inspect = []

def group_docs_by_isin(html_folder, labels, files_isins, pickle_results=False, pickle_file='docs_by_isin.pickle'):
    t0 = time()
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(files_isins, file_name)

            if(isin is not None):

                #found the isin to associate the document with
                words = document_to_words(path.join(html_folder, file_name))
                
                op_curr = get_label_value_for_isin(labels, isin, 'OPS.CURR')
                
                window_before = 4
                window_after = 2
                
                if isin in inspect:
                    save_file(isin, ' '.join(words))
                
#                 file_content = extract_relevant_info(words, op_curr.lower(), window_before, window_after)
                file_content = ' '.join(words)

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
    print('\nFinished grouping file contents indexed by ISIN')
    
    if pickle_results:
        pickle.dump(contents_by_isin, open(pickle_file, 'wb'))
        
        print('Saved file contents indexed by ISIN to:', pickle_file)
        
    print('Processed %d files in %0.3fs' % (len(contents_by_isin.keys()), (time() - t0)))
        
    return contents_by_isin

def load_and_sort_training_data(file_name):
    data = pickle.load(open(file_name, 'rb'))
    data_by_isin = [[k, v] for k, v in data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
    dataset = data_by_isin_df.as_matrix(columns=['content'])
    
    labels = pd.read_csv( datafolder + 'outcome/ISIN_train.csv', header=0, sep=',', quoting=1, thousands=",")
    data_labeled_df = pd.DataFrame(labels[['ISIN','MIN.TRAD.AMT']])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)
    labelset = data_labeled_df.as_matrix(columns=['MIN.TRAD.AMT'])
    
    return dataset.ravel(), labelset.ravel()

In [4]:
group_docs_by_isin(html_folder, labels, files_isins, pickle_results=True, pickle_file='min_labeled_docs_by_isin.pickle')
print('OK!')

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [5]:
dataset, labelset = load_and_sort_training_data('min_labeled_docs_by_isin.pickle')
print(dataset[:1])

[ 'instrument summary cd buenos aires 3 98 titulos de deuda publica 2014 15 3 18 class 3 issuer gk625220 cuidad autonoma de buenos aires domicile argentina sector cities municipal authorities basic data swiss national security number 26209866 isin arcbas031621 cfi code dbftxr assigned six within six jurisdiction liquid market 82 buenos aires instrument type debt short name 3 98 buenos 18 3 original issuer gk625220 cd buenos aires argentina currency principal usd 100000000 outstanding capital maturity date 15 03 2018 callability unknown coupon 3 98 fixed payment frequency every 6 months income type periodical payment first payment per year 15 03 inflation protected denominations usd 1000 depository custody cvba clearstream bk lux euroclear bank latest instrument ratings scheme name date rating rating trend mdyltdfc moody 39 long term debt foreign ccy ratings 30 11 2015 caa1 mdyltdlc moody 39 long term debt local ccy ratings 04 11 2015 caa1 mdyendc rating history moody 39 endorsement com

In [6]:
from sklearn import cross_validation

dataset, labelset = load_and_sort_training_data('min_labeled_docs_by_isin.pickle')

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier(average=True))])

text_clf = text_clf.set_params(clf__alpha = 1e-04, \
                               clf__n_iter = 100, \
                               clf__penalty = 'l2', \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 0.4, \
                               vect__ngram_range = (1, 2))

text_clf = text_clf.fit(X_train, y_train)

y_train_pred = text_clf.predict(X_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

print(np.mean(y_train == y_train_pred))

print(classification_report(y_train, y_train_pred))

In [50]:
y_test_pred = text_clf.predict(X_test)

In [51]:
print(text_clf.score(X_test, y_test))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))

0.922419059255
             precision    recall  f1-score   support

          0       0.94      0.98      0.96      1441
         10       0.00      0.00      0.00         1
        200       1.00      1.00      1.00         2
        250       0.00      0.00      0.00         0
       1000       1.00      0.38      0.55         8
       2000       0.72      0.79      0.75        52
       5000       0.00      0.00      0.00         4
      10000       0.50      0.12      0.20         8
      20000       0.75      0.60      0.67         5
      50000       0.00      0.00      0.00         3
     100000       0.83      0.44      0.57        55
     104300       0.00      0.00      0.00         1
     120000       0.00      0.00      0.00         1
     130000       0.00      0.00      0.00         1
     150000       0.60      0.60      0.60        10
     200000       0.43      0.36      0.39        25
     250000       1.00      0.17      0.29         6
     300000       0.00      0.

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [35]:
def compile_mismatched_data_summary(y_test, y_test_pred, labels_path, doc_isin_path, label_name):
    labels = pd.read_csv(labels_path, header=0, sep=",", quoting=1, thousands=",")
    data_labeled_df = pd.DataFrame(labels[['ISIN', label_name]])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)

    idx = np.where(y_test != y_test_pred)
    mismatched_isins = data_labeled_df.as_matrix()[idx]

    all_labels_df = pd.read_csv(doc_isin_path, header=0, sep=",", quoting=1, thousands=",")
    all_labels_df.set_index(['ISIN'])
    all_labels_df.sort_values('ISIN', axis=0, inplace=True)

    all_labels = all_labels_df.as_matrix(columns=['DOCID', 'ISIN'])

    file_ids = []
    for isin in mismatched_isins[:,0]:
        idxx = all_labels_df['ISIN'] == isin
        file_id = all_labels_df['DOCID'][idxx].values[0]
        file_ids.append(file_id)
        
    summary = np.c_[np.array(file_ids), mismatched_isins[:,0], y_test[idx], y_test_pred[idx]]
    
    summary_df = pd.DataFrame(summary, columns=['fileId', 'isin', 'actual', 'predicted'])
    
    return summary_df

compile_mismatched_data_summary(y_test, y_test_pred, datafolder + 'outcome/ISIN_train.csv', \
                                datafolder + 'docID/docid_train.csv', 'MIN.TRAD.AMT')

Unnamed: 0,fileId,isin,actual,predicted
0,0900045c839f0d17,AT0000A0MPB1,200,0
1,0900045c83f01550,AU3FN0030839,100000,0
2,0900045c83f8dc8e,AU3FN0030912,2000,0
3,0900045c838bd72f,CH0254068866,100000,0
4,0900045c83bae03d,CH0254071365,5000,0
5,0900045c83bba3a8,CH0254071431,150000,0
6,0900045c8398c206,CH0266691317,2000,0
7,0900045c83f93596,CH0266712170,20000,0
8,0900045c83855f37,CH0266718706,2000,0
9,0900045c83f085ae,CH0266720322,0,100000


In [106]:
labels = pd.read_csv( datafolder + "labels.csv", header=0, sep=",", quoting=1, thousands=",")
get_labels_for_isin(labels,'XS0461332347')

array([['0900045c833fd034', 'XS0461332347', 'USD', 2000.0, 1000.0, 'N',
        'USUB', 'N', 'USD', 'DEUTSCHE BANK AG, GREAT WINCHE', 'LONDON',
        'GERMANY', nan, nan, nan]], dtype=object)

In [None]:
from pprint import pprint
from time import time
import logging
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import cross_validation

dataset, labelset = load_and_sort_training_data('min_labeled_docs_by_isin.pickle')

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

pipeline = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier())])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.0001, 0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80)
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

dataset, labelset = load_and_sort_training_data('min_labeled_docs_by_isin.pickle')

text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier())])

text_clf = text_clf.set_params(clf__alpha = 1e-05, \
                               clf__n_iter = 50, \
                               clf__penalty = 'l2', \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 1.0, \
                               vect__ngram_range = (1, 2))

text_clf = text_clf.fit(dataset, labelset)

joblib.dump(text_clf, 'minimum_amount_pipeline.pkl') 

['minimum_amount_pipeline.pkl',
 'minimum_amount_pipeline.pkl_01.npy',
 'minimum_amount_pipeline.pkl_02.npy',
 'minimum_amount_pipeline.pkl_03.npy',
 'minimum_amount_pipeline.pkl_04.npy',
 'minimum_amount_pipeline.pkl_05.npy',
 'minimum_amount_pipeline.pkl_06.npy']

In [14]:
import pandas as pd
unlabeled_isins = pd.read_csv('../int_test/docID/docid_int_test.csv', header=0, sep=",", quoting=1, thousands=",")
unlabeled_isins.head()

Unnamed: 0,DOCID,ISIN
0,0900045c80a74f74,US29269MAA45
1,0900045c80b066db,XS0480504256
2,0900045c80c35015,XS0532879300
3,0900045c80c883a0,US40430CLJ61
4,0900045c80cd2703,XS0673671623


In [15]:
labeled_currencies = pd.read_csv('currencyExtraction/optCur_int.csv', header=None, names=['ISIN', 'OPS.CURR'], sep=",", quoting=1, thousands=",")
labeled_currencies.head()

OSError: File b'currencyExtraction/optCur_int.csv' does not exist

In [8]:
group_docs_by_isin(html1_folder, labeled_currencies, unlabeled_isins, pickle_results=True, pickle_file='unlabeled_amt_docs_by_isin.pickle')
print('OK!')

Processing files
................................................................................................................................................................................................................................................................................................................................................................................................................................................
Finished grouping file contents indexed by ISIN
Saved file contents indexed by ISIN to: unlabeled_amt_docs_by_isin.pickle
Processed 424 files in 24.544s
OK!


In [9]:
import numpy as np
import pandas as pd
import pickle

unlabeled_data = pickle.load(open('unlabeled_amt_docs_by_isin.pickle', 'rb'))
data_by_isin = [[k, v] for k, v in unlabeled_data.items()]
data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
unlabeled_dataset = data_by_isin_df.as_matrix(columns=['content']).flatten()
isins = data_by_isin_df.as_matrix(columns=['isin']).flatten()

In [11]:
from sklearn.externals import joblib
#load the classifier
clf = joblib.load('minimum_amount_pipeline.pkl')
multiple_amounts = clf.predict(unlabeled_dataset)

In [12]:
import csv

results_df = pd.DataFrame(np.vstack((isins, multiple_amounts)).T, columns=['ISIN', 'MIN.TRAD.AMT'])

results_df.to_csv('final_minTrad.csv', index=False, header=False, quoting=csv.QUOTE_ALL)