In [2]:
import os
from os import path

datafolder = "../train/"
assert path.isdir(datafolder)

data1folder = '../final_test'
assert path.isdir(data1folder)

html1_folder = path.join(data1folder, "html")
assert path.isdir(html1_folder)

html_folder = path.join(datafolder, "html")
assert path.isdir(html_folder)

In [3]:
import pandas as pd

labels = pd.read_csv( datafolder + "outcome/ISIN_train.csv", header=0, sep=",", quoting=1, thousands=",")
files_isins = pd.read_csv( datafolder + "docID/docid_train.csv", header=0, sep=",", quoting=1, thousands=",")
labels.head()

Unnamed: 0,ISIN,ISSUER.NAME,ZCP.FL,MIN.TRAD.AMT,MLT.TRAD.AMT,OPS.CURR
0,ARCBAS031621,CIUDAD DE BUENOS AIRES,N,0,1000,ARS
1,AT0000248448,UNICREDIT BANK AUSTRIA AG,N,0,100000,EUR
2,AT0000A0MPB1,BAWAG PSK BANK FUR ARBEIT UND WIRTSCHAFT UND O...,N,0,100,EUR
3,AT0000A17HH9,RAIFFEISEN CENTROBANK AG,N,0,1000,USD
4,AT0000A1HE76,RAIFFEISEN CENTROBANK AG,Y,0,1000,CZK


In [4]:
import re
import pickle
import nltk
from nltk.corpus import stopwords
from time import time

def get_isin_for_file(labels, file_name, print_result = False):
    isin = None
    #function to get the isin from the labels, given a file name
    file_id = file_name.split('_')[0]
    idx = labels['DOCID'] == file_id
    isins = labels['ISIN'][idx]
    if isins.values: 
        isin = isins.values[0]
    
    if(print_result):
        print("%s - %s" % (file_name, isin))
    return isin

def get_labels_for_isin(labels, isin):
    idx = labels['ISIN'] == isin
    return labels[:][idx].values

def get_label_value_for_isin(labels, isin, attr_name):
    #function to return the value of the label for a given isin
    idx = labels['ISIN'] == isin
    return labels[attr_name][idx].values[0]

def save_file(isin, text):
    f = open(isin + '.txt', 'w')
    f.write(text)
    f.close()

def document_to_words(doc_path):
    with open(doc_path, 'r', encoding='utf8') as html_file:
        full_text = html_file.read()
        body = re.findall(r'<body[^>]*?>(.*?)</body>', full_text)
        cleantext = re.sub('<\/?span[^>]*>', ' ', body[0])
        clean_text = re.sub('<.*?>', ' ', cleantext)
        clean_text = clean_text.lower()
        clean_text = re.sub('0.01', '1', clean_text)
        clean_text = re.sub(',', '', clean_text)
        clean_text = re.sub(r'[^a-z0-9]', ' ', clean_text)
        clean_text = re.sub('\s+', ' ', clean_text)
        words = nltk.word_tokenize(clean_text)
        stops = set(stopwords.words('english'))
        relevant_words = [w for w in words if w not in stops and w != '' and w != ' ']
        return relevant_words

keywords_contexts = [['nominal', 'amount'], \
                     ['denomination'], \
                     ['denominations'], \
                     ['notional', 'amount', 'certificate'], \
                     ['specified', 'denomination'], \
                     ['specified', 'denominations'],
                     ['payable', 'security', 'nominal', 'amount'],
                     ['increasing', 'multiples']
                    ]

def contains(small_list, big_list):
    for word in small_list:
        if word not in big_list:
            return False
    return True

def keywords_in_context(keywords, context):
    for keywords_list in keywords:
        if contains(keywords_list, context):
            return True
        
    return False

def sort_contexts(contexts):
    c = sorted(contexts, key=len)
    c.reverse()
    return c

keywords_contexts = sort_contexts(keywords_contexts)

def extract_relevant_info(words, op_curr, window_before, window_after):
#     print('extracting info for : %s - %s', (op_curr, words))

    AMOUNT_REGEX = r'\d{1,12}'
    CURR_AMOUNT_REGEX = r'[a-z]{3}\d{1,12}'
    
    relevant_info = []
    
    for i in range(len(words)):
        word = words[i]

        if word == op_curr and i + window_after < len(words):
            #retain look_around words around the keyword
            context_before = words[i - window_before : i]
            
            keywords_present = keywords_in_context(keywords_contexts, context_before)
            
            if keywords_present:
                context_after = words[i : i + window_after]
                context = context_before + context_after
                relevant_info += context

        elif word.startswith(op_curr) and re.match(CURR_AMOUNT_REGEX, word):
            context_before = words[i - window_before : i]
            context_after = words[i : i + window_after]
            matches = re.findall(AMOUNT_REGEX, word)
            context = context_before + [op_curr] + matches + context_after

            relevant_info += context
            
    return ' '.join(relevant_info)

inspect = []

def group_docs_by_isin(html_folder, labels, files_isins, pickle_results=False, pickle_file='docs_by_isin.pickle'):
    t0 = time()
    
    contents_by_isin = {}
    
    print('Processing files')

    for file_name in os.listdir(html_folder):
        
        print('.', end='')
        
        if file_name.endswith(".html"):

            #first find the isin corresponding to this file
            isin = get_isin_for_file(files_isins, file_name)

            if(isin is not None):

                #found the isin to associate the document with
                words = document_to_words(path.join(html_folder, file_name))
                
                op_curr = get_label_value_for_isin(labels, isin, 'OPS.CURR')
                
                window_before = 8
                window_after = 8
                
                if isin in inspect:
                    save_file(isin, ' '.join(words))
                
                file_content = extract_relevant_info(words, op_curr.lower(), window_before, window_after)
#                 file_content = ' '.join(words)

                try:
                    #if there is already existing data for this isin, append the new data
                    existing_isin_data = contents_by_isin[isin]
                    contents_by_isin[isin] = existing_isin_data + file_content

                except KeyError:
                    contents_by_isin[isin] = file_content
    
    print('\nFinished grouping file contents indexed by ISIN')
    
    if pickle_results:
        pickle.dump(contents_by_isin, open(pickle_file, 'wb'))
        
        print('Saved file contents indexed by ISIN to:', pickle_file)
        
    print('Processed %d files in %0.3fs' % (len(contents_by_isin.keys()), (time() - t0)))
        
    return contents_by_isin

def load_and_sort_training_data(file_name):
    data = pickle.load(open(file_name, 'rb'))
    data_by_isin = [[k, v] for k, v in data.items()]
    data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
    data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
    dataset = data_by_isin_df.as_matrix(columns=['content'])
    
    labels = pd.read_csv( datafolder + 'outcome/ISIN_train.csv', header=0, sep=',', quoting=1, thousands=',')
    data_labeled_df = pd.DataFrame(labels[['ISIN','MLT.TRAD.AMT']])
    data_labeled_df.set_index(['ISIN'])
    data_labeled_df = data_labeled_df.drop_duplicates()
    data_labeled_df.sort_values('ISIN', axis=0, inplace=True)
    labelset = data_labeled_df.as_matrix(columns=['MLT.TRAD.AMT'])
    
    return dataset.ravel(), labelset.ravel()

In [48]:
group_docs_by_isin(html_folder, labels, files_isins, pickle_results=True, pickle_file='mult_labeled_docs_by_isin.pickle')
print('OK!')

Processing files
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [49]:
dataset, labelset = load_and_sort_training_data('mult_labeled_docs_by_isin.pickle')
print(dataset[:1])

['']


In [50]:
from sklearn import cross_validation


dataset, labelset = load_and_sort_training_data('mult_labeled_docs_by_isin.pickle')

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import cross_validation


text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
#                      ('pca', TruncatedSVD()), \
                     ('clf', SGDClassifier(average=True))
#                      ('clf', RandomForestClassifier())
                    ])


text_clf = text_clf.set_params(\
                               clf__alpha = 1e-04, \
                               clf__n_iter = 50, \
                               clf__penalty = 'l2', \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 0.4, \
                               vect__ngram_range = (1, 2), \
#                                pca__n_components = 100
                              )

"""
text_clf = text_clf.set_params(\
                               clf__alpha = 0.0001, \
                               clf__n_iter = 100, \
                               clf__penalty = 'l2', \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = False, \
                               vect__max_df = 0.75, \
                               vect__max_features = 10000, \
                               vect__ngram_range = (1, 2)
                              )
        

text_clf = text_clf.set_params(clf__criterion = 'gini', \
                               clf__max_features = 'auto', \
                               clf__n_estimators = 50, \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = False, \
                               vect__max_df = 1, \
                               vect__max_features = None, \
                               vect__ngram_range = (1, 2), \
#                                pca__n_components = 100
                              )
"""
t0 = time()

text_clf = text_clf.fit(X_train, y_train)

t1 = time()

print('Training the classifier took %0.3fs' % (t1 - t0))

y_train_pred = text_clf.predict(X_train)

Training the classifier took 1.520s


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

#print(np.mean(y_train == y_train_pred))

# print(classification_report(y_train, y_train_pred))

In [52]:
y_test_pred = text_clf.predict(X_test)

In [53]:
from sklearn.metrics import classification_report
import numpy as np

print(np.mean(y_test_pred == y_test))

print(classification_report(y_test, y_test_pred))

0.827733659133
             precision    recall  f1-score   support

          1       0.95      0.73      0.82       215
          2       0.33      1.00      0.50         1
          4       0.00      0.00      0.00         1
         10       0.00      0.00      0.00         1
        100       1.00      0.08      0.15        12
        120       0.00      0.00      0.00         1
        500       0.00      0.00      0.00         2
       1000       0.75      0.98      0.85       537
       1500       0.50      1.00      0.67         1
       2000       1.00      0.50      0.67         6
       3264       0.00      0.00      0.00         1
       3306       0.00      0.00      0.00         1
       3606       0.00      0.00      0.00         1
       4247       0.00      0.00      0.00         1
       4800       0.00      0.00      0.00         1
       5000       0.80      0.27      0.40        15
       6146       0.00      0.00      0.00         1
       8688       0.00      0.

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [42]:
labels = pd.read_csv( datafolder + "labels.csv", header=0, sep=",", quoting=1, thousands=",")
get_labels_for_isin(labels,'XS0461332347')

array([['0900045c833fd034', 'XS0461332347', 'USD', 2000.0, 1000.0, 'N',
        'USUB', 'N', 'USD', 'DEUTSCHE BANK AG, GREAT WINCHE', 'LONDON',
        'GERMANY', nan, nan, nan]], dtype=object)

In [29]:
from pprint import pprint
from time import time
import logging
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

dataset, labelset = load_and_sort_training_data('mult_labeled_docs_by_isin.pickle')

X_train, X_test, y_train, y_test = cross_validation.train_test_split(dataset, labelset,\
                                                                     test_size=0.3, random_state=53)

pipeline = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier()) \
#                     ('clf', RandomForestClassifier())
                    ])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {\
#               'clf__n_estimators' : (10, 50, 100), \
#               'clf__criterion' : ('entropy', 'gini'), \
#               'clf__max_features' : ('auto', 'sqrt', 'log2'), \
              'vect__max_df': (0.5, 0.75, 1.0), \
              'vect__max_features': (None, 5000, 10000, 50000), \
              'vect__ngram_range': ((1, 1), (1, 2)), \
              'tfidf__use_idf': (True, False), \
              'tfidf__norm': ('l1', 'l2'), \
              'clf__alpha': (0.0001, 0.00001, 0.000001),\
              'clf__penalty': ('l2', 'elasticnet'),\
              'clf__n_iter': (10, 50, 100)\
             }

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train[:1000], y_train[:1000])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (0.0001, 1e-05, 1e-06),
 'clf__n_iter': (10, 50, 100),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 1092 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   53.2s
[Parallel(n_jobs=-1)]: Done 1892 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 3092 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3842 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 4692 tasks      | elapsed:  3.6min


done in 258.210s

Best score: 0.810
Best parameters set:
	clf__alpha: 0.0001
	clf__n_iter: 10
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__max_features: 10000
	vect__ngram_range: (1, 2)


[Parallel(n_jobs=-1)]: Done 5184 out of 5184 | elapsed:  4.3min finished


In [54]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.externals import joblib

dataset, labelset = load_and_sort_training_data('mult_labeled_docs_by_isin.pickle')

text_clf = Pipeline([('vect', CountVectorizer()), \
                     ('tfidf', TfidfTransformer()), \
                     ('clf', SGDClassifier())])

text_clf = text_clf.set_params(clf__alpha = 1e-04, \
                               clf__n_iter = 50, \
                               clf__penalty = 'l2', \
                               clf__n_jobs = -1, \
                               tfidf__norm = 'l2', \
                               tfidf__use_idf = True, \
                               vect__max_df = 0.4, \
                               vect__ngram_range = (1, 2))

text_clf = text_clf.fit(dataset, labelset)

joblib.dump(text_clf, 'multiple_amount_pipeline.pkl') 

['multiple_amount_pipeline.pkl',
 'multiple_amount_pipeline.pkl_01.npy',
 'multiple_amount_pipeline.pkl_02.npy',
 'multiple_amount_pipeline.pkl_03.npy',
 'multiple_amount_pipeline.pkl_04.npy',
 'multiple_amount_pipeline.pkl_05.npy',
 'multiple_amount_pipeline.pkl_06.npy']

In [5]:
unlabeled_isins = pd.read_csv('../final_test/docID/docid_final_test.csv', header=0, sep=",", quoting=1, thousands=",")

# unlabeled_isins = load_isins_files('../int_test/docID/docid_int_test.csv')

unlabeled_isins.head()

Unnamed: 0,DOCID,ISIN
0,0900045c8059e795,CH0025370906
1,0900045c8092c413,XS0545673914
2,0900045c80c5dd9f,US40430CLG23
3,0900045c80d7461e,US44328MAX20
4,0900045c80dd8470,XS0729081124


In [6]:
labeled_currencies = pd.read_csv('currencyExtraction/optCur.csv', header=None, names=['ISIN', 'OPS.CURR'], sep=",", quoting=1, thousands=",")
labeled_currencies.head()

Unnamed: 0,ISIN,OPS.CURR
0,XS1330099810,RUB
1,XS1326148142,SGD
2,XS1129852551,JPY
3,XS1276919328,EUR
4,XS1317265038,USD


In [7]:
group_docs_by_isin(html1_folder, labeled_currencies, unlabeled_isins, pickle_results=True, pickle_file='unlabeled_amt_docs_by_isin.pickle')
print('OK!')

Processing files
................................................................................................................................................................................................................................................................................................................................................................................................................................................
Finished grouping file contents indexed by ISIN
Saved file contents indexed by ISIN to: unlabeled_amt_docs_by_isin.pickle
Processed 424 files in 25.780s
OK!


In [8]:
import numpy as np
import pandas as pd
import pickle

unlabeled_data = pickle.load(open('unlabeled_amt_docs_by_isin.pickle', 'rb'))
data_by_isin = [[k, v] for k, v in unlabeled_data.items()]
data_by_isin_df = pd.DataFrame(data_by_isin, columns = ['isin','content'])
data_by_isin_df.sort_values('isin', axis=0, inplace=True)
  
unlabeled_dataset = data_by_isin_df.as_matrix(columns=['content']).flatten()
isins = data_by_isin_df.as_matrix(columns=['isin']).flatten()

In [10]:
from sklearn.externals import joblib

#load the classifier
clf = joblib.load('multiple_amount_pipeline.pkl')
multiple_amounts = clf.predict(unlabeled_dataset)

In [11]:
import csv

results_df = pd.DataFrame(np.vstack((isins, multiple_amounts)).T, columns=['ISIN', 'MLT.TRAD.AMT'])

results_df.to_csv('final_mltTrad.csv', index=False, header=False, quoting=csv.QUOTE_ALL)