# Fasttext for financial entity classification
Applied to classifying financial instrument names into:
* fund
* etf
* company
* bond

In [2]:
#!pip install fasttext
#!pip install pandas
#!pip install openpyxl
#!pip install scikit-learn
#!pip install matplotlib
#!pip install seaborn

## Dataset

In [3]:
import pandas as pd
import random

def get_data(path="company_bond_fund.xlsx", sheet_name="dataset", limit=4000, seed=None):
    """Load the labelled name dataset and return it as shuffled (label, name) pairs.

    The spreadsheet is expected to hold one column per class, named
    "company", "bond", "etf" and "fund". At most ``limit`` rows are taken
    from each column.

    Args:
        path: Excel workbook to read.
        sheet_name: Sheet inside the workbook that holds the four columns.
        limit: Maximum number of rows taken per column (``None`` = all rows).
        seed: When given, the shuffle uses a private ``random.Random(seed)``
            and is reproducible. When ``None`` the global ``random`` module
            is used, exactly as before, so ``random.seed(...)`` still applies.

    Returns:
        A list of ``(label, name)`` tuples in random order.

    Raises:
        KeyError: If one of the four expected columns is missing.
    """
    df = pd.read_excel(io=path, sheet_name=sheet_name)

    labels = ("company", "bond", "etf", "fund")
    columns = [df[label][:limit] for label in labels]

    data = []
    # Walk the columns row by row so the pre-shuffle order stays
    # company, bond, etf, fund, company, ...
    for row in zip(*columns):
        for label, name in zip(labels, row):
            # Columns of unequal length are padded with NaN by pandas; such a
            # cell is not a real sample and would otherwise end up as the
            # literal text "nan" once it is stringified downstream.
            if pd.isna(name):
                continue
            data.append((label, name))

    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)
    return data

# get_data() shuffles with the global `random` module; seeding it first makes
# the shuffle, and therefore the positional train/test/validation split taken
# from `data` later on, reproducible across kernel restarts.
random.seed(42)
data = get_data()
len(data)

16000

## Preprocessing

In [8]:
import re

def preprocess(raw):
    """Normalise one name, or a list/tuple of names, before feeding it to fastText.

    Each value is converted with ``str()`` and then:
      * ``%`` is surrounded by spaces so it becomes its own token,
      * ``;`` ``,`` ``.`` ``:`` are each replaced by a single space,
      * every run of digits is surrounded by spaces,
      * leading/trailing whitespace is stripped.

    Args:
        raw: A single value, or a list/tuple of values.

    Returns:
        A ``str`` for a single value, or a ``list`` of ``str`` when ``raw``
        is a list or tuple (previously a tuple was stringified as a whole).
    """
    separators = str.maketrans({";": " ", ",": " ", ".": " ", ":": " "})

    def tune(string):
        text = str(string).replace("%", " % ")
        text = text.translate(separators)
        # NOTE(review): every "." has already been turned into a space above,
        # so the optional decimal group of this pattern can never match and
        # "2.125" ends up as the two tokens "2" and "125". Kept as is because
        # changing it would change the features the model is trained on.
        return re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", text).strip()

    if isinstance(raw, (list, tuple)):
        return [tune(item) for item in raw]
    return tune(raw)


## Writing as fasttext compatible dataset

In [9]:
# Normalise every name in place, keeping its label, and print a sparse sample
# (every 1003rd pair) as a visual sanity check of the result.
for position, (label, raw_name) in enumerate(data):
    cleaned_pair = (label, preprocess(raw_name))
    data[position] = cleaned_pair
    if position % 1003 == 0:
        print(cleaned_pair)

('fund', 'Meeder Funds  Meeder Balanced Fund  Adviser Class Shares')
('etf', 'Xtrackers Harvest CSI  300  China A-Shares ETF')
('bond', 'INDIGO 2   125  % APR 25')
('bond', 'CMHLSF 0   75  %  10 APR 30')
('etf', 'Principal U S  Small Cap Multi-Factor ETF')
('bond', 'CASA 2   75  %  20 JUN 2024')
('company', 'Welcia Holdings')
('fund', 'BlackRock Liquidity Funds  Federal Trust Fund  Cash Reserve Shares')
('fund', 'Lord Abbett Investment Trust  Lord Abbett Short Duration Income Fund  Class A Shares')
('fund', 'T Rowe Price US Large-Cap Core Fund  Inc  Class I Shares')
('etf', 'ARK Autonomous Technology & Robotics ETF')
('fund', 'MassMutual Select Funds  MassMutual RetireSMART by JPMorgan  2050  Fund  Administrative Class Shares')
('bond', 'BPCE 1   934  %  29 SP 25')
('company', 'Okta')
('bond', 'ENTRA 12  ESG')
('bond', 'CDC 1   44  %  25 NOV 30')


In [10]:
# Positional split of the shuffled pairs: 10000 train, 2000 test, 2000 validation.
# NOTE(review): with the 16000 pairs reported by len(data), data[14000:] is not
# assigned to any split — confirm that this hold-out is intentional.
train = data[:10000]
test = data[10000:12000]
val = data[12000:14000]

In [11]:
def write_data(data, name):
    """Write (label, text) pairs to ``name`` in fastText supervised format.

    Each pair becomes one line of the form ``__label__<label> <text>``.
    An existing file is overwritten.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        name: Path of the output file.
    """
    # The context manager guarantees the file is flushed and closed before
    # fastText reads it; UTF-8 is set explicitly because that is the encoding
    # fastText expects, regardless of the platform default.
    with open(name, "w", encoding="utf-8") as file:
        for l, d in data:
            file.write(f"__label__{l} {d}\n")
    

# One fastText-formatted file per split.
split_files = (
    (train, "fasttext_train.txt"),
    (test, "fasttext_test.txt"),
    (val, "fasttext_validate.txt"),
)
for split_pairs, split_path in split_files:
    write_data(split_pairs, split_path)

## Training

In [12]:
import fasttext
# Train a supervised classifier; only the training file is passed, so every
# other hyperparameter keeps the library default.
model = fasttext.train_supervised(input="fasttext_train.txt")
# Show the label set the model was trained with.
model.labels

['__label__etf', '__label__company', '__label__fund', '__label__bond']

## Testing

In [13]:
def print_results(N, p, r, k=1):
    """Print sample count, precision and recall as tab-separated lines.

    Args:
        N: Number of evaluated samples.
        p: Precision at ``k``.
        r: Recall at ``k``.
        k: The ``k`` the metrics were computed at; only used for the
            ``P@k`` / ``R@k`` captions. Defaults to 1, the previous
            hard-coded value.
    """
    print(f"N\t{N}")
    print(f"P@{k}\t{p:.3f}")
    print(f"R@{k}\t{r:.3f}")

# Evaluate on the test split file and report sample count, precision and recall.
test_metrics = model.test('fasttext_test.txt')
print_results(*test_metrics)

N	2000
P@1	0.989
R@1	0.989


In [20]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn

# Ground truth: the texts of the test split and their labels, prefixed the
# same way the model reports them.
names = [name for _label, name in test]
labels = ["__label__" + label for label, _name in test]

# Top-1 predicted label for every test name, one predict() call per name.
predicted = [model.predict(name)[0][0] for name in names]

# No explicit `labels=` ordering is passed, so scikit-learn orders rows (true)
# and columns (predicted) by the sorted label strings.
cm = confusion_matrix(labels, predicted)
cm

array([[497,   3,   0,   0],
       [  0, 493,   3,   0],
       [  0,   0, 493,   6],
       [  0,   1,   9, 495]])

## Validation

In [15]:
# Smoke test: classify a single raw string (preprocess() is not applied here).
# The result is a (labels, probabilities) pair for the top prediction.
model.predict("test")

(('__label__company',), array([1.00000787]))

In [16]:
# Batch prediction on a few hand-written names, normalised the same way as
# the training data.
# NOTE(review): in the recorded output "some fund" is classified as company
# with probability ~1.0 — worth a closer look at how short fund names behave.
sample_names = ["Amazon", "Google", "some fund", "EU 5 % 2020"]
model.predict(preprocess(sample_names))

([['__label__company'],
  ['__label__company'],
  ['__label__company'],
  ['__label__bond']],
 [array([0.99870664], dtype=float32),
  array([1.0000079], dtype=float32),
  array([1.0000079], dtype=float32),
  array([1.0000077], dtype=float32)])

In [17]:
# Evaluate on the validation split file, which the model has not seen during
# training or the test evaluation.
validation_metrics = model.test("fasttext_validate.txt")
print_results(*validation_metrics)

N	2000
P@1	0.995
R@1	0.995


## Saving model

In [201]:
# Persist the trained model to disk.
# NOTE(review): the quantize() call below is commented out, so the file that is
# written holds the full, unquantized model. fastText's own examples use the
# ".ftz" extension for quantized models and ".bin" otherwise — confirm whether
# quantization should be re-enabled or the file name changed.
#model.quantize(input='fasttext_train.txt', retrain=True)
model.save_model("security_classifier.ftz")