## Import Libraries

In [25]:
# set sys path to access scripts
import sys
sys.path.append('../')

# general
import pandas as pd
import numpy as np

# model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# custom scripts
import scripts.config as config
from scripts.helpers import get_regex
from scripts.helpers import split_sentence
from scripts.helpers import list_to_comma_sep_string
from scripts.helpers import list_to_string
from scripts.helpers import pos_tagging
from scripts.helpers import stem_sentence


## Import Data

In [37]:
# Load the phrase bank and the domain dictionary from the CSV paths defined in scripts.config.
# Later cells read the 'text' column of df_phrase and the 'word' / 'type' columns of df_domain_dict.
df_phrase = pd.read_csv(config.FINANCIAL_PHRASE_BANK)
df_domain_dict = pd.read_csv(config.DOMAIN_DICTIONARY)

## Create Tags using Domain Dictionary

Each sentence is parsed and tagged if it contains words that are present in the lexicon. In addition, combined tags are created when a performance indicator appears in context with a direction word.

- "The lithuanian beer market was <b>up</b> 14.41 million litres in January, a <b>rise</b> of 0.8 percent from the year-earlier figure" — up, up
- In the second quarter <b>production</b> at the plant <b>increased by</b> 20% — leading:up

The feature creation step below adds two new columns to the dataframe: `tags`, a list of tags as described above, and `taged_words`, a list of the words corresponding to those tags.

In [38]:
def reduce_phrases(text, phrases):
    """Merge neighbouring tokens that form a known phrase into a single token.

    The tokens are scanned from right to left. Whenever two adjacent tokens,
    combined by ``list_to_string``, are found in ``phrases``, the pair is
    replaced by one ``left_right`` token (joined with an underscore).

    Parameters
    ----------
    text : list of str
        Tokens of one sentence. The list is copied first, so the caller's
        object is never modified.
    phrases : container of str
        Known multi-word phrases; only membership tests are performed on it.
        NOTE(review): entries must have the same form that ``list_to_string``
        produces for a two-token list (presumably space-joined) -- confirm.

    Returns
    -------
    The merged token list passed through ``list_to_string``.
    """
    # work on a copy so the caller's token list is left untouched
    tokens = list(text)

    # The last token has no right-hand neighbour, so start one position before
    # it. Walking backwards keeps the indices still to be visited valid after
    # a merge shortens the list.
    for i in range(len(tokens) - 2, -1, -1):
        if list_to_string(tokens[i:i + 2]) in phrases:
            tokens[i:i + 2] = [tokens[i] + '_' + tokens[i + 1]]

    return list_to_string(tokens)


In [39]:
# dictionary entries made of more than one word are the phrases to look for
phrases = {entry for entry in df_domain_dict['word'] if len(entry.split()) > 1}

# collapse every known phrase inside the financial text into one underscore-joined token
df_phrase['text'] = df_phrase['text'].apply(
    lambda sentence: reduce_phrases(split_sentence(sentence), phrases)
)

# use the same underscore form for the dictionary entries, then build the word -> type lookup
df_domain_dict['word'] = df_domain_dict['word'].apply(lambda entry: entry.replace(' ', '_'))
domain_dict = df_domain_dict.set_index('word')['type'].to_dict()

In [40]:
def get_tags(trials, domain_dict, return_tags=True, return_words=False):
    """Map tokens to dictionary categories and pair indicators with a direction.

    Every item of ``trials`` whose first element is a key of ``domain_dict``
    is kept as a (word, category) pair; all other items are ignored. Among
    the kept pairs, an indicator ('lagging', 'lagging-rev' or 'leading')
    that is directly followed by a direction ('up' or 'down') is merged with
    it into a single pair, with both the words and the categories joined by ':'.

    Parameters
    ----------
    trials : sequence
        Items whose first element (``item[0]``) is the token to look up.
    domain_dict : dict
        Maps a token to its category.
    return_tags : bool
        Return the list of categories.
    return_words : bool
        Return the list of (possibly merged) words.

    Returns
    -------
    ``(tags, words)`` when both flags are true, ``tags`` when only
    ``return_tags`` is true, otherwise ``words``.
    """
    indicator_types = ('lagging', 'lagging-rev', 'leading')
    direction_types = ('up', 'down')

    # keep only the tokens known to the dictionary, together with their category
    matched = [(trial[0], domain_dict[trial[0]]) for trial in trials if trial[0] in domain_dict]

    # group leading/lagging indicators with the direction that follows them
    merged = []
    skip_next = False
    for pos, (word, category) in enumerate(matched):
        if skip_next:
            # this direction entry was already absorbed by the indicator before it
            skip_next = False
            continue

        has_next = pos + 1 < len(matched)
        if has_next and category in indicator_types and matched[pos + 1][1] in direction_types:
            next_word, next_category = matched[pos + 1]
            merged.append((word + ':' + next_word, category + ':' + next_category))
            skip_next = True
        else:
            merged.append((word, category))

    # separate the category and word lists
    tags = [category for _, category in merged]
    words = [word for word, _ in merged]

    if not return_tags:
        return words
    if return_words:
        return tags, words
    return tags


In [41]:
def get_tags_df(text, grammar, use_grammar=False, return_tags=True, return_words=False):
    """Tag one sentence and return the result as a comma-separated string.

    The sentence is split and POS-tagged. With ``use_grammar=True`` the tagged
    tokens are additionally passed through ``get_regex`` together with
    ``grammar``; otherwise ``grammar`` is not used. The outcome is handed to
    ``get_tags`` and formatted with ``list_to_comma_sep_string``.

    The lookup dictionary is not a parameter: ``domain_dict`` is read from the
    notebook's global scope, so it must be defined before this is called.

    NOTE(review): when ``return_tags`` and ``return_words`` are both true,
    ``get_tags`` returns a ``(tags, words)`` tuple, which is forwarded
    unchanged to ``list_to_comma_sep_string`` -- confirm that is supported.
    """
    tagged_tokens = pos_tagging(split_sentence(text))
    if use_grammar:
        tagged_tokens = get_regex(tagged_tokens, grammar)

    selection = get_tags(tagged_tokens, domain_dict, return_tags, return_words)
    return list_to_comma_sep_string(selection)

In [42]:
# Chunking grammar handed to get_tags_df. Both calls below leave use_grammar at
# its default (False), so get_tags_df never passes this string to get_regex here.
# NOTE(review): the text contains non-ASCII '∗' characters and spaces inside
# tag names (e.g. 'V B', 'NP P', 'T O'), which looks like copy/extraction
# damage -- verify and repair it before enabling use_grammar=True.
grammar = """JJ : {<JJ.∗> ∗}
        V B : {< V B.∗ >}
        NP : {(< NNS|NN >)∗}
        NP P : {< NNP|NNP S >}
        RB : {< RB.∗ >}
        NP JJ : {(((< NP|NP P > + < IN >< .∗ > ∗ <, >)|(< JJ > ∗ < NP|NP P > +< V B >< NP P > ∗ < .∗ > ∗ <, >))(< RB > | < DT >< NP > | < V B >< T O >))|< JJ > ∗ < NP > (< IN >< DT > ∗ < JJ > ∗ < NP P > ∗ < NP > ∗)∗ < V B > |< JJ > + < NP >< V B > |< NP|NP P > +(< (>< .∗ > ∗ <) >) ∗ ((< IN >< JJ > ∗ < NP >)(< IN >< CD >)∗)∗ < V B > +|< NP >< .∗ > +(< RB > | < JJ >)|< NP|NP P > +(< IN >< NP|NP P >)∗ < .∗ > ∗ < V B >(< DT >< JJ >) ∗ |< NP >< V B > (< RB > |(< T O >< DT >< NP >))|< V B >< NP|NP P > ∗ < P OS >< JJ > ∗ < NP > |< V B >< P RP.∗ >< JJ > ∗ < NP >< IN > |< V B >< T O > ∗ < DT > ∗ < JJ > ∗ < NP >< IN > ∗ < NP > ∗|< NP|NP P > +(< IN >< DT > ∗ < RB > ∗ < JJ > ∗ < NP|NP P >) ∗< RB > ∗(< V B >< JJ >< NP >)∗ < V B >(< DT >< CD >< NP >) ∗ |(< JJ >)∗ < NP|NP P > + < .∗ > ∗(<, >< .∗ > ∗ <, >)∗ < NP >}"""

# 'tags' holds the comma-separated categories of each sentence; 'taged_words'
# holds the matching words. Each sentence is split and POS-tagged twice, once per call.
df_phrase['tags'] = df_phrase.text.apply(lambda x: get_tags_df(x, grammar))
df_phrase['taged_words'] = df_phrase.text.apply(lambda x: get_tags_df(x, grammar, return_tags=False, return_words=True))

In [43]:
df_phrase.head()

Unnamed: 0,sentiment,text,confidence,tags,taged_words
0,neutral,"according to gran , the company has no plans t...",1.0,leading,production
1,positive,"for the last quarter of 2010 , componenta s ne...",1.0,"lagging:up, lagging, neg","net_sales:doubled, profit, loss"
2,positive,"in the third quarter of 2010 , net_sales incre...",1.0,"lagging:up, lagging","net_sales:increased, operating_profit"
3,positive,operating_profit rose to eur 13.1 mn from eur ...,1.0,"lagging:up, lagging","operating_profit:rose, net_sales"
4,positive,"operating_profit totalled eur 21.1 mn , up fro...",1.0,"lagging:up, lagging","operating_profit:up, net_sales"


## Count Vectorise Tags

This is similar to the bag-of-words model, except that the occurrences of each tag are counted rather than the occurrences of each word. Since there are only 10 unique tags, the resulting feature matrix is n×10, where n is the number of examples.

In [44]:
# Build the tag vocabulary from the rows that carry exactly one tag. Tags are
# numbered in order of first appearance; that number is the feature column index.
# NOTE(review): a tag that never occurs on its own is missing from this
# vocabulary and is therefore ignored by the vectorizer -- confirm this is intended.
my_vocabulary = df_phrase.loc[df_phrase['tags'].apply(lambda x: len(x.split())) == 1, 'tags']
my_vocabulary = my_vocabulary.unique()
my_vocabulary_dict = {vocab: i for i, vocab in enumerate(my_vocabulary)}

# Passing the vocabulary to the constructor fixes the column order up front,
# so no throw-away fit followed by overwriting vocabulary_ is needed.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern='[a-zA-Z0-9$&+,:;=?@#|<>.^*()%!-]+',
    vocabulary=my_vocabulary_dict,
)

# commas are stripped so that every whitespace-separated token is exactly one tag
tf1 = vectorizer.transform(df_phrase['tags'].apply(lambda s: s.replace(',', '')))

# Column i counts the tag with vocabulary index i, which is the order of
# my_vocabulary. Deriving the names from it avoids get_feature_names(), which
# no longer exists in current scikit-learn releases.
column_names = ['count_' + vocab for vocab in my_vocabulary]

# Reuse df_phrase's index so the index-based merge below pairs each count row
# with the sentence it was computed from.
tf1 = pd.DataFrame(tf1.toarray(), columns=column_names, index=df_phrase.index)

# Drop count columns left over from an earlier run (or already present in the
# loaded CSV) so the merge does not create '_x' / '_y' duplicates.
df_phrase = df_phrase.drop(columns=column_names, errors='ignore')
df_phrase = pd.merge(df_phrase, tf1, left_index=True, right_index=True)


In [45]:
df_phrase.head()

Unnamed: 0,sentiment,text,confidence,tags,taged_words,count_leading,count_lagging:up,count_up,count_leading:up,count_pos,count_lagging,count_neg,count_leading:down,count_lagging:down,count_down
0,neutral,"according to gran , the company has no plans t...",1.0,leading,production,1,0,0,0,0,0,0,0,0,0
1,positive,"for the last quarter of 2010 , componenta s ne...",1.0,"lagging:up, lagging, neg","net_sales:doubled, profit, loss",0,1,0,0,0,1,1,0,0,0
2,positive,"in the third quarter of 2010 , net_sales incre...",1.0,"lagging:up, lagging","net_sales:increased, operating_profit",0,1,0,0,0,1,0,0,0,0
3,positive,operating_profit rose to eur 13.1 mn from eur ...,1.0,"lagging:up, lagging","operating_profit:rose, net_sales",0,1,0,0,0,1,0,0,0,0
4,positive,"operating_profit totalled eur 21.1 mn , up fro...",1.0,"lagging:up, lagging","operating_profit:up, net_sales",0,1,0,0,0,1,0,0,0,0


## Save CSV with Features

In [46]:
# Write the phrase bank (now with the tag and count columns) and the dictionary
# (now with underscore-joined words) to CSV, without the index.
# NOTE(review): both targets are the same config paths the data was read from
# in the "Import Data" cell, so the original inputs are overwritten and a
# second Run All starts from already-transformed files -- consider writing to
# separate output paths.
df_phrase.to_csv(config.FINANCIAL_PHRASE_BANK, index=False)
df_domain_dict.to_csv(config.DOMAIN_DICTIONARY, index=False)