## create_scoringDict_fromTrainSet
score currency based on train set


In [1]:
# SYSTEM
from os import listdir
from PyCommonFun import 

#NLTK
import nltk, re, pprint
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
from nltk.metrics import *
from collections import Counter

#GENERAL
import numpy as np
import pandas as pd


### load labels and context dict.
the context dict is a dict. of dict. such as contextAllCurr = {'fileId': { ' curr' : Ngram } },
where Ngramm are part of speech centered around a currency. The contextAllCurr has been 
saved in a separate notebook (save_curencyContext_train(test)Set.ipynb)

In [2]:
# get the solution
labels=pd.read_csv('../../data_0/greg_train_set/lab_greg_train_set.csv')
# get context dict
contextAllCurr=np.load('../../data_0/greg_train_set/contextAllCurr.npy').item()

extract only currencies found in the documents, store results in foundCurrType = {'fileId': [curr]}. 

In [3]:
foundCurrType=dict()

for f in labels.fileId:
    
    foundCurrType[f]=[]
    
    for currKey in contextAllCurr[f]:
        if contextAllCurr[f][currKey] != []:
            foundCurrType[f].append(currKey)

Divide the foundCurrType in three dict:
 - single currency
 - multiple currency
 - void

In [4]:
voidCurr=dict()
singleCurr=dict()
multiCurr=dict()

for key in foundCurrType:
    if len(foundCurrType[key]) == 0:
        voidCurr[key]=foundCurrType[key]
    elif len(foundCurrType[key]) == 1:
        singleCurr[key]=foundCurrType[key]
    elif len(foundCurrType[key]) > 1:
        multiCurr[key]=foundCurrType[key]



## Create bag of word for each currency

I want to get an idea of the association between the operationnal currency and the surrounding 
words. Ideally, the additionnal currency found in the docs should show differents association.

To addess this point, I create two distinct bag of words (bow) associated to:
1) operationnal currency (consider opCurr dict)
2) additionnal  currency (consider multipleCurr dict)
and compare the bow

In [5]:
text_bow_multiCurr=[]
text_bow_OpCurr=[]

for key1 in multiCurr:
    for key2 in contextAllCurr[key1]:
        # do not consider operationnal currency
        if labels[labels['fileId']==key1].OpCurrency.values == key2:
            for val in contextAllCurr[key1][key2]:
                text_bow_OpCurr.append(' '.join(val))
        else:
            for val in contextAllCurr[key1][key2]:
                text_bow_multiCurr.append(' '.join(val))

In [6]:
# tokenize the text but remove ponctuations
tokenizer=RegexpTokenizer(r'\w+')
text_bow_OpCurr=tokenizer.tokenize(str(text_bow_OpCurr).lower())

tokenizer=RegexpTokenizer(r'\w+')
text_bow_multiCurr=tokenizer.tokenize(str(text_bow_multiCurr).lower())


In [7]:
# create a bag of word by counting number of occurence of each term

bow_OpCurr=Counter(text_bow_OpCurr)
len_text_OpCurr=len(text_bow_OpCurr)
# normalize the bow by dividing by length of text
for key, val in bow_OpCurr.iteritems():
    bow_OpCurr[key]=np.float(bow_OpCurr[key])/len_text_OpCurr

bow_multiCurr=Counter(text_bow_multiCurr)
len_text_multiCurr=len(text_bow_multiCurr)
# normalize the bow by dividing by length of text
for key, val in bow_multiCurr.iteritems():
    bow_multiCurr[key]=np.float(bow_multiCurr[key])/len_text_multiCurr
    

In [9]:
# use set method to find common key between the bow
key_OpCurr=set(bow_OpCurr)
key_multiCurr_set=set(bow_multiCurr)



In [10]:
# create a dictionnary with common keys, defined as the ratio of bow_OpCurr and bow_multiCurr, which is kind of weights
bowRatio_intersect_curr=dict()
for k in key_OpCurr.intersection(key_multiCurr_set):
    bowRatio_intersect_curr[k] = bow_OpCurr[k]/bow_multiCurr[k]

In [11]:
# check if it looks ok
for key,value in sorted(bowRatio_intersect_curr.iteritems(), key=lambda (k,v):(v,k), reverse=True):
    print '%s: %s' % (key,value)

isin: 91.4767501509
linked: 84.9767314424
ubs: 79.4746409173
der: 33.0844659022
cap: 31.8258177429
callable: 28.0498732649
000: 26.0564969929
tranche: 22.6556668678
warrants: 22.296053108
zero: 19.7787567894
size: 19.4191430296
avoidance: 18.69991551
whole: 17.8008811104
nominal: 17.4232866626
sd: 17.2614604707
laggard: 17.2614604707
005: 16.9018467109
quanto: 16.1826191913
renault: 15.8230054315
last: 14.923971032
issue: 14.3171228123
certificates: 13.7552263126
denomination: 13.3228335776
return: 13.3057091129
series: 13.2098121102
half: 12.946095353
doubt: 12.946095353
aggregate: 12.5864815932
reverse: 12.2268678334
convertible: 12.2268678334
certificate: 12.1069965802
issuance: 10.9682196741
place: 10.6086059143
minimum: 10.1590887145
fall: 10.0691852746
currencies: 10.009249648
coupon: 9.61149503484
due: 9.55545133201
peugeot: 9.52976463488
sum: 9.34995775498
warrant: 8.99034399517
only: 8.99034399517
fixed: 8.73861436331
up: 8.690665862
syndication: 8.63073023537
places: 8.630730

In [12]:
## save the weight

In [13]:
np.save('../../data_0/greg_train_set/weight_forCurrencyScoring.npy', bowRatio_intersect_curr) 