In [2]:
import numpy as np
from collections import defaultdict, Counter
import os.path
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import datetime
from operator import itemgetter

In [94]:
def weight_normalized_cnb(complement_probs_normalized, vectorized_text, prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param idf: dictionary where keys = words and values = (total # docs)/(# of docs in 
                which we see that word)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        conditional = 0.0
        for word in freq.keys():
            conditional += (freq[word] * complement_probs_normalized[label][word])
        labels.append((label, np.exp(conditional)))
    return sorted(labels, key=itemgetter(1))

In [80]:
def complement_naive_bayes(complement_probs, vectorized_text, prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    doc_denom = 0
    freq = Counter(vectorized_text)
    '''
    for word in freq.keys():
        for label in prior_probs.keys():
            doc_denom += (np.log(prior_probs[label]) + (freq[word]/len(vectorized_text) * complement_probs[label][word]))
    print(doc_denom)
    '''
    for label in prior_probs.keys():
        conditional = 0.0
        for word in freq.keys():
            conditional += (freq[word] * complement_probs[label][word])
        labels.append((label, conditional))
    return sorted(labels, key=itemgetter(1))

In [93]:
def multinomial_naive_bayes(conditional_probs, vectorized_text, prior_probs):
    '''
    :param conditional_probs: dictionary where keys = labels and values = dictionary where
                    keys = words and values = P(x|Y)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        conditional = 0.0
        for word in vectorized_text:
            if conditional_probs[label][word] != 0.0:
                conditional += (freq[word] * conditional_probs[label][word])
        labels.append((label, np.exp(conditional)))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [29]:
def bayes_accuracy_model(num, number_labels, labels):
    '''
    :param num: the number of the document being checked, so we can check
                the correct labels for it
    :param number_labels: dictionary where keys = number of sample and
                            values = the set of labels associated with
                            that sample
    :param labels: the set of labels computed by Naive Bayes
    '''
    sample_labels = number_labels[num]
    successes = 0
    earned = 0
    bottom_5_times = 0
    bottom_5 = ['rye', 'groundnut-oil', 'cotton-oil', 'castor-oil', 'nkr', 'sun-meal']
    computed_labels = [x for x,y in labels]
    if "earn" in computed_labels[:3]:
        earned += 1
    computed_labels_trim = computed_labels[:len(sample_labels)]
    for label in bottom_5:
        if label in computed_labels[:5]:
            bottom_5_times += 1
            break
    if all(x in computed_labels_trim for x in sample_labels):
        successes += 1
    else:
        print(num)
        print(sample_labels, labels[:10])
        diff = set(sample_labels).difference(set(computed_labels_trim))
        score = len(computed_labels_trim) - len(diff)
        score /= len(computed_labels_trim)
        if len(diff) < len(computed_labels_trim):
            successes += score
    return [successes,earned, bottom_5_times]

In [132]:
def compute_precision_recall(computed_label_set, number_labels, label_list):
    '''
    '''
    precision = {label: 0.0 for label in label_list}
    recall = {label: 0.0 for label in label_list}
    precision_denom = {label: 0.0 for label in label_list}
    recall_denom = {label: 0.0 for label in label_list}
    for num in computed_label_set.keys():
        computed_labels = computed_label_set[num]
        actual_labels = number_labels[num] 
        computed_labels = computed_labels[:len(actual_labels)]
        for l in computed_labels:  # Positive prediction
            precision_denom[l] += 1
            if l in actual_labels: # True positive
                precision[l] += 1
                recall[l] += 1
                recall_denom[l] += 1                
            diff = set(actual_labels).difference(set(computed_labels))
            for label in diff:
                recall_denom[label] += 1
    total_precision = sum([v for v in precision.values()])
    total_precision_denom = sum([v for v in precision_denom.values()])
    total_precision /= total_precision_denom
    total_recall = sum([v for v in recall.values()])
    total_recall_denom = sum([v for v in recall_denom.values()])
    total_recall /= total_recall_denom
    return [total_precision, total_recall]

In [7]:
def vectorize_text(valid_words, filepath):
    '''
    This function removes non valid words from the text to put it into
    the Naive Bayes classifier
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :param valid_words: dictionary where keys = valid words in the corpus
    :param filepath: path to the text file
    :return: a vector of text stripped of stop words and non-valid words
    '''
    with open(filepath, "r") as f:
        content = f.read()
        words = nltk.word_tokenize(content)
        words = [word.lower() for word in words]
        new_words = [word.lower() for word in words if word in valid_words]
    return new_words

In [9]:
def get_valid_words(dir_path, stop_words):
    '''
    Utility function that determines the set of valid words 
    to be used for classification and probability calculation
    :param dir_path: a path to the directory containing 
                    all the training samples
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :return: a Python dictionary where the keys = valid words and the 
            values = True, so we can use "key in dict" for future access
            in guaranteed constant time
    '''
    valid_words = defaultdict(bool)
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word.isalpha()]
            new_words = [word.lower() for word in new_words if word not in stop_words]
            new_words = set(new_words)
            for word in new_words:
                valid_words[word] = True
    return valid_words
            

In [10]:
def add_labels_to_samples(filename):
    '''
    This function iterates over the file containing all 
    labels for each numbered sample, and maps them together with
    a dictionary
    :param filename: path to the file with all the labels in it (assumes
                    the file is located in this directory)
    :return: a dictionary with keys = number of the training sample and
            values = the set of labels associated with it
            AND
            the same, but with the test samples. Keep them separate for easy
            access later
    '''
    number_labels_training = defaultdict(list)
    number_labels_test = defaultdict(list)
    with open(filename, "r") as f:
        for line in f:
            terms = line.split()
            if line[0:4] == "test":
                num = int(terms[0][5:len(terms[0])])  # Test number, so we can map this back to the proper label(s) later on
                number_labels_test[num] = terms[1:]
            else:
                num = int(terms[0][9:len(terms[0])])  
                number_labels_training[num] = terms[1:]
    return [number_labels_training, number_labels_test]

In [11]:
def compute_prior_probabilities(number_labels):
    '''
    This function will compute the prior probabilities
    P(y) = probability of seeing a label with a sample. 
    Note: since many samples have multiple labels, these prior
    probabilites will sum to > 1
    :param number_labels: dictionary where keys = number of training sample
                            and value = the list of labels associated with it
    :return: a dictionary where keys = the label and value = probability of seeing
            that label in the document list
    '''
    prior_probs = defaultdict(float)
    i = 0
    for num, labels in number_labels.items():
        for l in labels:
            if not prior_probs[l]:
                prior_probs[l] = 1
            else:
                prior_probs[l] += 1
        i += 1
    for label, freq in prior_probs.items():
        prior_probs[label] /= i
    return prior_probs

In [12]:
def rename_files(dir_path):
    '''
    Utility function designed to rename all files in any directory
    to a .txt file so they can be read from
    :param dir_path: directory of the files to be renamed
    '''
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        os.rename(filepath, filepath+".txt")

In [47]:
def get_parameters(dir_path, valid_words, number_labels, label_list):
    '''
    This function will iterate over the documents and compute the frequencies of 
    the words by label and in total
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param number_labels: dictionary where keys = document # and values = the set of 
                            labels associated with those labels
    :param label_list: list of all the unique labels
    :return: a dictionary where keys = labels and values = dictionary where keys 
            = words and values = the frequencies of that word in documents with that label
            AND 
            a dictionary where keys = words and values = the total # of occurrences of
            that word
            AND
            a dictionary where keys = words and values = the idf score for that word
            AND 
            a dictionary where keys = labels and values = the total # of words associated with that label
            AND
            the total # of valid words in the entire corpus
    '''
    words_by_doc_num = defaultdict()
    idf = {word: 0.0 for word in valid_words}
    total_num_words = 0
    total_word_count_by_label = {label: 0 for label in label_list}    
    i = 0
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            num = int(file[0:len(file) - 4])
            labels = number_labels[num]
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word in valid_words]
            total_num_words += len(new_words)
            freq = Counter(new_words)
            words_by_doc_num[num] = freq
            for word in freq.keys():
                idf[word] += 1
            for l in labels:
                total_word_count_by_label[l] += len(new_words)
            i += 1
    for word in idf.keys():
        idf[word] = 1 + np.log(i/(idf[word]+1))      
    frequencies = {label: {word: 0.0 for word in valid_words} for label in label_list} 
    total_frequencies = {word: 0 for word in valid_words}
    for num in words_by_doc_num.keys():
        freq = words_by_doc_num[num]
        labels = number_labels[num]
        normalization_term = np.sqrt(sum([score**2 for word, score in freq.items()]))
        for l in labels:
            total_word_count_by_label[l] += len(new_words)
            for word in freq.keys():
                term_to_add = freq[word] * idf[word]
                frequencies[l][word] += (term_to_add/normalization_term)
                total_frequencies[word] += (term_to_add/normalization_term)
    return [frequencies, total_frequencies, idf, total_word_count_by_label, total_num_words]

In [48]:
if __name__ == '__main__':
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\training"
    stop_words = set(stopwords.words('english'))
    valid_words = get_valid_words(dir_path, stop_words)
    number_labels_training, number_labels_test = add_labels_to_samples("cats.txt")
    prior_probs = compute_prior_probabilities(number_labels_training)
    
    parameters = get_parameters(dir_path, valid_words, number_labels_training, prior_probs.keys())
    
    frequencies = parameters[0]
    total_frequencies = parameters[1]
    idf = parameters[2] 
    total_word_count_by_label = parameters[3]
    total_num_words = parameters[4]

In [49]:
    for label, vector in frequencies.items():
        if label != "earn":
            continue
        print("Label:", label)
        for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):
            if score == 0.0:
                continue
            print(word, score)

Label: earn
vs 2037.6218731693598
cts 1430.526394079223
mln 1047.3317024944583
loss 962.2201550638756
net 943.4418504559922
shr 663.6889908910532
dlrs 609.6416337025148
profit 563.4392492665461
lt 516.7765508447939
revs 458.1045001528319
qtr 446.4594937985735
year 406.22387841452996
record 404.9552973796853
dividend 390.8395489141975
div 389.78991772788453
april 377.35158581904386
qtly 356.53377491998936
prior 332.5941087989328
inc 330.7037136432611
pay 309.68261673485665
oper 307.8842051114876
said 290.6300507541069
share 283.9030703337707
note 276.31929787885434
corp 245.88299048178584
march 241.50722426714609
company 238.27249305973777
sales 229.48955397943558
sets 222.1820417431966
billion 215.32308221418893
split 213.68432094165172
stock 211.2996704426567
quarter 210.2625680725256
avg 207.88206945700472
shrs 204.5119132211401
quarterly 193.97657300565496
earnings 185.06668798865002
payout 184.71297397158258
may 175.7760158161672
co 158.7100352198403
payable 156.21357215242935
nine

publishing 5.152554731615607
hovnanian 5.147112343889869
television 5.141794709506208
crk 5.134656367886195
ability 5.122653419704595
discount 5.120633094984558
works 5.116958089701182
recommend 5.114769823680874
announcement 5.109807399625698
paying 5.106972291197753
connecticut 5.102411339889642
surplus 5.100297348896633
core 5.10025957212652
film 5.097543329739635
independent 5.094565499856753
jack 5.086475074678541
usa 5.080194128819507
lloyds 5.077004037762447
dropped 5.060531409605966
shield 5.060409171309356
weis 5.059970976469926
moratorium 5.057421462146039
greater 5.056389749993241
double 5.0408409206049125
roy 5.033558194814771
morris 5.033436525473196
suspension 5.03026513217626
tultex 5.021919363746725
ttx 5.021919363746725
charged 5.016778382729387
truck 5.015224776460265
newmont 5.01427312302546
factors 5.009943517367102
qtrs 5.008542570710768
vogtle 5.007771560997586
maker 5.0058249708268345
virginia 5.001843429298794
european 4.993688305400095
official 4.99290266686861

faced 3.096182796374492
enabled 3.0957525913873334
icn 3.0922405155217225
marketable 3.0902854385464833
sptn 3.089443943231633
worth 3.088509372518626
rockefeller 3.0882498514634915
dasa 3.0882498514634915
thl 3.0882498514634915
porex 3.0882498514634915
enterprise 3.0877033211366665
holly 3.0845951751591842
amgd 3.0844069397071943
turner 3.0836555501725735
main 3.0813057069992675
adverse 3.0786431381905204
satellite 3.0765382938944845
listing 3.073988213474175
issuance 3.0729361740626584
bfg 3.071404262979949
regency 3.0647845115159793
summit 3.063531047588566
workforce 3.062785592757696
eliminate 3.0615525279712537
gkn 3.0614603260736275
placing 3.059666079703801
reynolds 3.0573888275117604
mine 3.055875478879945
precision 3.0553674047032775
notel 3.055328676330573
break 3.053858638352654
mitchell 3.0527409303311632
pace 3.052128509588976
chapman 3.0504401361569635
usf 3.04941928652758
medco 3.046230888759109
uac 3.040618569517909
nacco 3.037031543174979
ltv 3.037031543174979
preussag

ltc 2.2521472378124345
actual 2.251599571469799
sar 2.250736038357797
merrill 2.2505987406466144
mobay 2.2504041975796003
industri 2.2503687674953956
guinness 2.2495822334976525
regional 2.2484399908395094
conglomerate 2.247639856511437
srv 2.247448196926439
ctyn 2.247031823979181
bangor 2.247031823979181
cobk 2.247031823979181
sanwa 2.2434104402040593
aec 2.2427225450645745
universal 2.2405471287590033
clothier 2.240412896248327
strw 2.240412896248327
carbide 2.237847136812439
opening 2.2364740203247546
sng 2.2364738932314916
tol 2.23511143153379
night 2.233223111522766
pence 2.2325021222971007
via 2.2308818127866488
insua 2.2304248479588553
privatised 2.2284523150659616
financiere 2.2284523150659616
dds 2.228025943015306
frozen 2.2276679144176503
agreements 2.2254627253984864
debs 2.2247529385544995
bmd 2.2246547778410517
pegasus 2.2236588036969622
pgulf 2.2236588036969622
malaysia 2.222900500165421
valuation 2.2198747660990406
lawrence 2.2195521946659746
spectrum 2.217090056135689
f

stahl 1.8055190020056842
models 1.8052319421757999
planters 1.8043742580412256
bowling 1.8040038072865583
approves 1.7991585081696955
ahlsell 1.7981525343804075
gosper 1.797625459183345
lner 1.7974899701281315
story 1.7966438923685375
upper 1.7949343098865622
widdrington 1.7941110365468447
sb 1.791534664087792
nch 1.7913142912197646
herley 1.7913142912197646
margin 1.7910457135599556
alternative 1.7908598460485095
status 1.7898996743239652
seidman 1.7896425089692203
transmission 1.7894742885345498
crown 1.7880112825715155
mas 1.787302646509504
healthvest 1.787302646509504
gec 1.787302646509504
chemie 1.787302646509504
infg 1.786938684699762
wendy 1.7868248376655942
linde 1.7859918354366902
mile 1.7846199440558967
sure 1.7835616768847076
crain 1.7830018830672687
tcat 1.7830018830672687
tca 1.7830018830672687
sbos 1.7830018830672687
rmedu 1.7830018830672687
eavn 1.7830018830672687
eaton 1.7830018830672687
vance 1.7830018830672687
interchange 1.7830018830672687
internchange 1.783001883067

canterbury 1.366012797584543
thoratec 1.366012797584543
hmdy 1.366012797584543
hemodynamics 1.366012797584543
atlantis 1.366012797584543
pie 1.3653927863654847
suspend 1.363204143449039
abroad 1.3625029466010763
conform 1.3619603965596028
kren 1.361756526471181
kings 1.361756526471181
qant 1.361314979369301
borrower 1.3608651166826542
danish 1.3603622452559152
bookings 1.3590954639160153
penhale 1.3586487935467317
tarxien 1.3586487935467317
repal 1.358397411695577
christians 1.357993953425742
discovered 1.3568995581511945
develop 1.3566688825878286
ansy 1.3564350791276776
lebanon 1.35429885444025
pool 1.3542729103794104
hospitality 1.3536795016127274
nav 1.353296202763624
sport 1.3530028554649187
daily 1.3517468633862382
vfed 1.351402614981289
cruists 1.351402614981289
biotechnologies 1.351402614981289
ipc 1.351402614981289
tier 1.351402614981289
tierco 1.351402614981289
shfxf 1.351402614981289
shadowfax 1.351402614981289
optkf 1.351402614981289
optrotech 1.351402614981289
onyx 1.35140

wht 1.0843569163307007
anwi 1.0843569163307007
ald 1.0840750596799216
castonguay 1.0825048963344703
format 1.0823338581967672
svenska 1.0823338581967672
usually 1.0815057372622976
strengthening 1.0813979260044302
feel 1.0800985789213247
comprise 1.0790672146674885
posts 1.0786660208179546
ballan 1.0778147334418706
bakery 1.07770743675048
carme 1.0770052592507446
journals 1.0770052592507446
ambj 1.0770052592507446
jcbs 1.0770052592507446
payble 1.0770052592507446
tsbk 1.0770052592507446
ccom 1.0770052592507446
assumptions 1.0760229903930014
word 1.075675926415566
figarella 1.0751905902965933
sidor 1.0751905902965933
sucre 1.0751905902965933
guayana 1.0751905902965933
marcom 1.0751905902965933
apart 1.0749261170398299
feet 1.0747731859808218
bei 1.0743460452768716
beih 1.0743460452768716
vira 1.0743460452768716
aiming 1.0725910452401557
downsizing 1.0716249916764
damaged 1.0712643433560927
quota 1.0712547949370772
limiting 1.0712511326957952
translation 1.071178521850482
safeway 1.071155

plush 0.9030725294250572
ruxpin 0.9030725294250572
teddy 0.9030725294250572
switch 0.9025646605159454
reichert 0.9020019036432791
attractive 0.9019358279912004
invests 0.9014189170172189
panin 0.9012902649497396
finding 0.9008487915169181
achieving 0.9008382591674983
regarding 0.9008067071078772
participating 0.9006906896524564
pertamina 0.9004201180993411
rsto 0.8998720260460231
maar 0.8998720260460231
butl 0.8998720260460231
sssl 0.8998720260460231
earings 0.8998720260460231
iclb 0.8998720260460231
seeks 0.8995869491892032
nza 0.8995240488360663
inch 0.8992943758098104
layoffs 0.8992943758098104
disruption 0.8989577013205623
midmorning 0.8985480794896145
narrowing 0.8972071071200766
facts 0.8965146281211653
nickel 0.8964003217741361
lyons 0.8960665445017696
covt 0.8956571456098823
covington 0.8956571456098823
ael 0.8956571456098823
aelna 0.8956571456098823
elisra 0.8956571456098823
mrll 0.8956571456098823
cefadroxil 0.8956571456098823
applicability 0.8956571456098823
hrly 0.895657145

firmer 0.7311114488881253
tsang 0.7309665179960663
syms 0.7301645535014204
sym 0.7301645535014204
inludes 0.7301645535014204
esquire 0.7301645535014204
ee 0.7301645535014204
btek 0.7301645535014204
baltek 0.7301645535014204
goals 0.728959541979845
mm 0.7286550224337429
liquefied 0.7284370410707002
offshoot 0.7283624168995554
snowstorms 0.7282286956094576
stanely 0.7282286956094576
homeclubs 0.7279074706560611
maxx 0.7279074706560611
galley 0.7279074706560611
seag 0.7279074706560611
raut 0.7279074706560611
avalon 0.7270338243101615
stop 0.7257498511611156
dahlberg 0.7256711904997611
dahl 0.7256711904997611
idel 0.7256711904997611
disposing 0.7256711904997611
npr 0.7256711904997611
encourage 0.7252497264741375
transfers 0.7249846533848211
cominco 0.7245817069408715
describes 0.7244326216083925
approach 0.7243755099596805
denys 0.7242309759343766
library 0.7239031875165497
msm 0.7234553954324736
mnln 0.7234553954324736
mott 0.7234553954324736
jail 0.723357545985569
investigations 0.723151

coverted 0.591903137089371
vhcl 0.591903137089371
vhc 0.591903137089371
preway 0.591903137089371
prew 0.591903137089371
conversative 0.591903137089371
formulating 0.591903137089371
operationally 0.591903137089371
withstood 0.591903137089371
courses 0.5918065550067843
confined 0.5918031948147902
proceed 0.5907012261391373
upri 0.5906988568243307
computing 0.5906189630854874
panels 0.5906189630854874
declaration 0.5897688862832932
hiring 0.5895599873296602
shulansky 0.5895018974245286
buildings 0.5890721419573155
converts 0.5890529689969529
le 0.5888633888124171
abnormally 0.5887000680066415
emphasises 0.5887000680066415
aseay 0.5883121850162744
vasteras 0.5883121850162744
denied 0.5883006533782076
goldfield 0.588011361109262
enzymes 0.588011361109262
outstripped 0.5873955129804427
evaluation 0.5873892414355925
nwp 0.5873165171679549
tricil 0.5870423771587501
star 0.5866779425290831
scrap 0.586594793543588
peat 0.586294487431772
hnh 0.585954210858079
ktii 0.585954210858079
statil 0.58595

crystal 0.4749272290066591
dealings 0.4748124961277361
earthquake 0.47475124562000137
plywood 0.47464708739906686
fundamental 0.47447567529689544
conservatively 0.474160598597246
located 0.47394933747811685
hydroelectric 0.4738473444764737
destinations 0.4734894922923126
thinking 0.47341636579446283
addressed 0.47339766874087424
drilled 0.47338591511332856
speed 0.47329552707814704
dothan 0.47278977089446855
repeals 0.47278977089446855
charterers 0.4725677361108561
couple 0.4723180825755412
happened 0.4723180825755412
documents 0.4719248125098996
lowering 0.47191545007233554
reimburse 0.47129332858206524
elkins 0.4710071417562315
bonanza 0.4706994776995989
treat 0.47042779574469057
passive 0.47042779574469057
enforcement 0.47040197697779124
resturcturing 0.47034639654868876
annal 0.47034639654868876
sigi 0.47034639654868876
dpcz 0.47034639654868876
immunological 0.47034639654868876
highs 0.4699377518422163
arranges 0.4697414491645205
lubrication 0.4697414491645205
creditor 0.4696067997

standstill 0.32885971706321054
rothschild 0.3283323198844573
kept 0.3282455313414655
fried 0.32789682828626343
easing 0.3274223796994832
event 0.3269930610281955
insignificant 0.3268533971937497
supportive 0.32681574961631893
operationgs 0.32653951527799163
bind 0.32631532552731246
feared 0.32621684284302865
adviser 0.3262146811284643
awaiting 0.32559738569576585
aggregating 0.32545394623555646
vegetable 0.3253370552825315
strauss 0.3253293584951468
preopening 0.3253293584951468
finanziaria 0.32513941061983204
agricola 0.32513941061983204
sawmill 0.32508879158320153
parity 0.3249057494968604
altered 0.32413028298493474
aids 0.323785183362469
rejecting 0.32330047032240605
oilpatch 0.32252910249633354
flowing 0.3225065973317423
experiencing 0.32228867823444896
capture 0.3220487908337124
uses 0.32197698830005694
slide 0.32185571267528046
mined 0.3217750308601311
informed 0.32175710514246436
hedged 0.321686504848259
telesis 0.32136889049966466
prompting 0.32136889049966466
aachener 0.32119

tidal 0.20910938951486596
albn 0.20910938951486596
absorption 0.20910938951486596
bureaucracy 0.20890741315392355
privatization 0.20890741315392355
medicine 0.20890741315392355
cochran 0.20890741315392355
verge 0.20890741315392355
unloading 0.20890741315392355
enjoy 0.2086350609799734
anyone 0.20804941250121473
wayne 0.20804941250121473
foundation 0.20724186629322683
imbalance 0.20715405617246918
taxation 0.2056960786630359
brent 0.2056960786630359
eventual 0.2056960786630359
arranged 0.2056564468036998
ore 0.20533683881526835
bears 0.20530938758251105
reiterate 0.20530938758251105
liked 0.20530938758251105
scrambling 0.20530938758251105
unrestricted 0.20522562837160319
reverses 0.20522562837160319
loading 0.20515644505165362
dennis 0.20493907033770897
overallotment 0.20470561585322258
categories 0.204548608684093
portfolios 0.2043450950540493
decades 0.2043450950540493
storm 0.2043450950540493
enhancement 0.20413250679235287
lesley 0.20356712760159623
shun 0.2028725552557665
impressiv

In [50]:
    conditional_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    complement_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    for label, vector in conditional_probs.items():
        denom = total_num_words - total_word_count_by_label[label] + len(valid_words.keys())
        for word in vector.keys():
            mod_cond_freq = frequencies[label][word] + 1
            mod_comp_freq = (total_frequencies[word] - frequencies[label][word]) + 1
            conditional_probs[label][word] = np.log(mod_cond_freq/(total_word_count_by_label[label] + len(valid_words.keys())))
            complement_probs[label][word] = np.log(mod_comp_freq/denom)

In [76]:
    complement_probs_normalized = {label: {word: complement_probs[label][word] for word in valid_words} 
                                   for label in prior_probs.keys()}
    conditional_probs_normalized = {label :{word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    normalize_terms = {label: 0.0 for label in prior_probs.items()}
    for label, vector in complement_probs.items():
        normalize_term_1 = np.sqrt(sum([(complement_probs_normalized[label][word]**2) for word in valid_words]))
        normalize_term_2 = np.sqrt(sum([(conditional_probs[label][word]**2) for word in valid_words]))
        normalize_terms[label] = normalize_term_1
        for word in vector.keys():
            complement_probs_normalized[label][word] /= normalize_term_1
            conditional_probs_normalized[label][word] = conditional_probs[label][word] / normalize_term_2

In [136]:
    # Removing the stemmer actually improves accuracy on test set, who knew
    successes, earned, bottom_5,i = 0, 0, 0, 0
    computed_label_set = defaultdict(list)
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\test"
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        num = int(file[0:len(file) - 4])
        text = vectorize_text(valid_words, filepath)
        # computed_labels = complement_naive_bayes(complement_probs, text, prior_probs)
        # computed_labels = multinomial_naive_bayes(conditional_probs_normalized, text, prior_probs)
        computed_labels = weight_normalized_cnb(complement_probs_normalized, text, prior_probs)
        suc, e, b5 = bayes_accuracy_model(num, number_labels_test, computed_labels)
        computed_label_set[num] = [x for x,y in computed_labels]
        # MNB with doc length normalization, IDF: 86.10% accuracy (2599.288708513709), 1548 "Earn" labels
        # CNB with doc length normalization, IDF: 90.02% accuracy(2717.798340548341), 1130 "Earn" labels
        # However, this approach results in conditional terms that don't make much sense for precision or recall
        # WCNB with doc length normalization, IDF: 87.72% accuracy (2648.141955266955), 1542 "Earn" labels
        
        # Perhaps the reason that TF doesn't lead to improvements with this is because we already stripped out the 
        # stop words, which would be affected the most by this technique

        successes += suc
        earned += e
        bottom_5 += b5
        i += 1
    print(successes, earned, bottom_5, i)

14829
['nat-gas', 'crude'] [('crude', 0.6210121817692901), ('trade', 0.6210530420127806), ('nat-gas', 0.6215849787358416), ('money-fx', 0.6217348198951833), ('dlr', 0.6219940530805214), ('cpi', 0.6220860500373541), ('yen', 0.6222165474537633), ('gnp', 0.6222188895696744), ('interest', 0.6222826181038662), ('gas', 0.6223003801093293)]
14832
['rubber', 'tin', 'sugar', 'corn', 'rice', 'grain', 'trade'] [('trade', 0.6509115883336161), ('bop', 0.6513961306418461), ('sugar', 0.6517336550479357), ('money-supply', 0.6518087349522872), ('rubber', 0.6518568813233966), ('gnp', 0.6518950912290826), ('tin', 0.651895239841607), ('cpi', 0.6519071154038277), ('grain', 0.6520457092374992), ('interest', 0.6520819695055934)]
14840
['rubber', 'coffee', 'lumber', 'palm-oil', 'veg-oil'] [('coffee', 0.3290736314366406), ('rubber', 0.32936218621472213), ('money-fx', 0.3300284407702936), ('veg-oil', 0.33017325017477256), ('trade', 0.33034522116811377), ('palm-oil', 0.33040325367612694), ('grain', 0.33078255573

15206
['rice'] [('trade', 0.481362713869462), ('grain', 0.48381220152641113), ('wheat', 0.4844575947321226), ('money-fx', 0.4845175303849048), ('oilseed', 0.4848739352776426), ('livestock', 0.4849298356114987), ('corn', 0.4849459832078156), ('carcass', 0.484984487510232), ('interest', 0.48499633525949376), ('soybean', 0.48502853791677836)]
15212
['interest', 'dlr', 'money-fx'] [('interest', 0.6082751864562633), ('money-fx', 0.608459266724183), ('gnp', 0.6104745033944893), ('dlr', 0.6106321854547071), ('money-supply', 0.6106690941973976), ('cpi', 0.6109101737040632), ('trade', 0.6110221782762595), ('yen', 0.6111478947672938), ('bop', 0.6112068714224437), ('ipi', 0.6112188682754026)]
15237
['livestock'] [('grain', 0.7282398906410188), ('carcass', 0.7288623740640018), ('livestock', 0.7289321349229613), ('trade', 0.7290678465056354), ('wheat', 0.7290739108839506), ('corn', 0.7291904763255733), ('oilseed', 0.7292264821431328), ('ship', 0.7292291699658795), ('soybean', 0.7293066906092931), (

15562
['dlr'] [('money-fx', 0.7821893534904837), ('dlr', 0.7851686806313695), ('interest', 0.7852921823030347), ('trade', 0.7856115861112511), ('reserves', 0.7856296244182163), ('yen', 0.7856695967242932), ('gnp', 0.7857676240816893), ('coffee', 0.7858172365810462), ('ipi', 0.7858776423220052), ('cocoa', 0.7859052999054805)]
15563
['lead'] [('copper', 0.8889475843311591), ('earn', 0.8894164591635506), ('interest', 0.8906522133010053), ('veg-oil', 0.8907443161032046), ('zinc', 0.8907475238878872), ('lead', 0.890751171171555), ('silver', 0.8907932552495094), ('palm-oil', 0.8908102260099986), ('alum', 0.8908245905301084), ('corn', 0.8908305872244204)]
15567
['sorghum', 'wheat', 'rice', 'grain'] [('grain', 0.471737148026618), ('wheat', 0.47318488019530747), ('corn', 0.4734783757051226), ('rice', 0.47395232106609075), ('oilseed', 0.47395890080801856), ('sugar', 0.4739824681965093), ('soybean', 0.47410673306833195), ('gnp', 0.4741361885665837), ('cotton', 0.47414645682672585), ('veg-oil', 0.

15842
['grain'] [('earn', 0.91327924160497), ('grain', 0.9153599686251607), ('corn', 0.9160243793796384), ('wheat', 0.9162795263942346), ('oilseed', 0.9164620789371347), ('soybean', 0.9165162915325527), ('livestock', 0.9166537394551236), ('barley', 0.9166565953480152), ('hog', 0.9166628326552165), ('sugar', 0.916667809561945)]
15844
['grain'] [('earn', 0.9286321445634521), ('grain', 0.9304373449451089), ('wheat', 0.9313467582297211), ('corn', 0.9313468797780533), ('oilseed', 0.9316841769052286), ('soybean', 0.9317497627893194), ('sugar', 0.9318367360053837), ('barley', 0.9318398517800978), ('sorghum', 0.9318566118622477), ('rice', 0.9318641911583307)]
15853
['meal-feed', 'soy-meal', 'oilseed', 'veg-oil', 'soy-oil', 'soybean', 'cotton', 'wheat', 'grain'] [('earn', 0.465022247237238), ('grain', 0.49166716183529746), ('oilseed', 0.4918991451456875), ('wheat', 0.49193214725719187), ('corn', 0.49204903873649414), ('soybean', 0.4920498611986865), ('sugar', 0.492212204853966), ('cotton', 0.49

15999
['soybean', 'oilseed', 'corn', 'grain'] [('grain', 0.7962214043875283), ('corn', 0.7977124523014436), ('wheat', 0.7979037666817131), ('oilseed', 0.7986565477667817), ('soybean', 0.7988441719714677), ('barley', 0.7992928149619876), ('veg-oil', 0.799295824497254), ('meal-feed', 0.7993232957367801), ('sugar', 0.7993346034453711), ('cotton', 0.7993862768378618)]
16009
['silver', 'gold', 'livestock', 'cotton', 'grain', 'money-fx'] [('money-fx', 0.28577599264236137), ('dlr', 0.28693513969223267), ('livestock', 0.2869720472599681), ('grain', 0.28711493311810043), ('gold', 0.28715452448481493), ('trade', 0.28724909440659885), ('interest', 0.2872836217073672), ('wheat', 0.28738098422633673), ('hog', 0.287423426528579), ('silver', 0.28744314487276174)]
16014
['ship', 'grain'] [('cocoa', 1.0), ('sorghum', 1.0), ('oat', 1.0), ('barley', 1.0), ('corn', 1.0), ('wheat', 1.0), ('grain', 1.0), ('sunseed', 1.0), ('oilseed', 1.0), ('soybean', 1.0)]
16040
['crude', 'ship'] [('cocoa', 1.0), ('sorghum

16196
['gnp'] [('money-fx', 0.5723231711696017), ('iron-steel', 0.5728400285296029), ('gnp', 0.5728682746153115), ('interest', 0.5729832581526125), ('trade', 0.5731751614100635), ('reserves', 0.5740254937236515), ('bop', 0.5740408000355333), ('cpi', 0.5740524578383028), ('money-supply', 0.5740805116710376), ('dlr', 0.5740914095894397)]
16197
['gnp'] [('earn', 0.6859743032354424), ('gnp', 0.6863985613465083), ('cpi', 0.6880084583891326), ('ipi', 0.6882519514159886), ('bop', 0.688327612467999), ('jobs', 0.6884914297260414), ('money-supply', 0.6886155383282844), ('reserves', 0.6886463503792629), ('wpi', 0.6887572010109813), ('interest', 0.6887994176541798)]
16200
['dlr'] [('money-fx', 0.12847974656257194), ('trade', 0.131171946583248), ('dlr', 0.13160910119000024), ('interest', 0.1319289836801814), ('gnp', 0.13230698826708154), ('yen', 0.1323552306442817), ('bop', 0.13251715608805828), ('cpi', 0.1325391777778918), ('money-supply', 0.13256364213835053), ('reserves', 0.1326150088657084)]
16

16766
['trade', 'bop'] [('trade', 0.6593501966658867), ('money-fx', 0.6599103801443099), ('interest', 0.6603974507840578), ('bop', 0.6605366564122864), ('dlr', 0.6607250364804793), ('money-supply', 0.6609746115910244), ('reserves', 0.6610125858204895), ('gnp', 0.6610250699804159), ('grain', 0.6610267042597627), ('wheat', 0.6610663938837693)]
16770
['money-fx'] [('trade', 0.6898499576901372), ('money-fx', 0.6915421057502046), ('interest', 0.692173120393185), ('dlr', 0.6936219063858315), ('yen', 0.6938891741270488), ('gnp', 0.6939241606168427), ('money-supply', 0.6939320387561105), ('livestock', 0.6940659906430735), ('coffee', 0.6940966037018712), ('reserves', 0.6941076946738879)]
16772
['wpi'] [('cpi', 0.7792987239563021), ('interest', 0.779661895243594), ('wpi', 0.7796760625449104), ('veg-oil', 0.7797447663085991), ('gnp', 0.7797638920836114), ('grain', 0.7798065175344929), ('cotton', 0.779843065759731), ('money-supply', 0.7798441778118167), ('dlr', 0.779844281543782), ('wheat', 0.7798

17580
['acq', 'pet-chem'] [('acq', 0.7951034151957533), ('interest', 0.8034713446171575), ('pet-chem', 0.8035606133823299), ('cotton', 0.8036190764975091), ('gas', 0.8036266854606188), ('money-supply', 0.8036278176962621), ('nickel', 0.8036465170099152), ('wpi', 0.8036492999018898), ('propane', 0.8036498817548292), ('rapeseed', 0.8036518499438688)]
17584
['crude'] [('cocoa', 1.0), ('sorghum', 1.0), ('oat', 1.0), ('barley', 1.0), ('corn', 1.0), ('wheat', 1.0), ('grain', 1.0), ('sunseed', 1.0), ('oilseed', 1.0), ('soybean', 1.0)]
17585
['pet-chem', 'acq'] [('acq', 0.5876239854176919), ('earn', 0.5890849443807926), ('corn', 0.5909158009689962), ('carcass', 0.5909541622227421), ('orange', 0.5910656328474967), ('rice', 0.5910846159375363), ('livestock', 0.5910905782545443), ('retail', 0.5911137086203264), ('wpi', 0.5911315540501447), ('rye', 0.5911506633243105)]
17593
['interest'] [('trade', 0.5296498669290246), ('money-fx', 0.5296988308338416), ('interest', 0.5299016521118193), ('wheat', 0

17916
['bop'] [('trade', 0.8955971248383899), ('bop', 0.8957146121846508), ('money-fx', 0.8959693447975376), ('earn', 0.895990363881277), ('gnp', 0.8964439447462429), ('dlr', 0.8965149073466111), ('dmk', 0.8965464782681688), ('nkr', 0.8965484701763039), ('rubber', 0.8965515634646632), ('money-supply', 0.8965605275175643)]
17926
['bop', 'trade'] [('trade', 0.7534100359313862), ('money-fx', 0.7539020934480076), ('bop', 0.7539929411993259), ('crude', 0.7542771339179891), ('interest', 0.7548742346398333), ('money-supply', 0.7550340870497053), ('reserves', 0.7550380889515671), ('dlr', 0.7551340221844394), ('gnp', 0.7553330283147117), ('pet-chem', 0.7553786690068026)]
17927
['propane'] [('crude', 0.6878023309716046), ('nat-gas', 0.6882652677562847), ('propane', 0.6892949338465784), ('interest', 0.6894045940008192), ('gas', 0.6897459517635518), ('heat', 0.6897476778485039), ('wheat', 0.6898208041833199), ('grain', 0.6898404117327512), ('corn', 0.689854423766452), ('ship', 0.6898689896638089)]

18185
['money-fx', 'interest'] [('money-fx', 0.5859674540416417), ('trade', 0.5866822043170221), ('interest', 0.5867594008459721), ('dlr', 0.5874282095429466), ('money-supply', 0.5875014763596917), ('bop', 0.5876860749205961), ('gnp', 0.5877194690268531), ('yen', 0.587791638042003), ('cpi', 0.5878382552946649), ('reserves', 0.5878617532851327)]
18191
['pet-chem'] [('acq', 0.8213999266111031), ('alum', 0.8218323786499067), ('gold', 0.8218498125617727), ('barley', 0.8218731853283002), ('pet-chem', 0.8218745759324592), ('iron-steel', 0.8218748535256679), ('rubber', 0.8218812450709829), ('strategic-metal', 0.8218977896166798), ('zinc', 0.8219220098545031), ('meal-feed', 0.8219225719615435)]
18213
['crude', 'ship'] [('earn', 0.5259279254867097), ('crude', 0.5268150936684043), ('ship', 0.5287753436213124), ('coffee', 0.5303296860110448), ('interest', 0.5304091910399128), ('gnp', 0.530435133952606), ('money-supply', 0.5304719674859396), ('ipi', 0.5304869007237679), ('cpi', 0.5304888689562587)

18509
['wpi'] [('cpi', 0.8610178679364777), ('money-supply', 0.8616157554316294), ('wpi', 0.861624050256036), ('interest', 0.8616493727936732), ('gnp', 0.861662857597272), ('ipi', 0.8616857102817513), ('coffee', 0.8618258772590041), ('jobs', 0.861833432973955), ('reserves', 0.8618403589719646), ('bop', 0.8618537693047451)]
18524
['reserves'] [('earn', 0.9199734218824475), ('reserves', 0.9239142932077543), ('money-supply', 0.9242096865326584), ('bop', 0.9242427063173575), ('interest', 0.9242608800898054), ('money-fx', 0.9242804737698528), ('trade', 0.924341116965639), ('corn', 0.9243603680782205), ('barley', 0.9243731777964728), ('jobs', 0.9243841504875396)]
18527
['reserves'] [('money-fx', 0.6234404243762288), ('reserves', 0.6239202587780411), ('interest', 0.6248008731730731), ('money-supply', 0.6249660934231701), ('gnp', 0.6250529078714528), ('bop', 0.6252169851392146), ('gold', 0.625299520183994), ('trade', 0.6253195222621762), ('jobs', 0.625334348707704), ('cpi', 0.6253656480217387)

18857
['nat-gas', 'crude', 'acq'] [('crude', 0.8186814863150398), ('nat-gas', 0.8191734219090925), ('gold', 0.8211514087959096), ('acq', 0.8212019594092898), ('rubber', 0.821333768407622), ('fuel', 0.8213607431275528), ('reserves', 0.8213700936223892), ('palm-oil', 0.8213779638867449), ('heat', 0.8213813457934939), ('gas', 0.8213837404510176)]
18858
['fuel'] [('trade', 0.5943671519027658), ('gnp', 0.5944642586885287), ('money-fx', 0.5944974956758369), ('interest', 0.5945137845169819), ('bop', 0.5945356943392659), ('sugar', 0.5946671690789945), ('cpi', 0.5946880743728575), ('gas', 0.5948103176775277), ('money-supply', 0.5948762269096388), ('fuel', 0.5948807114303297)]
18873
['cotton'] [('grain', 0.47945763154788346), ('wheat', 0.4801328937025545), ('cotton', 0.4802346659928645), ('oilseed', 0.4804226312772017), ('corn', 0.4805888416239329), ('ship', 0.48068690426819527), ('livestock', 0.4807073482517872), ('rapeseed', 0.48089365812676654), ('rice', 0.4809274773761632), ('soybean', 0.480

19061
['gnp', 'bop', 'yen', 'dlr', 'money-fx'] [('gnp', 0.582892193945883), ('money-fx', 0.5829585814168892), ('trade', 0.5836429131758007), ('dlr', 0.5843040570183828), ('yen', 0.5851669883603137), ('bop', 0.5851766112054929), ('interest', 0.585223204575752), ('cpi', 0.585426332306152), ('money-supply', 0.5854399890731113), ('reserves', 0.5854861266653544)]
19075
['bop', 'gnp'] [('money-fx', 0.5635342398911497), ('gnp', 0.5638930355085396), ('interest', 0.5639176006092255), ('trade', 0.5640035535834413), ('bop', 0.5641342198663708), ('money-supply', 0.5641601046612923), ('reserves', 0.5642310316472193), ('gold', 0.5644042673338713), ('cpi', 0.5648768862031585), ('dlr', 0.5649189752842885)]
19082
['wpi'] [('cpi', 0.7742227916274177), ('wpi', 0.7744864219065466), ('coffee', 0.7746590185342553), ('heat', 0.7746826641516401), ('nat-gas', 0.7747256396311432), ('ipi', 0.7747268794177823), ('money-supply', 0.7747291883324187), ('gnp', 0.7747365356664242), ('crude', 0.7747375331999544), ('int

19541
['nkr', 'money-fx'] [('money-fx', 0.5749259394443946), ('interest', 0.5780133596831651), ('dlr', 0.5782431132344625), ('lei', 0.5793205830889091), ('trade', 0.5793221992702392), ('dmk', 0.5793286514028053), ('yen', 0.579411924336006), ('cpi', 0.5794275476516668), ('reserves', 0.5794572220430688), ('bop', 0.5794742761126644)]
19549
['rapeseed', 'oilseed', 'wheat', 'grain'] [('cpi', 0.7524727293601587), ('livestock', 0.752544487429207), ('grain', 0.7525737879710271), ('carcass', 0.752727420249558), ('wheat', 0.7528195021763904), ('soybean', 0.7530339306888242), ('veg-oil', 0.753048516808929), ('money-supply', 0.7530558432673559), ('interest', 0.7530570898797638), ('oilseed', 0.7530797933049554)]
19551
['veg-oil'] [('trade', 0.49643128869364617), ('veg-oil', 0.49656811191541445), ('grain', 0.49682468617508674), ('money-fx', 0.49717112340195685), ('oilseed', 0.49753836306745985), ('wheat', 0.4976980256732259), ('carcass', 0.4977103762862947), ('corn', 0.4977239468459279), ('coffee', 

20127
['reserves'] [('earn', 0.9286781272507297), ('money-fx', 0.9299855873123556), ('reserves', 0.9302964148123634), ('money-supply', 0.9303892214823107), ('interest', 0.9305339386272296), ('dlr', 0.9306465928932594), ('bop', 0.9307087108462354), ('cpi', 0.9307494270145587), ('dmk', 0.9307532724508493), ('gnp', 0.9307614024702339)]
20208
['rapeseed', 'sunseed', 'soybean', 'oilseed', 'soy-meal', 'meal-feed'] [('grain', 0.583314260716901), ('oilseed', 0.5839005753901353), ('meal-feed', 0.5839596715918774), ('corn', 0.583986055303374), ('wheat', 0.5841272078827076), ('soy-meal', 0.5842021700741212), ('rapeseed', 0.584337406974343), ('soybean', 0.5843912365267047), ('veg-oil', 0.5843985391181961), ('sugar', 0.5844688364245152)]
20214
['earn'] [('cocoa', 1.0), ('sorghum', 1.0), ('oat', 1.0), ('barley', 1.0), ('corn', 1.0), ('wheat', 1.0), ('grain', 1.0), ('sunseed', 1.0), ('oilseed', 1.0), ('soybean', 1.0)]
20232
['veg-oil'] [('grain', 0.44857258979295733), ('trade', 0.44907826842826815), 

20828
['ship', 'crude'] [('trade', 0.5911964469245814), ('ship', 0.5920258352640295), ('crude', 0.592302385094228), ('grain', 0.5936538079989854), ('wheat', 0.59401427870847), ('veg-oil', 0.5940422645891897), ('money-fx', 0.594042490676436), ('corn', 0.5940870663363824), ('carcass', 0.5941532159265874), ('oilseed', 0.5941962088128601)]
20868
['dlr', 'money-fx'] [('money-fx', 0.13255027561844768), ('interest', 0.1343538810117604), ('dlr', 0.13551271993736647), ('trade', 0.1356104741404608), ('money-supply', 0.13599933986274318), ('gnp', 0.13603790336157898), ('reserves', 0.13622445742656913), ('cpi', 0.1362316123587689), ('yen', 0.13623660865513879), ('bop', 0.1363286042449677)]
20911
['coconut-oil', 'palmkernel', 'palm-oil', 'veg-oil', 'soybean', 'oilseed'] [('veg-oil', 0.4723940840669256), ('trade', 0.47349509876910234), ('palm-oil', 0.47421328383899963), ('grain', 0.47435201026102236), ('oilseed', 0.47435895488200847), ('coconut-oil', 0.47460886486623066), ('soybean', 0.4746158880217

21573
['yen', 'dlr', 'money-fx'] [('money-fx', 0.3004710758503651), ('dlr', 0.30392112373766805), ('interest', 0.3052947456970926), ('trade', 0.30540238270090514), ('yen', 0.3059512977069355), ('money-supply', 0.3063124798076294), ('cpi', 0.3064047715353936), ('gnp', 0.3064636353093024), ('bop', 0.30647219810288234), ('reserves', 0.3064877390371462)]
21575
['ipi'] [('grain', 0.39928540569155485), ('trade', 0.3998046390292554), ('ipi', 0.39980584999565627), ('gnp', 0.40037164667102904), ('cpi', 0.40055358563947996), ('wheat', 0.400562426271217), ('money-supply', 0.40058103245760124), ('reserves', 0.4005845992764962), ('money-fx', 0.4006051479876026), ('jobs', 0.4006237432161021)]
2648.141955266955 1542 650 3019


In [137]:
    precision, recall = compute_precision_recall(computed_label_set, number_labels_test, prior_probs.keys())
    print(precision, recall)

0.8456608811748999 0.6832793959007551
