In [1]:
import numpy as np
from collections import defaultdict, Counter
import os.path
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import datetime
from operator import itemgetter

In [298]:
def weight_normalized_cnb(complement_probs, idf, vectorized_text, 
                          prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param idf: dictionary where keys = words and values = (total # docs)/(# of docs in 
                which we see that word)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        conditional = 0.0
        for word in freq.keys():
            conditional += (freq[word] * complement_probs[label][word])
        labels.append((label, conditional))
    return sorted(labels, key=itemgetter(1))

In [79]:
def complement_naive_bayes(complement_probs, idf, vectorized_text, prior_probs_normalized, prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    doc_denom = 0
    freq = Counter(vectorized_text)
    '''
    for word in freq.keys():
        for label in prior_probs.keys():
            doc_denom += (np.log(prior_probs[label]) + (freq[word]/len(vectorized_text) * complement_probs[label][word]))
    print(doc_denom)
    '''
    normalization_term = np.sqrt(sum([freq[word]**2 for word in freq.keys()]))
    for label in prior_probs.keys():
        prob = prior_probs_normalized[label]
        conditional = 0.0
        for word in freq.keys():
            conditional += (freq[word] * complement_probs[label][word])
        prob -= conditional
        labels.append((label, conditional))
    return sorted(labels, key=itemgetter(1))

In [322]:
def multinomial_naive_bayes(conditional_probs, vectorized_text, prior_probs):
    '''
    :param conditional_probs: dictionary where keys = labels and values = dictionary where
                    keys = words and values = P(x|Y)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        conditional = 0.0
        for word in vectorized_text:
            if conditional_probs[label][word] != 0.0:
                conditional += (freq[word] * conditional_probs[label][word])
        labels.append((label, conditional))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [284]:
def bayes_accuracy_model(num, number_labels, labels):
    '''
    :param num: the number of the document being checked, so we can check
                the correct labels for it
    :param number_labels: dictionary where keys = number of sample and
                            values = the set of labels associated with
                            that sample
    :param labels: the set of labels computed by Naive Bayes
    rye 0.00012871669455528383
    groundnut-oil 0.00012871669455528383
    cotton-oil 0.00012871669455528383
    castor-oil 0.00012871669455528383
    nkr 0.00012871669455528383
    sun-meal 0.00012871669455528383
    '''
    sample_labels = number_labels[num]
    successes = 0
    earned = 0
    bottom_5_times = 0
    bottom_5 = ['rye', 'groundnut-oil', 'cotton-oil', 'castor-oil', 'nkr', 'sun-meal']
    computed_labels = [x for x,y in labels]
    if "earn" in computed_labels[:3]:
        earned += 1
    computed_labels_trim = computed_labels[:len(sample_labels)]
    for label in bottom_5:
        if label in computed_labels[:5]:
            bottom_5_times += 1
            break
    if all(x in computed_labels_trim for x in sample_labels):
        successes += 1
    else:
        print(num)
        print(sample_labels, labels[:10])
        diff = set(sample_labels).difference(set(computed_labels_trim))
        score = len(computed_labels_trim) - len(diff)
        score /= len(computed_labels_trim)
        if len(diff) < len(computed_labels_trim):
            successes += score
    return [successes,earned, bottom_5_times]

In [275]:
def vectorize_text(valid_words, filepath):
    '''
    This function removes non valid words from the text to put it into
    the Naive Bayes classifier
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :param valid_words: dictionary where keys = valid words in the corpus
    :param filepath: path to the text file
    :return: a vector of text stripped of stop words and non-valid words
    '''
    with open(filepath, "r") as f:
        content = f.read()
        words = nltk.word_tokenize(content)
        words = [word.lower() for word in words]
        new_words = [word.lower() for word in words if word in valid_words]
    return new_words

In [8]:
def compute_total_word_frequencies(dir_path, valid_words):
    '''
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param st: Lancaster Stemmer object 
    :return: a dictionary where keys = words and values = # of documents in which
            that word appears 
    '''
    frequencies = {word: 0 for word in valid_words}
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            num = int(file[0:len(file) - 4]) 
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words if word not in stop_words]
            new_words = [word.lower() for word in new_words if word in valid_words.keys()]
            new_words = set(new_words)
            for word in new_words:
                frequencies[word] += 1
    return frequencies

In [10]:
def get_valid_words(dir_path, stop_words):
    '''
    Utility function that determines the set of valid words 
    to be used for classification and probability calculation
    :param dir_path: a path to the directory containing 
                    all the training samples
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :return: a Python dictionary where the keys = valid words and the 
            values = True, so we can use "key in dict" for future access
            in guaranteed constant time
    '''
    valid_words = defaultdict(bool)
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word.isalpha()]
            new_words = [word.lower() for word in new_words if word not in stop_words]
            new_words = set(new_words)
            for word in new_words:
                valid_words[word] = True
    return valid_words
            

In [11]:
def add_labels_to_samples(filename):
    '''
    This function iterates over the file containing all 
    labels for each numbered sample, and maps them together with
    a dictionary
    :param filename: path to the file with all the labels in it (assumes
                    the file is located in this directory)
    :return: a dictionary with keys = number of the training sample and
            values = the set of labels associated with it
            AND
            the same, but with the test samples. Keep them separate for easy
            access later
    '''
    number_labels_training = defaultdict(list)
    number_labels_test = defaultdict(list)
    with open(filename, "r") as f:
        for line in f:
            terms = line.split()
            if line[0:4] == "test":
                num = int(terms[0][5:len(terms[0])])  # Test number, so we can map this back to the proper label(s) later on
                number_labels_test[num] = terms[1:]
            else:
                num = int(terms[0][9:len(terms[0])])  
                number_labels_training[num] = terms[1:]
    return [number_labels_training, number_labels_test]

In [12]:
def compute_prior_probabilities(number_labels):
    '''
    This function will compute the prior probabilities
    P(y) = probability of seeing a label with a sample. 
    Note: since many samples have multiple labels, these prior
    probabilites will sum to > 1
    :param number_labels: dictionary where keys = number of training sample
                            and value = the list of labels associated with it
    :return: a dictionary where keys = the label and value = probability of seeing
            that label in the document list
    '''
    prior_probs = defaultdict(float)
    i = 0
    for num, labels in number_labels.items():
        for l in labels:
            if not prior_probs[l]:
                prior_probs[l] = 1
            else:
                prior_probs[l] += 1
        i += 1
    for label, freq in prior_probs.items():
        prior_probs[label] /= i
    return prior_probs

In [82]:
def rename_files(dir_path):
    '''
    Utility function designed to rename all files in any directory
    to a .txt file so they can be read from
    :param dir_path: directory of the files to be renamed
    '''
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        os.rename(filepath, filepath+".txt")

In [83]:
def compute_frequencies_by_class(mega_docs, valid_words, label_list):
    '''
    This function computes the frequencies of all words by class. This is done because
    the outright frequencies are needed for Naive Bayes and conditional_probs can easily
    be obtained from this by dividing each entry by the number of elements in each "mega doc"
    :param mega_docs: a dictionary where keys = labels and values = vectors of all the
                        valid words present in documents with that label
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param label_list: list of all unique labels in the dataset
    :return: a dictionary where keys = labels and values = dictionary where keys = words
                and values = frequencies of that word in docs with that label
            AND
            a dictionary where keys = words and values = the total frequency of those words
            all documents throughout the corpus
    '''
    frequencies = {label: {word: 0 for word in valid_words} for label in label_list}
    total_frequencies = {word:0 for word in valid_words}
    for label, vector in mega_docs.items():
        freq = Counter(vector)
        for word in freq.keys():
            frequencies[label][word] += freq[word]
            total_frequencies[word] += freq[word]
    return [frequencies, total_frequencies]

In [312]:
def compute_word_frequencies(dir_path, valid_words, number_labels, label_list):
    '''
    This function will iterate over the documents and compute the frequencies of 
    the words by label and in total
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param number_labels: dictionary where keys = document # and values = the set of 
                            labels associated with those labels
    :param label_list: list of all the unique labels
    :return: a dictionary where keys = labels and values = dictionary where keys 
            = words and values = the frequencies of that word in documents with that label
            AND 
            a dictionary where keys = words and values = the total # of occurrences of
            that word
            AND
            a dictionary where keys = words and values = the idf score for that word
            AND 
            a dictionary where keys = labels and values = the total # of words associated with that label
            AND
            the total # of valid words in the entire corpus
    '''
    words_by_doc_num = defaultdict()
    idf = {word: 0.0 for word in valid_words}
    total_num_words = 0
    total_word_count_by_label = {label: 0 for label in label_list}    
    i = 0
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            num = int(file[0:len(file) - 4])
            labels = number_labels[num]
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word in valid_words]
            total_num_words += len(new_words)
            freq = Counter(new_words)
            words_by_doc_num[num] = freq
            for word in freq.keys():
                idf[word] += 1
            for l in labels:
                total_word_count_by_label[l] += len(new_words)
            i += 1
    for word in idf.keys():
        idf[word] = 1 + np.log(i/(idf[word]+1))      
    frequencies = {label: {word: 0.0 for word in valid_words} for label in label_list} 
    total_frequencies = {word: 0 for word in valid_words}
    for num in words_by_doc_num.keys():
        freq = words_by_doc_num[num]
        labels = number_labels[num]
        normalization_term = np.sqrt(sum([score**2 for word, score in freq.items()]))
        for l in labels:
            total_word_count_by_label[l] += len(new_words)
            for word in freq.keys():
                term_to_add = freq[word] * idf[word]
                frequencies[l][word] += (term_to_add/normalization_term)
                total_frequencies[word] += (term_to_add/normalization_term)
    return [frequencies, total_frequencies, idf, total_word_count_by_label, total_num_words]

In [313]:
if __name__ == '__main__':
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\training"
    stop_words = set(stopwords.words('english'))
    valid_words = get_valid_words(dir_path, stop_words)
    number_labels_training, number_labels_test = add_labels_to_samples("cats.txt")
    prior_probs = compute_prior_probabilities(number_labels_training)
    
    parameters = compute_word_frequencies(dir_path, valid_words, number_labels_training, prior_probs.keys())
    
    frequencies = parameters[0]
    total_frequencies = parameters[1]
    idf = parameters[2] 
    total_word_count_by_label = parameters[3]
    total_num_words = parameters[4]

In [314]:
    for label, vector in frequencies.items():
        if label != "earn":
            continue
        print("Label:", label)
        for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):
            if score == 0.0:
                continue
            print(word, score)

Label: earn
vs 2037.6218731693598
cts 1430.526394079223
mln 1047.3317024944583
loss 962.2201550638756
net 943.4418504559922
shr 663.6889908910532
dlrs 609.6416337025148
profit 563.4392492665461
lt 516.7765508447939
revs 458.1045001528319
qtr 446.4594937985735
year 406.22387841452996
record 404.9552973796853
dividend 390.8395489141975
div 389.78991772788453
april 377.35158581904386
qtly 356.53377491998936
prior 332.5941087989328
inc 330.7037136432611
pay 309.68261673485665
oper 307.8842051114876
said 290.6300507541069
share 283.9030703337707
note 276.31929787885434
corp 245.88299048178584
march 241.50722426714609
company 238.27249305973777
sales 229.48955397943558
sets 222.1820417431966
billion 215.32308221418893
split 213.68432094165172
stock 211.2996704426567
quarter 210.2625680725256
avg 207.88206945700472
shrs 204.5119132211401
quarterly 193.97657300565496
earnings 185.06668798865002
payout 184.71297397158258
may 175.7760158161672
co 158.7100352198403
payable 156.21357215242935
nine

jamesway 5.194126732049438
reflecting 5.191428124601934
refining 5.188984781440883
robins 5.182150015390126
investigation 5.180573400039956
gelco 5.178675446218467
santos 5.176502939626005
f 5.164703863060939
urs 5.159652493843801
publishing 5.152554731615607
hovnanian 5.147112343889869
television 5.141794709506208
crk 5.134656367886195
ability 5.122653419704595
discount 5.120633094984558
works 5.116958089701182
recommend 5.114769823680874
announcement 5.109807399625698
paying 5.106972291197753
connecticut 5.102411339889642
surplus 5.100297348896633
core 5.10025957212652
film 5.097543329739635
independent 5.094565499856753
jack 5.086475074678541
usa 5.080194128819507
lloyds 5.077004037762447
dropped 5.060531409605966
shield 5.060409171309356
weis 5.059970976469926
moratorium 5.057421462146039
greater 5.056389749993241
double 5.0408409206049125
roy 5.033558194814771
morris 5.033436525473196
suspension 5.03026513217626
ttx 5.021919363746725
tultex 5.021919363746725
charged 5.016778382729

marine 3.1718907538280536
electro 3.170067105321311
furnaces 3.167448599917537
vishay 3.167448599917537
whether 3.165698993639889
fairfax 3.1655482418340344
host 3.162723853999611
anacomp 3.162612936213014
cheung 3.1614696638473325
wang 3.1595135265853784
approvals 3.159382564147496
singapore 3.158895184478583
auditor 3.157293482445385
supreme 3.1567098423634783
stvtf 3.1554446688735873
rogers 3.1554208392916148
gnt 3.153518734174031
treatment 3.1534404547385115
ech 3.1515619411247053
representing 3.147544436597415
talbot 3.147078944261313
burger 3.1454671818063717
flbk 3.138224490172501
holiday 3.13626871874548
lord 3.1357498464950604
del 3.13333749961189
duvel 3.132057001954895
scat 3.132057001954895
ekr 3.1313182377568385
graff 3.127097288626348
alberta 3.1266579361243076
originally 3.1259741035302433
assurance 3.1247211696043338
operate 3.1228975714164897
csp 3.1223617430386987
wg 3.1169126726598675
pharmacia 3.11539126503311
entitled 3.1151052196269
incorporation 3.114536741258256

commencement 2.1795774764571467
noncash 2.1790822395888148
interested 2.1789518712670763
george 2.1787200008037955
garden 2.175686412003764
lasmo 2.172790325481187
left 2.17038558167788
projections 2.169857786985018
talc 2.1687138326614015
limit 2.1673614491150444
scale 2.1669307008270886
cag 2.166851787917751
fact 2.1665745945809096
asia 2.165085458564368
aims 2.164602265799575
scrip 2.163681937382224
obligation 2.162648649630208
toward 2.1612935490090392
donald 2.15882131278609
dillard 2.1556766325659784
managers 2.154010837719514
situation 2.1534838989399985
groupe 2.1534679868751585
fiama 2.1533378665104292
federally 2.153132862623658
cvg 2.1503811805931865
subsequent 2.1499732200353625
ever 2.1491819981419553
tractor 2.1485910447915257
stays 2.148454959465398
dhr 2.1483969881059406
instrument 2.1458675646626997
vanadium 2.1443728392033727
mrs 2.143272086255296
daniel 2.140943915581226
slough 2.1396022596807223
frost 2.138103341596224
undergarment 2.137940810163708
ho 2.13632288091

cleanup 1.8108743109514225
gmhc 1.8083938649639226
monkey 1.8083938649639226
dst 1.8083938649639226
sfp 1.8083938649639226
cruises 1.8066992995838662
charan 1.8061450588501144
gats 1.805911164934675
stahl 1.8055190020056842
models 1.8052319421757999
planters 1.8043742580412256
bowling 1.8040038072865583
approves 1.7991585081696955
ahlsell 1.7981525343804075
gosper 1.797625459183345
lner 1.7974899701281315
story 1.7966438923685375
upper 1.7949343098865622
widdrington 1.7941110365468447
sb 1.791534664087792
nch 1.7913142912197646
herley 1.7913142912197646
margin 1.7910457135599556
alternative 1.7908598460485095
status 1.7898996743239652
seidman 1.7896425089692203
transmission 1.7894742885345498
crown 1.7880112825715155
mas 1.787302646509504
healthvest 1.787302646509504
gec 1.787302646509504
chemie 1.787302646509504
infg 1.786938684699762
wendy 1.7868248376655942
linde 1.7859918354366902
mile 1.7846199440558967
sure 1.7835616768847076
crain 1.7830018830672687
tca 1.7830018830672687
tcat 1

revaluation 1.2724470837134225
dls 1.2719404092525268
obsolete 1.2717113399081064
want 1.271628476882236
wmb 1.2703677761375975
westminster 1.270195005175432
fewer 1.270176241623152
focused 1.270026139710913
authorities 1.2699161081094708
inflation 1.2682908581171575
beverage 1.2678611001414302
approaches 1.2675223423313946
internationally 1.2667734879714396
severely 1.2662781091038573
samsung 1.265612063754616
angus 1.2646821045503038
vancouver 1.2634913195282926
underwritten 1.2633555746791558
eldr 1.2633079044708344
pratt 1.2619944841588886
leaving 1.2615352254381609
guaranty 1.2614698393230814
party 1.2612753902515434
track 1.2609419539531252
jmbrs 1.2607727223852494
jmb 1.2607727223852494
indqa 1.2607727223852494
cnvlz 1.2607727223852494
gdb 1.2607727223852494
mpp 1.2607727223852494
trti 1.2607727223852494
transtech 1.2607727223852494
radif 1.2607727223852494
kba 1.2607727223852494
mfm 1.2607727223852494
snat 1.2607727223852494
avatar 1.2607727223852494
avtr 1.2607727223852494
aut

tenney 1.0231204319925946
tny 1.0231204319925946
rebates 1.0230179559182728
pca 1.0229820519777113
experienced 1.022674869501508
coast 1.0206415272003182
shrinkage 1.0197401091763663
unknown 1.0184510086261818
transnational 1.017905186635151
eggs 1.017703768013203
trails 1.016938378827484
trls 1.016938378827484
tsr 1.016938378827484
tsri 1.016938378827484
foil 1.016938378827484
utl 1.016938378827484
unitil 1.016938378827484
operationswas 1.016938378827484
samson 1.016938378827484
ati 1.016938378827484
atim 1.016938378827484
gendis 1.016938378827484
biflyx 1.016938378827484
tajon 1.016938378827484
trc 1.016938378827484
covering 1.0143755378771238
reasons 1.0140355373921615
chances 1.013034744890073
assume 1.0125162808795287
sweet 1.0123054308129875
hope 1.0117023996845202
interfirst 1.011678397969698
layoff 1.0112521402731525
contran 1.0108670506619701
vhi 1.0108670506619701
savo 1.0108670506619701
savoy 1.0108670506619701
alatenn 1.0108670506619701
atng 1.0108670506619701
aagiy 1.01086

miglio 0.8157157142043013
cml 0.8157157142043013
cmli 0.8157157142043013
bigb 0.8157157142043013
btrl 0.8157157142043013
eqty 0.8157157142043013
extraordainry 0.8157157142043013
sjr 0.8157157142043013
lvt 0.8157157142043013
seis 0.8157157142043013
mercer 0.8155632393988899
mnft 0.8155632393988899
hang 0.8142685104063849
seng 0.8142685104063849
perhaps 0.8141076975559605
streamlining 0.8135804929532349
ratios 0.8132276085777956
westbank 0.8126658434308442
infusaid 0.8125722893549392
intraocular 0.8125722893549392
bsh 0.8125722893549392
sbm 0.8125722893549392
onc 0.8125722893549392
sigr 0.8125722893549392
hovis 0.8121292736652405
mcdougall 0.8121292736652405
princeton 0.8121292736652405
ireland 0.8119330263187116
internally 0.8118111314061397
thousand 0.8114813980327893
chamber 0.8101146717675608
molded 0.8099402935173743
toog 0.8094649269619502
jmed 0.8094649269619502
flti 0.8094649269619502
sp 0.8094649269619502
spelling 0.8094649269619502
tengelmann 0.8094649269619502
warenhandelsgese

misleading 0.6313994624591414
wardley 0.6311978818013863
concentrates 0.6311978818013863
jerome 0.6311968082054294
ing 0.6310932565768315
fleming 0.6305135371317035
nonproductive 0.6303863611926247
aztec 0.6303863611926247
aztc 0.6303863611926247
woolco 0.6303863611926247
usm 0.6303863611926247
improves 0.6300311681756192
goodman 0.6296019301151687
funeral 0.6294736331690585
clouded 0.6293253791300404
maximum 0.6291565570744405
pfina 0.6289321807986459
alrn 0.6289321807986459
purusant 0.6280180233792213
atfc 0.6280180233792213
ronson 0.6274880177428973
ronc 0.6274880177428973
pacc 0.6274880177428973
amsterdam 0.6266367040424939
entrants 0.626446010842673
indemnifying 0.626446010842673
resubmit 0.626446010842673
distract 0.626446010842673
cohesiveness 0.626446010842673
laid 0.6263586994877774
hrok 0.6260537575411625
concentration 0.6257434441753058
buyer 0.6255110935868755
imatron 0.6251483536910615
terrorism 0.6251483536910615
mpet 0.6246292875326441
questionable 0.6245772280686022
fra

tate 0.5396335399058807
tool 0.5385956181147704
strategies 0.5384591456086991
child 0.5381632041656681
pegged 0.5377176217219012
pose 0.5376541878295658
ferrominera 0.5375952951482966
leopoldo 0.5375952951482966
orinoco 0.5375952951482966
bln 0.5375952951482966
alcasa 0.5375952951482966
interalumina 0.5375952951482966
venalum 0.5375952951482966
mrcm 0.5375952951482966
deputy 0.5374198399344685
eqica 0.5357943008231582
pman 0.5357943008231582
saunders 0.5355538831664782
practical 0.535323371283003
jnj 0.5352089465095548
xon 0.535128113927855
magna 0.5346818656832255
eroded 0.5346818656832255
ammunition 0.5342349530790895
counterbalance 0.5342349530790895
pare 0.5342331902811108
engineered 0.5341324390596739
awarded 0.5336566025672431
structured 0.5335963161312712
capitalize 0.533363469557267
lid 0.533363469557267
exchanged 0.5332233191142385
sims 0.533005152095388
oklahoma 0.5327347396671391
exclusively 0.5326328874054763
swing 0.5324284981567201
sludge 0.5322459551599124
settlem 0.5322

voutilainen 0.41726333444841174
pertti 0.41726333444841174
adjoining 0.41725907500675274
downgraded 0.41725907500675274
stagnation 0.4167655691273743
freighter 0.4167655691273743
whereby 0.4167056878781996
emerge 0.41606890735760965
unions 0.4157226868955328
artificially 0.41540215342006837
reorganizations 0.4148740056577198
mobility 0.4148740056577198
emr 0.4148740056577198
leach 0.41483825410009656
cdni 0.4147471506944627
inject 0.41444614101843
zinc 0.41422643467193004
movement 0.41374264622228096
horticulture 0.41350600930606574
fisons 0.41350600930606574
bae 0.41346165894265735
hopefully 0.4133593719278639
drastically 0.4129544321137798
practises 0.41192713430427375
mnst 0.41179910508290535
oc 0.4117258647319677
orion 0.4117258647319677
restrictive 0.41168141489383636
sql 0.41146197301552356
tndm 0.41146197301552356
frederic 0.41146197301552356
bois 0.41083947863441933
trials 0.41083947863441933
lear 0.410738326596734
dailies 0.41039842700042106
westdeutsche 0.4103043665568052
lan

comments 0.26141916904645346
illegal 0.2612853763029278
spun 0.2610168189530965
harold 0.2604389908997332
criticism 0.26008679564111076
ctc 0.26002776840228964
finalize 0.26002776840228964
reputation 0.25980646572912997
reassure 0.259781839983727
modular 0.2596702258690225
sharper 0.2596702258690225
turbulence 0.2590793166642992
refined 0.25852991533181147
closely 0.2585028939237641
differences 0.2573778269724255
slowing 0.2573778269724255
names 0.2571964727356566
triggered 0.25717410000580626
employment 0.25708229490384626
eliminations 0.25639152567980883
specialties 0.25639152567980883
bacon 0.25624002804121077
cold 0.25614321933711875
wilhelm 0.2559384264077546
obviously 0.25584152834682256
arguello 0.25542642926069903
commercially 0.25542642926069903
feedstocks 0.25542642926069903
impulses 0.2551972433408812
squabbling 0.2551972433408812
amazed 0.2551972433408812
teamed 0.2551026942956452
pte 0.2551026942956452
diversifying 0.2551026942956452
wanamaker 0.25510048608874314
renfrew 0

In [315]:
    conditional_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    complement_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    for label, vector in conditional_probs.items():
        denom = total_num_words - total_word_count_by_label[label] + len(valid_words.keys())
        for word in vector.keys():
            mod_cond_freq = frequencies[label][word] + 1
            mod_comp_freq = (total_frequencies[word] - frequencies[label][word]) + 1
            conditional_probs[label][word] = np.log(mod_cond_freq/(total_word_count_by_label[label] + len(valid_words.keys())))
            complement_probs[label][word] = np.log(mod_comp_freq/denom)

In [316]:
    complement_probs_normalized = {label: {word: complement_probs[label][word] for word in valid_words} 
                                   for label in prior_probs.keys()}
    for label, vector in complement_probs.items():
        normalize_term = np.sqrt(sum([(complement_probs_normalized[label][word]**2) for word in valid_words]))
        for word in vector.keys():
            complement_probs_normalized[label][word] /= normalize_term

In [324]:
    # Removing the stemmer actually improves accuracy on test set, who knew
    successes, earned, bottom_5,i = 0, 0, 0, 0
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\test"
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        num = int(file[0:len(file) - 4])
        text = vectorize_text(valid_words, filepath)
        # computed_labels = complement_naive_bayes(complement_probs, idf, text, prior_probs_normalized, prior_probs)
        # computed_labels = multinomial_naive_bayes(conditional_probs, text, prior_probs)
        computed_labels = weight_normalized_cnb(complement_probs_normalized, idf, text, 
                                               prior_probs)
        suc, e, b5 = bayes_accuracy_model(num, number_labels_test, computed_labels)
        # Even with using conditional_probs, earn appears in 1773/3019 samples
        
        # CNB brought earn labels down to 1170/3019, which is the best improvement so far
             
        # Multinomial Naive Bayes: 84.09% (2538.627561327562) accuracy on test set (????), 1773 "Earn" labels
        # Complement Naive Bayes: 85.09% (2568.913203463204) accuracy on test set, 1687 "Earn" labels
        # Weight Normalized CNB w/ TF-IDF transformation: 76.35% (2304.986291486292), 2131 "Earn" labels
        # (I FORGOT HOW I GOT THIS AHHHHH)
        # CNB with IDF transformation: 86.76% (2619.324711399711), 1321 "Earn" labels
        # MNB with IDF transformation: 84.87% (2562.234704184704), 1521 "Earn" labels
        
        # MNB with doc length normalization, IDF: 86.10% accuracy (2599.288708513709), 1548 "Earn" labels
        # CNB with doc length normalization, IDF: 86.93% accuracy(2624.394769119769), 1312 "Earn" labels
        # WCNB with doc length normalization and IDF^2: 87.72% accuracy (2648.141955266955), 1542 "Earn" labels
        
        # Perhaps the reason that TF doesn't lead to improvements with this is because we already stripped out the 
        # stop words, which would be affected the most by this technique

        successes += suc
        earned += e
        bottom_5 += b5
        i += 1
    print(successes, earned, bottom_5, i)

14829
['nat-gas', 'crude'] [('crude', -0.4764045808653885), ('trade', -0.4763387868270032), ('nat-gas', -0.4754826457035697), ('money-fx', -0.47524161172304086), ('dlr', -0.47482474725245594), ('cpi', -0.47467685170012), ('yen', -0.47446709980029944), ('gnp', -0.4744633356581971), ('interest', -0.4743609194958147), ('gas', -0.4743323765935096)]
14832
['rubber', 'tin', 'sugar', 'corn', 'rice', 'grain', 'trade'] [('trade', -0.4293814550077891), ('bop', -0.42863732622235895), ('sugar', -0.42811930504389484), ('money-supply', -0.42800411139156647), ('rubber', -0.4279302483234143), ('gnp', -0.4278716330246437), ('tin', -0.42787140505466886), ('cpi', -0.4278531882410955), ('grain', -0.42764061332032494), ('interest', -0.4275850048569903)]
14840
['rubber', 'coffee', 'lumber', 'palm-oil', 'veg-oil'] [('coffee', -1.1114737494983045), ('rubber', -1.1105972637761765), ('money-fx', -1.108576444082847), ('veg-oil', -1.1081377617565598), ('trade', -1.1076170465769304), ('palm-oil', -1.10744138969576

15121
['earn'] [('acq', -0.5721713695132166), ('earn', -0.5640260058674195), ('reserves', -0.5592052021805795), ('bop', -0.5589371905939611), ('dlr', -0.5588741582295026), ('jobs', -0.5587605401439554), ('dmk', -0.5587263981751048), ('soy-meal', -0.5587197679820729), ('meal-feed', -0.5587093496512215), ('groundnut', -0.5587066157334882)]
15206
['rice'] [('trade', -0.7311342101653298), ('grain', -0.7260584609373567), ('wheat', -0.7247253752544542), ('money-fx', -0.7246016658809001), ('oilseed', -0.7238663490941922), ('livestock', -0.7237510673524709), ('corn', -0.7237177690765368), ('carcass', -0.723638373071155), ('interest', -0.7236139442391942), ('soybean', -0.7235475487101991)]
15212
['interest', 'dlr', 'money-fx'] [('interest', -0.4971278901096), ('money-fx', -0.49682530926487584), ('gnp', -0.49351874978508625), ('dlr', -0.4932604888771178), ('money-supply', -0.49320004720999333), ('cpi', -0.49280534583263674), ('trade', -0.4926220221448265), ('yen', -0.49241629547183613), ('bop', 

15556
['lead'] [('alum', -1.828172190238538), ('grain', -1.8267514338210575), ('money-fx', -1.8254302751285805), ('crude', -1.82525737080469), ('copper', -1.8242441552421862), ('lead', -1.824151111228506), ('sugar', -1.823984726360319), ('ship', -1.8238509991041538), ('corn', -1.8238268563683466), ('wheat', -1.823793488320777)]
15562
['dlr'] [('money-fx', -0.2456584277345813), ('dlr', -0.2418567044978989), ('interest', -0.24169942369602151), ('trade', -0.2412927739456801), ('reserves', -0.24126981336286818), ('yen', -0.24121893533079422), ('gnp', -0.24109417392610635), ('coffee', -0.24103103702447898), ('ipi', -0.24095417001702157), ('cocoa', -0.24091897739247373)]
15563
['lead'] [('copper', -0.1177170054638707), ('earn', -0.11718969515868186), ('interest', -0.11580126070658563), ('veg-oil', -0.11569785554017704), ('zinc', -0.11569425430596347), ('lead', -0.11569015968195076), ('silver', -0.11564291519302522), ('palm-oil', -0.11562386408466332), ('alum', -0.11560773898430404), ('corn',

15808
['earn'] [('acq', -0.2319484531825458), ('earn', -0.22384764506258778), ('housing', -0.21770552856644865), ('groundnut', -0.21768135677506512), ('platinum', -0.21768046469819105), ('sun-meal', -0.21768038068761122), ('castor-oil', -0.21768025562975132), ('palladium', -0.21768018175144718), ('lin-oil', -0.21768009401483154), ('nkr', -0.2176799770634906)]
15836
['wheat', 'grain'] [('grain', -0.292329990269377), ('trade', -0.2922857381800846), ('wheat', -0.29210979835326856), ('veg-oil', -0.2914058763779471), ('palm-oil', -0.2912686536962249), ('corn', -0.2912492111231221), ('money-fx', -0.29118210909874853), ('dlr', -0.2911277107917057), ('sugar', -0.29101312171317834), ('oilseed', -0.29097748662429457)]
15838
['strategic-metal'] [('tin', -0.6845172391980213), ('trade', -0.6837954067087699), ('grain', -0.6837540484833629), ('crude', -0.6829372950230872), ('wheat', -0.6826419494372428), ('money-fx', -0.68263829477209), ('corn', -0.6826343244029728), ('oilseed', -0.6824116473625446),

15949
['livestock'] [('grain', -0.2907894952540932), ('wheat', -0.28989686991506325), ('meal-feed', -0.2896640318467566), ('veg-oil', -0.2891838422850904), ('corn', -0.2891654541653147), ('oilseed', -0.289100551879914), ('livestock', -0.28909621921084455), ('lumber', -0.288619695040451), ('rice', -0.2886004325877868), ('potato', -0.2885869582507281)]
15953
['meal-feed', 'soy-meal', 'soybean', 'oilseed'] [('oilseed', -0.41330961846796566), ('soybean', -0.41299291989333425), ('grain', -0.41194144616877987), ('veg-oil', -0.4119314983620032), ('meal-feed', -0.4117149831352896), ('corn', -0.4116980763437656), ('soy-meal', -0.41157762059319664), ('wheat', -0.41138425881254803), ('livestock', -0.4109651009012685), ('money-supply', -0.41096282451785837)]
15975
['veg-oil', 'meal-feed', 'oilseed', 'grain'] [('grain', -0.8179789285673644), ('corn', -0.8157948372754505), ('wheat', -0.8150692420509087), ('oilseed', -0.8139324344148989), ('soybean', -0.8135117414546135), ('cotton', -0.81342929678428

16146
['meal-feed'] [('grain', -0.940437153773758), ('trade', -0.931371366332664), ('veg-oil', -0.9304037524304516), ('corn', -0.9303007299310505), ('money-fx', -0.9298272237809042), ('wheat', -0.9297367669893476), ('cocoa', -0.929480731655303), ('tin', -0.9294405168002131), ('soybean', -0.9293468349118476), ('oilseed', -0.9292725503027398)]
16147
['groundnut', 'oilseed', 'rice', 'corn', 'grain'] [('grain', -0.4856328822993789), ('rubber', -0.4833599273274322), ('wheat', -0.4830078766760845), ('corn', -0.4827818641398117), ('rice', -0.48230978988821527), ('oilseed', -0.48230335846163497), ('cotton', -0.4818757687624859), ('soybean', -0.4818407561576665), ('sugar', -0.48173170529944637), ('veg-oil', -0.4816195494472858)]
16158
['cpi', 'bop', 'jobs', 'gnp'] [('gnp', -1.8109622104292928), ('trade', -1.8036965842923394), ('ipi', -1.8034932920524318), ('jobs', -1.8017216082256164), ('money-fx', -1.8016478951522208), ('interest', -1.7994197214105199), ('cpi', -1.798992254645557), ('reserves'

16314
['fuel'] [('earn', -0.23383902371910512), ('fuel', -0.23114433057943726), ('crude', -0.2294714921429136), ('heat', -0.22905460530349997), ('interest', -0.2286069560135987), ('copper', -0.22830250564550486), ('cpi', -0.2282223413625601), ('wpi', -0.22809426879352995), ('gas', -0.2280836102503794), ('nat-gas', -0.22807165789151496)]
16488
['earn'] [('acq', -0.35417376424813524), ('earn', -0.34641386959488457), ('rubber', -0.34345865980268625), ('gold', -0.34341463449239523), ('reserves', -0.3433907719885857), ('jobs', -0.34337422659874844), ('interest', -0.3433513191697691), ('dmk', -0.3433511665661989), ('barley', -0.3433508720150475), ('propane', -0.34334845564061794)]
16499
['nat-gas'] [('crude', -0.17638262778243397), ('nat-gas', -0.17623857095821963), ('acq', -0.17573767905848467), ('barley', -0.17535422950908275), ('grain', -0.17533049671000045), ('wheat', -0.17524616859689182), ('fuel', -0.17523554856157877), ('money-supply', -0.17523400542181927), ('corn', -0.17523159604832

17036
['money-supply'] [('money-fx', -1.6399921234631842), ('interest', -1.6387350972441816), ('money-supply', -1.63360268435062), ('dlr', -1.6250820991842114), ('reserves', -1.624168749853086), ('gnp', -1.6236053808796418), ('cpi', -1.6229001808664123), ('bop', -1.6228990386855011), ('housing', -1.6228146970180124), ('yen', -1.6228008968823064)]
17045
['dlr'] [('money-fx', -0.08748859404088838), ('dlr', -0.08649660780044055), ('yen', -0.08589677478286713), ('trade', -0.08559935731319962), ('interest', -0.08552472295784738), ('wheat', -0.08548191696323168), ('grain', -0.08546495503304607), ('veg-oil', -0.08545846346761232), ('dmk', -0.08545286951234475), ('cotton', -0.08544552029428529)]
17046
['acq'] [('crude', -0.07869099911482152), ('acq', -0.0782554437678018), ('nat-gas', -0.07792873274363203), ('veg-oil', -0.07791393732609411), ('sugar', -0.0778989026519438), ('palm-oil', -0.0778981414270813), ('gas', -0.07788962291047506), ('money-supply', -0.07787542399163151), ('interest', -0.0

17670
['lumber', 'trade'] [('trade', -0.8249421434718521), ('grain', -0.8240205294880443), ('lumber', -0.8228790470406657), ('corn', -0.8222731754788474), ('wheat', -0.8219774738163741), ('oilseed', -0.8212473973916931), ('veg-oil', -0.8212262567747587), ('iron-steel', -0.8209102720658389), ('soybean', -0.8207453929658712), ('interest', -0.8207138248778723)]
17682
['acq'] [('earn', -0.3426016080013894), ('acq', -0.33100927184867834), ('housing', -0.3289146051465964), ('ipi', -0.32886280015215874), ('cpu', -0.3287605886806938), ('naphtha', -0.3287461011072022), ('sorghum', -0.3287376059691692), ('income', -0.32873489667070643), ('castor-oil', -0.3287333202373712), ('lei', -0.32873117315896944)]
17714
['alum', 'copper', 'silver', 'gold'] [('money-fx', -0.966266072135523), ('silver', -0.9642055312887835), ('gold', -0.9635618568161841), ('interest', -0.9635306772392737), ('copper', -0.963225405750143), ('cocoa', -0.9629180909389472), ('sugar', -0.9628692050642127), ('grain', -0.96280998789

17962
['rice', 'wheat', 'grain'] [('ship', -0.4849052880295193), ('grain', -0.4845784391364567), ('sugar', -0.484528960013305), ('interest', -0.4844329913669605), ('trade', -0.48415479006844697), ('wheat', -0.4839924209848814), ('livestock', -0.48388891500517933), ('carcass', -0.4838128790142824), ('oilseed', -0.48372855228251266), ('iron-steel', -0.4836220206823442)]
17966
['interest', 'reserves'] [('money-fx', -2.013442658640869), ('interest', -2.0096481619782356), ('dlr', -1.9957918610593344), ('money-supply', -1.9954331023394776), ('reserves', -1.9947998464359649), ('gnp', -1.994676799487437), ('bop', -1.9942631450289956), ('yen', -1.993900553627877), ('trade', -1.9938280211051689), ('cpi', -1.9933230172744985)]
17970
['gnp', 'cpi'] [('money-fx', -1.8921901135351678), ('interest', -1.8905157177770964), ('gnp', -1.890495712608021), ('trade', -1.8898342163156834), ('bop', -1.885959799393945), ('money-supply', -1.8857319581841914), ('cpi', -1.8848600470073535), ('jobs', -1.88473121498

18337
['zinc', 'lead'] [('lead', -0.4569354391534175), ('ship', -0.4562048154636278), ('alum', -0.4560652819141104), ('gold', -0.4557171288190576), ('zinc', -0.4554382259436278), ('iron-steel', -0.4553220434486355), ('copper', -0.4553159089120333), ('yen', -0.4551966227654709), ('silver', -0.45513996274690144), ('carcass', -0.4550425300680063)]
18345
['rapeseed', 'oilseed', 'wheat', 'grain'] [('grain', -0.3809893551330224), ('wheat', -0.3800717272008312), ('corn', -0.37887184596098866), ('oilseed', -0.3785695356927745), ('soybean', -0.3782603679198206), ('cotton', -0.3782533971096455), ('barley', -0.37821056350364807), ('rice', -0.37816981138467975), ('rapeseed', -0.37813261920158675), ('sorghum', -0.37806009575544874)]
18347
['trade', 'bop'] [('earn', -0.7349279052860344), ('bop', -0.7324024139588592), ('trade', -0.7301563956538806), ('money-supply', -0.727250451946822), ('jobs', -0.7268034198577327), ('housing', -0.7259462436052934), ('ipi', -0.7255873927003335), ('retail', -0.725547

18609
['sugar', 'rice', 'grain'] [('sugar', -0.8683813500402017), ('grain', -0.8654239139108845), ('trade', -0.8654109019675416), ('money-fx', -0.8646875177907795), ('wheat', -0.8637460731734595), ('rice', -0.8631278130456987), ('corn', -0.8630820851716846), ('veg-oil', -0.8630099655009213), ('interest', -0.862993313779391), ('palm-oil', -0.8626293893464523)]
18614
['corn', 'grain'] [('grain', -1.3670866098950523), ('wheat', -1.3625148914281806), ('trade', -1.3619194655201736), ('corn', -1.3609761748219107), ('oilseed', -1.3594516612323753), ('interest', -1.3591311193157882), ('soybean', -1.3589385047553568), ('money-fx', -1.358804157932065), ('cotton', -1.3581743063648026), ('carcass', -1.3577941909507942)]
18625
['nzdlr', 'money-fx'] [('money-fx', -1.0885044860570787), ('interest', -1.086431755588434), ('trade', -1.08536747695648), ('gnp', -1.08390851754061), ('bop', -1.0836611644443128), ('dlr', -1.083471585173051), ('money-supply', -1.082870067635279), ('yen', -1.0822840320314526),

18943
['zinc'] [('alum', -0.07989930144825291), ('zinc', -0.07850955400612183), ('copper', -0.07840229762148178), ('lead', -0.07806710829285146), ('silver', -0.07805001782058595), ('wheat', -0.07783450209817633), ('grain', -0.07778097211004029), ('soy-meal', -0.07778063191286273), ('soy-oil', -0.07777304121207648), ('soybean', -0.07776877090950149)]
18944
['strategic-metal'] [('earn', -0.22885237190826788), ('strategic-metal', -0.22749341841477977), ('acq', -0.2272588550064074), ('rubber', -0.22607638619632545), ('lead', -0.22601412149711175), ('silver', -0.22600038410670506), ('zinc', -0.22599814734880794), ('cocoa', -0.2259896356809778), ('copper', -0.22598254145412797), ('pet-chem', -0.22595979895698937)]
18962
['acq'] [('earn', -0.31771661466008577), ('acq', -0.3126812750418842), ('retail', -0.30897452572995326), ('dmk', -0.30876977090510016), ('cotton-oil', -0.30874060080311233), ('soy-oil', -0.30873916391451883), ('groundnut', -0.30873478131075904), ('sun-oil', -0.308734576367259

19223
['heat'] [('crude', -0.5625933307107227), ('acq', -0.5580784003029431), ('heat', -0.5577693541255752), ('nat-gas', -0.557639433360092), ('fuel', -0.5573207735538812), ('corn', -0.5569396442527789), ('gas', -0.5569254673819549), ('grain', -0.5567881723376414), ('veg-oil', -0.5566532123217282), ('propane', -0.5566506880238641)]
19255
['acq'] [('money-fx', -0.5787883792584022), ('grain', -0.5773485792060303), ('trade', -0.5771823746557682), ('acq', -0.5769862917588181), ('corn', -0.576971688024895), ('interest', -0.5769322971014085), ('wheat', -0.5767851198968078), ('dlr', -0.5766487064369373), ('carcass', -0.5765829085320724), ('oilseed', -0.5764809140195765)]
19271
['nat-gas', 'acq'] [('acq', -0.16132226329468657), ('crude', -0.15639292166976632), ('nat-gas', -0.15636765129866248), ('gold', -0.15569540114124503), ('lumber', -0.1556240585994323), ('fuel', -0.15548774725502512), ('jobs', -0.155464820844248), ('jet', -0.15546086357469602), ('propane', -0.15546050314103405), ('heat', 

19668
['sugar', 'rapeseed', 'oilseed', 'oat', 'barley', 'wheat', 'rye', 'grain'] [('grain', -0.3859538980433182), ('wheat', -0.3843598060022392), ('sugar', -0.38292768581945835), ('corn', -0.3827255126705198), ('oilseed', -0.3826894379246917), ('barley', -0.38229950746521835), ('rapeseed', -0.38228118464284416), ('cotton', -0.3821347420996106), ('soybean', -0.382089116963113), ('rice', -0.38203535364840135)]
19672
['castor-oil', 'cotton', 'coffee', 'tea', 'trade'] [('coffee', -0.7389882801286844), ('trade', -0.7387052877845999), ('grain', -0.7385885854229906), ('wpi', -0.7378520197615487), ('carcass', -0.7378282823595665), ('sugar', -0.7378139017584173), ('cocoa', -0.7378055709892927), ('wheat', -0.7377537607065469), ('meal-feed', -0.737740874936043), ('veg-oil', -0.7377016387344167)]
19692
['zinc'] [('silver', -0.15916972535287952), ('copper', -0.15892130233863577), ('gold', -0.15869810494606473), ('alum', -0.15845158635283432), ('zinc', -0.1583965363866232), ('iron-steel', -0.1582042

20208
['rapeseed', 'sunseed', 'soybean', 'oilseed', 'soy-meal', 'meal-feed'] [('grain', -0.5390291971810941), ('oilseed', -0.5380245582673125), ('meal-feed', -0.5379233540317109), ('corn', -0.5378781743441733), ('wheat', -0.5376364981566508), ('soy-meal', -0.5375081744271771), ('rapeseed', -0.5372767113197535), ('soybean', -0.5371845948944413), ('veg-oil', -0.5371720989064652), ('sugar', -0.5370518161303782)]
20214
['earn'] [('cocoa', 0.0), ('sorghum', 0.0), ('oat', 0.0), ('barley', 0.0), ('corn', 0.0), ('wheat', 0.0), ('grain', 0.0), ('sunseed', 0.0), ('oilseed', 0.0), ('soybean', 0.0)]
20232
['veg-oil'] [('grain', -0.8016847604280876), ('trade', -0.8005580892191841), ('wheat', -0.799818634962412), ('money-fx', -0.7989497777006435), ('veg-oil', -0.7981691090295332), ('corn', -0.7979706439255568), ('interest', -0.7979435122996296), ('coffee', -0.7976842131424214), ('oilseed', -0.7976531439350989), ('soybean', -0.7975405490680242)]
20248
['jobs', 'trade'] [('cocoa', 0.0), ('sorghum', 0.

20773
['earn'] [('acq', -0.5225746071260016), ('earn', -0.5137046852023659), ('zinc', -0.5075279939879481), ('retail', -0.5072750890614994), ('rye', -0.5071671460364823), ('oat', -0.5071623763833468), ('cotton-oil', -0.5071566156010748), ('barley', -0.5071560230228553), ('housing', -0.5071546791360443), ('groundnut', -0.507148724708408)]
20828
['ship', 'crude'] [('trade', -0.5256069193043286), ('ship', -0.5242050044016583), ('crude', -0.523737988872566), ('grain', -0.5214589443385879), ('wheat', -0.5208519216786582), ('veg-oil', -0.5208048096436582), ('money-fx', -0.520804429052551), ('corn', -0.5207293940368757), ('carcass', -0.5206180536097571), ('oilseed', -0.5205456962956603)]
20868
['dlr', 'money-fx'] [('money-fx', -2.0207932668946986), ('interest', -2.007278057015498), ('dlr', -1.9986897689917715), ('trade', -1.9979686635351708), ('money-supply', -1.9951052472082833), ('gnp', -1.9948217308879659), ('reserves', -1.9934513314420366), ('cpi', -1.9933988097110502), ('yen', -1.9933621

21570
['rice', 'grain'] [('grain', -0.25077642031299563), ('wheat', -0.2493940568018482), ('corn', -0.24936082294942863), ('rice', -0.2492917869791579), ('sugar', -0.2490037561594544), ('money-supply', -0.24884739743876463), ('oilseed', -0.24881830732491728), ('coffee', -0.24874707200681206), ('soybean', -0.24868767014702475), ('rubber', -0.2486755972877909)]
21573
['yen', 'dlr', 'money-fx'] [('money-fx', -1.2024037830492982), ('dlr', -1.190987072631095), ('interest', -1.1864775896838924), ('trade', -1.1861250843259272), ('yen', -1.1843293475171783), ('money-supply', -1.1831495222331911), ('cpi', -1.182848268340764), ('gnp', -1.1826561756388645), ('bop', -1.1826282353778657), ('reserves', -1.182577527547466)]
21575
['ipi'] [('grain', -0.9180788153140651), ('trade', -0.9167792536083302), ('ipi', -0.9167762247175951), ('gnp', -0.9153620465583132), ('cpi', -0.9149077245710896), ('wheat', -0.9148856537808273), ('money-supply', -0.9148392047056088), ('reserves', -0.91483030063196), ('money-