In [1]:
import numpy as np
from collections import defaultdict, Counter
import os.path
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import datetime
from operator import itemgetter

In [489]:
def weight_normalized_cnb(complement_probs, idf, vectorized_text, 
                          prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param idf: dictionary where keys = words and values = (total # docs)/(# of docs in 
                which we see that word)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
     :param normalize_comp: dictionary where keys = labels and values = sum(|log(p)| for 
                            p in complement_probs)
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        prob = prior_probs[label]
        conditional = 0.0
        for word in freq.keys():
            conditional -= (freq[word] * (idf[word]**2) * complement_probs[label][word])
            # The idf[word] term here is a powerful tool for improving accuracy, but why?
        prob += conditional
        labels.append((label, prob))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [439]:
def complement_naive_bayes(complement_probs, idf, vectorized_text, prior_probs):
    '''
    :param complement_probs: dictionary where key = label and values = dictionary where
                            keys = words and values = (# of times word w appears in docs
                            NOT labeled l)/(# of words in documents NOT labeled l)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    for label in prior_probs.keys():
        prob = np.log(prior_probs[label])
        conditional = 0.0
        freq = Counter(vectorized_text)
        for word in freq.keys():
            conditional -= (freq[word] * idf[word] * complement_probs[label][word])
        prob += conditional
        labels.append((label, prob))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [463]:
def multinomial_naive_bayes(conditional_probs, idf, vectorized_text, prior_probs):
    '''
    :param conditional_probs: dictionary where keys = labels and values = dictionary where
                    keys = words and values = P(x|Y)
    :param vectorized_text: words from text that are in valid_words
    :param prior_probs: dictionary where keys = labels and values = the probability
                        of seeing that label in the dataset
    '''
    labels = []
    freq = Counter(vectorized_text)
    for label in prior_probs.keys():
        prob = np.log(prior_probs[label])
        conditional = 0.0
        for word in vectorized_text:
            if conditional_probs[label][word] != 0.0:
                conditional += (freq[word] * idf[word] * conditional_probs[label][word])
        prob += conditional
        labels.append((label, prob))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [5]:
def bayes_accuracy_model(num, number_labels, labels):
    '''
    :param num: the number of the document being checked, so we can check
                the correct labels for it
    :param number_labels: dictionary where keys = number of sample and
                            values = the set of labels associated with
                            that sample
    :param labels: the set of labels computed by Naive Bayes
    rye 0.00012871669455528383
    groundnut-oil 0.00012871669455528383
    cotton-oil 0.00012871669455528383
    castor-oil 0.00012871669455528383
    nkr 0.00012871669455528383
    sun-meal 0.00012871669455528383
    '''
    sample_labels = number_labels[num]
    successes = 0
    earned = 0
    bottom_5_times = 0
    bottom_5 = ['rye', 'groundnut-oil', 'cotton-oil', 'castor-oil', 'nkr', 'sun-meal']
    computed_labels = [x for x,y in labels]
    if "earn" in computed_labels[:3]:
        earned += 1
    computed_labels_trim = computed_labels[:len(sample_labels)]
    for label in bottom_5:
        if label in computed_labels[:5]:
            bottom_5_times += 1
            break
    if all(x in computed_labels_trim for x in sample_labels):
        successes += 1
    else:
        print(num)
        # print(sample_labels, computed_labels[:10])
        print(sample_labels, labels[:10])
        diff = set(sample_labels).difference(set(computed_labels_trim))
        if len(diff) < len(computed_labels_trim):
            successes += (len(diff)/len(computed_labels_trim))
    return [successes,earned, bottom_5_times]

In [6]:
def vectorize_text(stop_words, valid_words, filepath):
    '''
    This function removes non valid words from the text to put it into
    the Naive Bayes classifier
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :param valid_words: dictionary where keys = valid words in the corpus
    :param filepath: path to the text file
    :return: a vector of text stripped of stop words and non-valid words
    '''
    with open(filepath, "r") as f:
        content = f.read()
        words = nltk.word_tokenize(content)
        words = [word.lower() for word in words]
        new_words = [word.lower() for word in words if word in valid_words]
    return new_words

In [7]:
def cosine_similarity(avg_tf_idf, tf_idf_vector):
    '''
    This function takes the average TF-IDF vector for 
    every unique label and computes the cosine similarity between in
    and the tf-idf vector for a given sample. 
    :param avg_tf_idf: dictionary where keys = labels and values = dictionary
                        where keys = words and values = the average tf-idf score
                        for that term in documents with that specific label
    :param tf_idf_vector: numpy array 
    '''
    labels = []
    for label in avg_tf_idf.keys():
        # Cosine similarity = (a * b)/(|a| * |b|)
        # Higher cosine similarity = more similar documents
        vector = np.asarray(list(avg_tf_idf[label].values()))
        similarity = np.dot(vector, tf_idf_vector)
        mag_a = np.sqrt(np.dot(vector, vector))
        mag_b = np.sqrt(np.dot(tf_idf_vector, tf_idf_vector))
        denom = np.dot(mag_a, mag_b)
        similarity /= denom
        labels.append((label, similarity))
    return sorted(labels, key=itemgetter(1), reverse=True)

In [8]:
def compute_total_word_frequencies(dir_path, valid_words):
    '''
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param st: Lancaster Stemmer object 
    :return: a dictionary where keys = words and values = # of documents in which
            that word appears 
    '''
    frequencies = {word: 0 for word in valid_words}
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            num = int(file[0:len(file) - 4]) 
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words if word not in stop_words]
            new_words = [word.lower() for word in new_words if word in valid_words.keys()]
            new_words = set(new_words)
            for word in new_words:
                frequencies[word] += 1
    return frequencies

In [9]:
def compute_tf_idf_by_label(tf_idf, prior_probs, number_labels):
    '''
    This function will compute the total tf_idf score for
    each individual label
    :param tf_idf: a dictionary where keys = number of document and values = 
                    dictionary where keys = words and values = the tf_idf score 
                    of that word in that document
    :param prior_probs: a dictionary where keys = labels and values = the prob
                        of seeing that label (only used so I can grab the unique
                        labels for the document set)
    :param number_labels: a dictionary where the keys = numbers of a document and values
                            = the set of labels associated with it
    :return: a dictionary where keys = labels and values = sum of all tf-idf scores for
            all words that are in that label
    '''
    total_tf_by_label = {label: 0.0 for label in prior_probs.keys()}
    for num, vector in tf_idf.items():
        labels = number_labels[num]
        for l in labels:
            total_tf_by_label[l] += sum(list(vector.values()))
    return total_tf_by_label

In [10]:
def get_valid_words(dir_path, stop_words):
    '''
    Utility function that determines the set of valid words 
    to be used for classification and probability calculation
    :param dir_path: a path to the directory containing 
                    all the training samples
    :param stop_words: a set of words like "the", "and", etc
                        that should be stripped out of any computations
    :return: a Python dictionary where the keys = valid words and the 
            values = True, so we can use "key in dict" for future access
            in guaranteed constant time
    '''
    valid_words = defaultdict(bool)
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word.isalpha()]
            new_words = [word.lower() for word in new_words if word not in stop_words]
            new_words = set(new_words)
            for word in new_words:
                valid_words[word] = True
    return valid_words
            

In [11]:
def add_labels_to_samples(filename):
    '''
    This function iterates over the file containing all 
    labels for each numbered sample, and maps them together with
    a dictionary
    :param filename: path to the file with all the labels in it (assumes
                    the file is located in this directory)
    :return: a dictionary with keys = number of the training sample and
            values = the set of labels associated with it
            AND
            the same, but with the test samples. Keep them separate for easy
            access later
    '''
    number_labels_training = defaultdict(list)
    number_labels_test = defaultdict(list)
    with open(filename, "r") as f:
        for line in f:
            terms = line.split()
            if line[0:4] == "test":
                num = int(terms[0][5:len(terms[0])])  # Test number, so we can map this back to the proper label(s) later on
                number_labels_test[num] = terms[1:]
            else:
                num = int(terms[0][9:len(terms[0])])  
                number_labels_training[num] = terms[1:]
    return [number_labels_training, number_labels_test]

In [12]:
def compute_prior_probabilities(number_labels):
    '''
    This function will compute the prior probabilities
    P(y) = probability of seeing a label with a sample. 
    Note: since many samples have multiple labels, these prior
    probabilites will sum to > 1
    :param number_labels: dictionary where keys = number of training sample
                            and value = the list of labels associated with it
    :return: a dictionary where keys = the label and value = probability of seeing
            that label in the document list
    '''
    prior_probs = defaultdict(float)
    i = 0
    for num, labels in number_labels.items():
        for l in labels:
            if not prior_probs[l]:
                prior_probs[l] = 1
            else:
                prior_probs[l] += 1
        i += 1
    for label, freq in prior_probs.items():
        prior_probs[label] /= i
    return prior_probs

In [13]:
def word_vectors_for_mega_docs(dir_path, valid_words, number_labels, label_list):
    '''
    This function returns a list of all the words from documents
    of each label
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param number_labels: dictionary where keys = document # and values = the set of 
                            labels associated with those labels
    :param label_list: list of all the unique labels
    :return: a dictionary where keys = labels and values = a dictionary where 
            keys = words and values = a vector with all the valid words in 
            documents with that label
            AND
            the total # of words in the entire corpus
            AND
            a dictionary where keys = words and values = idf scores of those words
    '''
    mega_docs = {label: [] for label in label_list}
    total_words = 0
    idf = {word: 0 for word in valid_words}
    i = 0
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            i += 1
            content = f.read()
            num = int(file[0:len(file) - 4])
            labels = number_labels[num]
            words = nltk.word_tokenize(content)
            words = [word.lower() for word in words]
            new_words = [word.lower() for word in words if word in valid_words]
            unique_words = set(new_words)
            for word in unique_words:
                idf[word] += 1
            total_words += len(new_words)
            for l in labels:
                mega_docs[l] += new_words
    for word in idf.keys():
        idf[word] = 1 + np.log(i/(idf[word] + 1))
    return [mega_docs, total_words, idf]

In [17]:
def rename_files(dir_path):
    '''
    Utility function designed to rename all files in any directory
    to a .txt file so they can be read from
    :param dir_path: directory of the files to be renamed
    '''
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        os.rename(filepath, filepath+".txt")

In [55]:
def compute_frequencies_by_class(mega_docs, valid_words, label_list):
    '''
    This function computes the frequencies of all words by class. This is done because
    the outright frequencies are needed for Naive Bayes and conditional_probs can easily
    be obtained from this by dividing each entry by the number of elements in each "mega doc"
    :param mega_docs: a dictionary where keys = labels and values = vectors of all the
                        valid words present in documents with that label
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param label_list: list of all unique labels in the dataset
    :return: a dictionary where keys = labels and values = dictionary where keys = words
                and values = frequencies of that word in docs with that label
            AND
            a dictionary where keys = words and values = the total frequency of those words
            all documents throughout the corpus
    '''
    frequencies = {label: {word: 0 for word in valid_words} for label in label_list}
    total_frequencies = {word:0 for word in valid_words}
    for label, vector in mega_docs.items():
        freq = Counter(vector)
        for word in freq.keys():
            frequencies[label][word] += freq[word]
            total_frequencies[word] += freq[word]
    return [frequencies, total_frequencies]

In [474]:
def compute_word_frequencies(dir_path, valid_words, number_labels, label_list):
    '''
    This function will iterate over the documents and compute the frequencies of 
    the words by label and in total
    :param dir_path: a path to the directory containing all the training samples
    :param valid_words: a dictionary where the keys are all the unique, valid
                        terms are present in the text file
    :param number_labels: dictionary where keys = document # and values = the set of 
                            labels associated with those labels
    :param label_list: list of all the unique labels
    :return: a dictionary where keys = labels and values = dictionary where keys 
            = words and values = the frequencies of that word in documents with that label
            AND 
            a dictionary where keys = words and values = the total # of occurrences of
            that word
            AND
            a dictionary where keys = words and values = the idf score for that word
            AND 
            a dictionary where keys = labels and values = the total # of words associated with that label
            AND
            the total # of valid words in the entire corpus
    '''
    frequencies = {label: {word: 0.0 for word in valid_words} for label in label_list}
    idf = {word: 0.0 for word in valid_words}
    total_frequencies = {word: 0 for word in valid_words}
    total_word_count_by_label = {label: 0 for label in label_list}
    i = 0
    total_num_words = 0
    for file in os.listdir(dir_path):
        with open(dir_path + '\\' + file, "r") as f:
            content = f.read()
            num = int(file[0:len(file) - 4])
            labels = number_labels[num]
            words = nltk.word_tokenize(content)
            new_words = [word.lower() for word in words]
            new_words = [word.lower() for word in new_words if word in valid_words]
            total_num_words += len(new_words)
            freq = Counter(new_words)
            length_normalize = np.sqrt(sum([score**2 for score in freq.values()]))
            for l in labels:
                total_word_count_by_label[l] += len(new_words)
                for word in freq.keys():
                    frequencies[l][word] += (freq[word]/length_normalize)
                    total_frequencies[word] += (freq[word]/length_normalize)
                    idf[word] += 1
            i += 1
    for word in idf.keys():
        idf[word] = 1 + np.log(i/(idf[word]+1))
    return [frequencies, total_frequencies, idf, total_word_count_by_label, total_num_words]

In [475]:
if __name__ == '__main__':
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\training"
    stop_words = set(stopwords.words('english'))
    valid_words = get_valid_words(dir_path, stop_words)
    number_labels_training, number_labels_test = add_labels_to_samples("cats.txt")
    prior_probs = compute_prior_probabilities(number_labels_training)
    
    parameters = compute_word_frequencies(dir_path, valid_words, number_labels_training, prior_probs.keys())
    
    frequencies = parameters[0]
    total_frequencies = parameters[1]
    idf = parameters[2] 
    total_word_count_by_label = parameters[3]
    total_num_words = parameters[4]

In [476]:
    conditional_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    complement_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    for label, vector in conditional_probs.items():
        denom = total_num_words - total_word_count_by_label[label] + len(valid_words.keys())
        for word in vector.keys():
            mod_cond_freq = frequencies[label][word] + 1
            mod_comp_freq = (total_frequencies[word] - frequencies[label][word]) + 1
            conditional_probs[label][word] = np.log(mod_cond_freq/(total_word_count_by_label[label] + len(valid_words.keys())))
            complement_probs[label][word] = np.log(mod_comp_freq/denom)

In [477]:
    '''
    mega_docs, total_num_words, idf = word_vectors_for_mega_docs(dir_path, valid_words, number_labels_training, prior_probs.keys())
    frequencies, total_frequencies = compute_frequencies_by_class(mega_docs, valid_words, prior_probs.keys())
    tf_idf_by_label = {label: {word: 0 for word in valid_words} for label in prior_probs.keys()}
    tf_idf_total = {word: 0 for word in valid_words}
    for label, vector in tf_idf_by_label.items():
        for word in vector.keys():
            tf_idf_by_label[label][word] = (np.log(frequencies[label][word]+1) * idf[word])
            tf_idf_total[word] += (np.log(frequencies[label][word]+1) * idf[word])
    for label, vector in tf_idf_by_label.items():
        if label != "earn":
            continue
        print("Label:", label, len(mega_docs[label]))
        for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):
            if score == 0.0:
                continue
            print(word, score, tf_idf_total[word])
        print("\n")
    conditional_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    complement_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    for label, vector in conditional_probs.items():
        denom = total_num_words - len(mega_docs[label]) + len(valid_words.keys())
        for word in vector.keys():
            conditional_probs[label][word] = np.log((tf_idf_by_label[label][word]+1)/(len(mega_docs[label]) + len(valid_words.keys())))
            # Odd, the values of complement_probs are the same regardless of the word, why is that
            # print(label, word, total_frequencies[word], frequencies[label][word])
            complement_probs[label][word] = np.log((tf_idf_total[word] - tf_idf_by_label[label][word] + 1)/denom)
    conditional_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    complement_probs = {label: {word: 0.0 for word in valid_words} for label in prior_probs.keys()}
    for label, vector in conditional_probs.items():
        denom = total_num_words - len(mega_docs[label]) + len(valid_words.keys())
        for word in vector.keys():
            conditional_probs[label][word] = np.log((frequencies[label][word]+1)/(len(mega_docs[label]) + len(valid_words.keys())))
            # Odd, the values of complement_probs are the same regardless of the word, why is that
            # print(label, word, total_frequencies[word], frequencies[label][word])
            complement_probs[label][word] = np.log((total_frequencies[word] - frequencies[label][word] + 1)/denom)
    '''

'\nmega_docs, total_num_words, idf = word_vectors_for_mega_docs(dir_path, valid_words, number_labels_training, prior_probs.keys())\nfrequencies, total_frequencies = compute_frequencies_by_class(mega_docs, valid_words, prior_probs.keys())\ntf_idf_by_label = {label: {word: 0 for word in valid_words} for label in prior_probs.keys()}\ntf_idf_total = {word: 0 for word in valid_words}\nfor label, vector in tf_idf_by_label.items():\n    for word in vector.keys():\n        tf_idf_by_label[label][word] = (np.log(frequencies[label][word]+1) * idf[word])\n        tf_idf_total[word] += (np.log(frequencies[label][word]+1) * idf[word])\nfor label, vector in tf_idf_by_label.items():\n    if label != "earn":\n        continue\n    print("Label:", label, len(mega_docs[label]))\n    for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):\n        if score == 0.0:\n            continue\n        print(word, score, tf_idf_total[word])\n    print("\n")\nconditional_probs = {label: {word:

In [478]:
    complement_probs_normalized = {label: {word: ((np.exp(complement_probs[label][word])-1) * idf[word]) +1 for word in valid_words} 
                                   for label in prior_probs.keys()}
    for label, vector in complement_probs.items():
        normalize_term = np.sqrt(sum([(complement_probs_normalized[label][word]**2) for word in valid_words]))
        for word in vector.keys():
            complement_probs_normalized[label][word] = complement_probs[label][word] / normalize_term

In [479]:
    '''
    for label, vector in complement_probs.items():
        if label != "earn":
            continue
        print("Label:", label, len(mega_docs[label]), len(vector.keys()))
        for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):
            if score == 0.0:
                continue
            print(word, score)
        print("\n")
    for label, vector in conditional_probs.items():
        if label != "earn":
            continue
        print("Label:", label, len(mega_docs[label]))
        for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):
            if score == 0.0:
                continue
            print(word, score)
        print("\n")
    '''

'\nfor label, vector in complement_probs.items():\n    if label != "earn":\n        continue\n    print("Label:", label, len(mega_docs[label]), len(vector.keys()))\n    for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):\n        if score == 0.0:\n            continue\n        print(word, score)\n    print("\n")\nfor label, vector in conditional_probs.items():\n    if label != "earn":\n        continue\n    print("Label:", label, len(mega_docs[label]))\n    for word, score in sorted(vector.items(), key=itemgetter(1), reverse=True):\n        if score == 0.0:\n            continue\n        print(word, score)\n    print("\n")\n'

In [480]:
    prior_probs_normalized = {label: np.log(prior_probs[label]) for label in prior_probs.keys()}
    normalize = np.sqrt(sum([prior_probs_normalized[label]**2 for label in prior_probs.keys()]))
    for label, score in sorted(prior_probs_normalized.items(), key=itemgetter(1), reverse=True):
        prior_probs_normalized[label] /= normalize
        print(label, prior_probs_normalized[label], prior_probs[label])

earn -0.016801392668380023 0.37031793023555154
acq -0.0262046336588211 0.2123825460162183
money-fx -0.045158706616913784 0.0692495816707427
grain -0.04883089979096004 0.055734328742437896
crude -0.05064328363162351 0.050070794182005406
trade -0.05158190163178988 0.04736774359634444
interest -0.05257568608534391 0.04466469301068349
wheat -0.060909437193724554 0.02728793924572017
ship -0.062150565838972896 0.025357188827390912
corn -0.06349003581474896 0.023426438409061657
money-supply -0.0679274371065068 0.018020337237739735
dlr -0.0690512318067612 0.01686188698674218
sugar -0.0697094133570805 0.016218303513965762
oilseed -0.06998002923345899 0.015960870124855194
coffee -0.07185318161560161 0.014287553095636504
gnp -0.07344994501886726 0.013000386150083665
gold -0.07466474310889995 0.012099369288196679
veg-oil -0.07597359392282031 0.011198352426309692
soybean -0.07782049590167017 0.010039902175312138
livestock -0.07848384097865034 0.009653752091646286
nat-gas -0.07848384097865034 0.0096

In [488]:
    # Removing the stemmer actually improves accuracy on test set, who knew
    successes, earned, bottom_5,i = 0, 0, 0, 0
    dir_path = "C:\\Users\\ksing\\OneDrive\\Documents\\Text Classifiers\\test"
    for file in os.listdir(dir_path):
        filepath = dir_path + '\\' + file 
        num = int(file[0:len(file) - 4])
        text = vectorize_text(stop_words, valid_words, filepath)
        # computed_labels = complement_naive_bayes(complement_probs, idf, text, prior_probs)
        # computed_labels = multinomial_naive_bayes(conditional_probs, idf, text, prior_probs)
        computed_labels = weight_normalized_cnb(complement_probs_normalized, idf, text, 
                                               prior_probs_normalized)
        suc, e, b5 = bayes_accuracy_model(num, number_labels_test, computed_labels)
        # Even with using conditional_probs, earn appears in 1773/3019 samples
        
        # CNB brought earn labels down to 1170/3019, which is the best improvement so far
        
        # The slower the denominator function grows, the less we see bottom 5 labels appear in the top 5
        # This occurs (I think) because labels with less docs = larger denominator term = smaller number inside log
        # = more negative logarithm output = (freq * compl_prob[label][word]) is disproportionately smaller for smaller
        # classes. This function returns an argmin, which means that super negative terms are more likely to float to the
        # top, like the bottom 5 labels
                
        # Multinomial Naive Bayes: 84.09% (2538.627561327562) accuracy on test set (????), 1773 "Earn" labels
        # Complement Naive Bayes: 85.09% (2568.913203463204) accuracy on test set, 1687 "Earn" labels
        # Weight Normalized CNB w/ TF-IDF transformation: 76.35% (2304.986291486292), 2131 "Earn" labels
        # (I FORGOT HOW I GOT THIS AHHHHH)
        # CNB with IDF transformation: 86.76% (2619.324711399711), 1321 "Earn" labels
        # MNB with IDF transformation: 84.87% (2562.234704184704), 1521 "Earn" labels
        
        # CNB with doc length normalization: 88.92% accuracy(2684.349675324675), 1242 "Earn" labels
            # Adding the IDF transformation to CNB drops the accuracy a few percent
        # MNB with doc length normalization and IDF: 85.57% accuracy (2583.5922438672446), 1761 "Earn" labels
                # When length normalization is used with MNB it screws up the accuracy a LOT (down to around 2000)
        # WCNB with doc length normalization and IDF^2: 86.41% accuracy (2608.605230880231), 1312 "Earn" labels
        
        # Perhaps the reason that TF doesn't lead to improvements with this is because we already stripped out the 
        # stop words, which would be affected the most by this technique
        # It appears as if there isn't much dependence within the documents in the Reuters dataset because
        # the weight normalization term doesn't appear to do much.
        successes += suc
        earned += e
        bottom_5 += b5
        i += 1
    print(successes, earned, bottom_5, i)

14828
['grain'] [('cpi', 23.86781813294042), ('grain', 23.867136092260232), ('livestock', 23.858603288696308), ('ship', 23.85769775117297), ('gnp', 23.85755660183207), ('wheat', 23.856667131912566), ('corn', 23.856105970286897), ('cotton', 23.855730132974077), ('barley', 23.855024456387504), ('sugar', 23.85470296153643)]
14829
['nat-gas', 'crude'] [('trade', 30.44535828406691), ('nat-gas', 30.420032425946356), ('dlr', 30.405591190513892), ('cpi', 30.39199544034739), ('crude', 30.391325447029885), ('yen', 30.383975694222077), ('alum', 30.376791923603307), ('gas', 30.375305601927323), ('veg-oil', 30.373307620886724), ('iron-steel', 30.3732384494301)]
14832
['rubber', 'tin', 'sugar', 'corn', 'rice', 'grain', 'trade'] [('bop', 24.818113235244724), ('sugar', 24.801814333985398), ('rubber', 24.798950288964367), ('tin', 24.793732580444285), ('trade', 24.787368054716126), ('money-supply', 24.783867475043976), ('cpi', 24.773776742237253), ('rice', 24.765010287381873), ('gold', 24.76035496453807

15033
['corn', 'grain'] [('corn', 31.569814534642553), ('copper', 31.56154538062904), ('grain', 31.546062858373176), ('money-fx', 31.527329621328068), ('cpi', 31.527134147338565), ('meal-feed', 31.52680154402297), ('veg-oil', 31.525659338966822), ('retail', 31.525404151845734), ('coffee', 31.524958435642098), ('interest', 31.523252560091116)]
15049
['interest'] [('money-fx', 19.521267265843704), ('interest', 19.48154324480942), ('money-supply', 19.270111103847626), ('dlr', 19.21626114149918), ('cpi', 19.21082626075957), ('reserves', 19.20553257553558), ('gold', 19.20406310847715), ('livestock', 19.200160771308536), ('carcass', 19.199835326316087), ('jobs', 19.1994904627163)]
15063
['earn', 'acq', 'crude', 'pet-chem'] [('pet-chem', 204.93729609695222), ('acq', 204.71860298946882), ('alum', 204.64485092702512), ('iron-steel', 204.61303868997248), ('silver', 204.6063401225014), ('lumber', 204.60355386848184), ('cpu', 204.6031226384894), ('rubber', 204.60147320441746), ('lei', 204.60070697

15314
['earn'] [('iron-steel', 36.322534626647624), ('gold', 36.313171353725004), ('strategic-metal', 36.312413279996086), ('copper', 36.30964208553214), ('orange', 36.30945649979024), ('gas', 36.30514223974551), ('alum', 36.3047959275419), ('barley', 36.30035797055533), ('acq', 36.300306124449584), ('zinc', 36.29971520744684)]
15322
['crude', 'nat-gas'] [('nat-gas', 37.03749062962854), ('carcass', 36.98678472111021), ('livestock', 36.97904115244162), ('cocoa', 36.976570332852084), ('veg-oil', 36.97500369683594), ('rubber', 36.97009949816563), ('iron-steel', 36.965627590458595), ('gas', 36.96465834147033), ('ship', 36.96387635584251), ('fuel', 36.95773006610593)]
15324
['earn'] [('acq', 14.857055535284495), ('ship', 14.740814123452566), ('interest', 14.740381729528638), ('coffee', 14.735965234101304), ('oilseed', 14.727527850890313), ('soybean', 14.725258933963605), ('cpi', 14.72519271838733), ('gold', 14.725056208966802), ('veg-oil', 14.724641564601384), ('dlr', 14.724521374895284)]
1

15556
['lead'] [('alum', 157.26072582374766), ('copper', 156.95119839447793), ('lead', 156.90937059644025), ('zinc', 156.85552255356376), ('ship', 156.8321092081764), ('silver', 156.77881700498463), ('meal-feed', 156.77456542011285), ('nickel', 156.76928013511088), ('strategic-metal', 156.75789582899202), ('barley', 156.74853729878933)]
15562
['dlr'] [('money-fx', 17.24218167106307), ('interest', 17.109228654735457), ('dlr', 17.099725669078392), ('reserves', 17.084158162954814), ('ship', 17.080002286571744), ('coffee', 17.077839638000142), ('money-supply', 17.07548623403796), ('yen', 17.075106931647053), ('gnp', 17.07347935557697), ('veg-oil', 17.070786850033617)]
15563
['lead'] [('copper', 9.492895293794122), ('interest', 9.410864116974063), ('veg-oil', 9.406601814839131), ('corn', 9.402899033059812), ('gold', 9.40151438552291), ('coffee', 9.40093110423421), ('grain', 9.400644567372483), ('sugar', 9.399305957138353), ('oilseed', 9.399092334595245), ('wheat', 9.398805525657036)]
15567


15749
['earn'] [('crude', 5.322452368175061), ('earn', 5.267565623758863), ('acq', 5.250039014280451), ('interest', 5.226088631735945), ('ship', 5.22241978034184), ('coffee', 5.217392287499911), ('grain', 5.2166475888321795), ('dlr', 5.215260179034346), ('wheat', 5.215056833287697), ('oilseed', 5.215039819049025)]
15751
['soy-oil', 'veg-oil'] [('grain', 8.055668634462975), ('wheat', 8.022804880772167), ('corn', 7.9894313339673735), ('veg-oil', 7.98894661951685), ('oilseed', 7.974572995540548), ('sugar', 7.965622236024333), ('soybean', 7.963634502691762), ('ship', 7.9608230607372175), ('livestock', 7.9606941988215745), ('money-supply', 7.954738882101193)]
15767
['lead'] [('crude', 11.318940958876027), ('corn', 11.310931485586858), ('copper', 11.308045241191206), ('grain', 11.305831508589144), ('ship', 11.299223525397041), ('interest', 11.29579146546792), ('nat-gas', 11.294954377353994), ('wheat', 11.294120002676781), ('oilseed', 11.292553828460177), ('gold', 11.292213758650098)]
15768
[

15921
['soybean', 'oilseed'] [('grain', 14.899291531623804), ('oilseed', 14.885527747167435), ('corn', 14.87626945976051), ('soybean', 14.875065825830834), ('wheat', 14.86566353658528), ('veg-oil', 14.850302592390038), ('coffee', 14.846469100518648), ('sugar', 14.84542129232979), ('gas', 14.830637498420392), ('livestock', 14.829215631252332)]
15922
['soy-meal', 'meal-feed'] [('oilseed', 17.26486485224858), ('soybean', 17.250185814603192), ('grain', 17.24501649323839), ('corn', 17.242803407215128), ('meal-feed', 17.238947187649572), ('veg-oil', 17.237211743732274), ('wheat', 17.221213083335975), ('coffee', 17.214905032660514), ('rubber', 17.20994090013074), ('sugar', 17.209535913633463)]
15923
['soy-oil', 'veg-oil'] [('oilseed', 18.34436148799845), ('soybean', 18.340361679578393), ('grain', 18.340320374670686), ('corn', 18.337400712222653), ('veg-oil', 18.333689921541417), ('wheat', 18.32336498465185), ('coffee', 18.32026038283999), ('rubber', 18.31964197354746), ('sugar', 18.3182710848

16108
['cpi', 'gnp', 'bop'] [('interest', 47.62947502915984), ('gnp', 47.55762690583172), ('bop', 47.47743743358071), ('lei', 47.46757690829595), ('cpi', 47.454315818545936), ('jobs', 47.45238065919167), ('money-supply', 47.4463192883662), ('reserves', 47.441420058817585), ('rice', 47.44000684576462), ('alum', 47.437200235276336)]
16112
['earn'] [('crude', 113.68619733834807), ('nat-gas', 113.46104704223704), ('jet', 113.19679948291528), ('zinc', 113.12725832790808), ('fuel', 113.12515549232357), ('heat', 113.12504897728992), ('tin', 113.11713957572154), ('pet-chem', 113.1166016038806), ('dmk', 113.11359631511478), ('hog', 113.11342940867223)]
16117
['acq'] [('gold', 21.073875467603205), ('acq', 21.064928159812677), ('silver', 20.999173738287936), ('copper', 20.971754331727475), ('livestock', 20.960219049471522), ('money-supply', 20.958281968927285), ('wheat', 20.957659638795427), ('barley', 20.956850378783496), ('reserves', 20.956109762932332), ('jobs', 20.9554836399788)]
16118
['reta

16278
['earn'] [('acq', 52.618415197949446), ('earn', 52.53220821506334), ('alum', 52.40720517049483), ('copper', 52.406166118022526), ('zinc', 52.399639539825145), ('barley', 52.398431668578745), ('gold', 52.39731126721268), ('silver', 52.39703192401396), ('lead', 52.396062611362005), ('orange', 52.39574981088874)]
16281
['lei'] [('interest', 3.098132601048749), ('money-fx', 3.0870153394749766), ('trade', 3.070747655879629), ('grain', 3.0685278113398677), ('money-supply', 3.0678946565440923), ('crude', 3.066498100794241), ('gnp', 3.062945550554907), ('wheat', 3.0626261390726444), ('corn', 3.062295483291318), ('ship', 3.06141898467035)]
16289
['lead'] [('copper', 9.492895293794122), ('interest', 9.410864116974063), ('veg-oil', 9.406601814839131), ('corn', 9.402899033059812), ('gold', 9.40151438552291), ('coffee', 9.40093110423421), ('grain', 9.400644567372483), ('sugar', 9.399305957138353), ('oilseed', 9.399092334595245), ('wheat', 9.398805525657036)]
16312
['earn'] [('interest', 15.84

16784
['rice', 'orange', 'carcass', 'trade'] [('orange', 103.70212394572367), ('livestock', 103.59164746692234), ('carcass', 103.58396056417284), ('rice', 103.41770657017753), ('trade', 103.40194779992349), ('coffee', 103.38265322817415), ('hog', 103.37460954497732), ('yen', 103.36791767739926), ('cotton', 103.36293075788522), ('dlr', 103.358931485923)]
16790
['grain', 'trade'] [('trade', 47.01821009860023), ('carcass', 46.93222304416183), ('livestock', 46.92163215544078), ('orange', 46.90763648151507), ('grain', 46.83700404561379), ('wheat', 46.81908155834447), ('rice', 46.8134699785616), ('veg-oil', 46.81237779441657), ('cotton', 46.80286353070065), ('oilseed', 46.79724692331497)]
16826
['earn'] [('acq', 13.634551982973372), ('earn', 13.525328403243865), ('money-supply', 13.437511098390418), ('gold', 13.435285696517512), ('dlr', 13.434411956182135), ('veg-oil', 13.432242790360803), ('reserves', 13.432054881694638), ('cpi', 13.430907345430864), ('livestock', 13.430261701803246), ('cop

17670
['lumber', 'trade'] [('lumber', 64.57282203056468), ('veg-oil', 64.45987714998346), ('gold', 64.45947687429202), ('corn', 64.45376567587999), ('iron-steel', 64.4507834287718), ('meal-feed', 64.4417741675997), ('grain', 64.44109099041398), ('rubber', 64.43696855736376), ('wheat', 64.43559379412619), ('orange', 64.43538508438277)]
17682
['acq'] [('earn', 19.337917115434834), ('acq', 19.337212981388486), ('nat-gas', 19.230673180164324), ('ipi', 19.230028652094223), ('copper', 19.22896380221172), ('livestock', 19.22793375397455), ('money-supply', 19.227841831829533), ('gold', 19.227350801257586), ('veg-oil', 19.226687541808204), ('jobs', 19.22650600636074)]
17722
['corn', 'wheat', 'grain', 'soybean'] [('grain', 4.408833866205818), ('corn', 4.396160349816062), ('oilseed', 4.388427570999111), ('wheat', 4.38275828390561), ('soybean', 4.355645413325357), ('interest', 4.311138802437983), ('ship', 4.3109850075779335), ('money-supply', 4.310267607345119), ('dlr', 4.3072857725035485), ('suga

17950
['acq'] [('pet-chem', 48.21764997939753), ('alum', 48.191137466811085), ('iron-steel', 48.19040723065274), ('barley', 48.189543396617935), ('housing', 48.18718610667132), ('zinc', 48.18643769184245), ('copper', 48.185923227387256), ('strategic-metal', 48.18505352297753), ('hog', 48.18447008347409), ('lead', 48.184322194576026)]
17962
['rice', 'wheat', 'grain'] [('ship', 49.092768757113596), ('carcass', 49.04119662565844), ('livestock', 49.03820702126601), ('sugar', 49.03278009961493), ('iron-steel', 49.03110884630977), ('gas', 49.01979771952113), ('copper', 49.017643510923016), ('rice', 49.01664992868007), ('alum', 49.01542375141187), ('veg-oil', 49.015040042254)]
17966
['interest', 'reserves'] [('interest', 148.4487700566378), ('money-fx', 148.29938422260383), ('reserves', 148.08194196708348), ('money-supply', 148.06958535796736), ('dmk', 148.06782100761342), ('yen', 148.06584147261972), ('dlr', 148.06573643445512), ('silver', 148.06457601085717), ('lei', 148.06456576426172), ('

18317
['copper', 'acq'] [('copper', 91.60923924177486), ('zinc', 91.09898618112014), ('gold', 91.0956365700129), ('lead', 91.09362527266924), ('rubber', 91.09189100156053), ('iron-steel', 91.0874774755664), ('pet-chem', 91.08679946182173), ('alum', 91.0798860681512), ('silver', 91.0757149228464), ('nickel', 91.0729283898588)]
18321
['reserves'] [('money-fx', 9.816652945447563), ('interest', 9.772268329955784), ('gold', 9.762603611170185), ('money-supply', 9.762190263016485), ('reserves', 9.754766596455417), ('earn', 9.74790388524623), ('dlr', 9.741634040083937), ('acq', 9.741120580113208), ('corn', 9.739522751673134), ('cpi', 9.73921996368452)]
18337
['zinc', 'lead'] [('lead', 38.92782862627193), ('alum', 38.875245468154795), ('ship', 38.86695784832109), ('gold', 38.8393606227437), ('copper', 38.814079914442765), ('zinc', 38.81037689665409), ('iron-steel', 38.79617641882166), ('silver', 38.788456141927135), ('livestock', 38.78681007109246), ('carcass', 38.78645838009278)]
18345
['rapes

18665
['income'] [('money-supply', 22.84174380978974), ('income', 22.815267840269517), ('wheat', 22.811792627118), ('jobs', 22.809262699585286), ('copper', 22.8083817306192), ('gnp', 22.806906226000564), ('bop', 22.805321610093156), ('cpi', 22.802329308035894), ('ipi', 22.798713790153553), ('reserves', 22.797303742589577)]
18670
['interest'] [('money-fx', 7.476665673710195), ('interest', 7.4756612034662355), ('money-supply', 7.439556933072823), ('dlr', 7.434727879700481), ('ship', 7.425813467917816), ('wheat', 7.424081586235414), ('corn', 7.423716919651017), ('oilseed', 7.423708269119098), ('sugar', 7.422608270346409), ('veg-oil', 7.421272560813717)]
18672
['income', 'jobs', 'reserves', 'interest'] [('interest', 102.04572227730273), ('jobs', 101.82108202004115), ('money-supply', 101.7646297581329), ('money-fx', 101.73180085661987), ('cpi', 101.72357712158481), ('ipi', 101.70348484450554), ('housing', 101.70087358817297), ('dlr', 101.69979234893377), ('bop', 101.69769926973018), ('reser

19028
['dlr', 'money-fx', 'money-supply'] [('money-supply', 103.97926099962396), ('interest', 103.88803544328518), ('money-fx', 103.80887996143504), ('housing', 103.72774109255661), ('lei', 103.70781292033296), ('cpi', 103.70388821847148), ('dlr', 103.70041815364924), ('gnp', 103.69480745580282), ('ipi', 103.69124707232929), ('jobs', 103.68962560287682)]
19055
['grain', 'ship'] [('ship', 36.0080210946506), ('tin', 35.41403797740301), ('veg-oil', 35.409282983729426), ('oilseed', 35.40903123304196), ('copper', 35.406678806493126), ('livestock', 35.40249071839808), ('jobs', 35.40040573336281), ('carcass', 35.399157171538256), ('wheat', 35.39800512168544), ('barley', 35.39758759159482)]
19057
['soybean', 'oilseed'] [('grain', 12.691562099226735), ('ship', 12.672700175963953), ('oilseed', 12.669214998582017), ('corn', 12.663726495692163), ('soybean', 12.647563026867417), ('wheat', 12.64598302392131), ('sugar', 12.62591198255325), ('veg-oil', 12.617874546939632), ('money-supply', 12.60988506

19444
['bop', 'trade'] [('money-fx', 35.141500751353554), ('bop', 35.12933504190211), ('reserves', 35.11079595915024), ('money-supply', 35.10739212308625), ('interest', 35.10527918595039), ('jobs', 35.09791237419651), ('dlr', 35.091177334846016), ('gold', 35.09066415277953), ('veg-oil', 35.08935767052642), ('copper', 35.08770162783822)]
19477
['zinc', 'lead'] [('alum', 21.517373012100865), ('lead', 21.478384451186333), ('ship', 21.459535361205504), ('gold', 21.456448986614532), ('copper', 21.415416353307727), ('zinc', 21.402965841170783), ('silver', 21.381359449222852), ('livestock', 21.380138840085884), ('carcass', 21.379102955014133), ('iron-steel', 21.378184633546457)]
19478
['gnp'] [('housing', 132.02365661090607), ('cpi', 132.0105828433977), ('gnp', 132.0039384343474), ('jobs', 131.95946440567684), ('lei', 131.95899390546475), ('dlr', 131.94350799564037), ('ipi', 131.93739317906042), ('retail', 131.93590337381102), ('lumber', 131.92510866198546), ('yen', 131.9198297066379)]
19489


19835
['cotton', 'rice', 'grain'] [('cotton', 28.314792093110174), ('rice', 28.234972159968827), ('veg-oil', 28.222837209956523), ('wheat', 28.218427953504737), ('grain', 28.216703151501736), ('palm-oil', 28.213369306948536), ('ship', 28.211601578330843), ('livestock', 28.201401754278304), ('barley', 28.200383489649234), ('oilseed', 28.199288924036022)]
19869
['crude', 'money-fx'] [('crude', 45.57851972341667), ('alum', 45.45231976330203), ('iron-steel', 45.44609227686851), ('copper', 45.358405405721186), ('gas', 45.34887461503174), ('reserves', 45.33132019138241), ('dlr', 45.33001305141784), ('pet-chem', 45.32890903602895), ('cpi', 45.32821669739206), ('barley', 45.324768387527456)]
19903
['pet-chem', 'naphtha', 'crude'] [('crude', 30.73153786819303), ('pet-chem', 30.70853976603261), ('gas', 30.582677325874588), ('gold', 30.57502307271225), ('alum', 30.574110408744833), ('iron-steel', 30.572543980334167), ('nat-gas', 30.571062747682582), ('rubber', 30.564395140733836), ('copper', 30.5

20439
['acq'] [('trade', 18.46278998396292), ('rubber', 18.433273435923137), ('wheat', 18.423556147522298), ('dlr', 18.41529208103859), ('oilseed', 18.411124377027058), ('veg-oil', 18.4101065614409), ('livestock', 18.40968710205411), ('yen', 18.4092264468637), ('carcass', 18.40790624620334), ('grain', 18.40704419262176)]
20441
['carcass', 'trade'] [('coffee', 53.49514404291118), ('trade', 53.46921483041952), ('orange', 53.423703140715226), ('carcass', 53.38951883681753), ('livestock', 53.375895359130766), ('grain', 53.31518182000544), ('corn', 53.30276361318264), ('rice', 53.299824589450296), ('cotton', 53.2985698432233), ('veg-oil', 53.29641062203303)]
20462
['sugar', 'carcass', 'livestock', 'trade'] [('trade', 105.37956408147731), ('carcass', 104.98486880523352), ('veg-oil', 104.97229614980364), ('livestock', 104.96391630406184), ('oilseed', 104.95086874113818), ('sugar', 104.92917728063509), ('cotton', 104.9175497248577), ('groundnut', 104.90539538763362), ('barley', 104.87833397187

20868
['dlr', 'money-fx'] [('money-fx', 148.7545142413451), ('interest', 148.44967091510838), ('dlr', 148.31045763413502), ('money-supply', 148.17153031552522), ('reserves', 148.1563808542238), ('dmk', 148.14903281875766), ('cpi', 148.14103205439616), ('meal-feed', 148.13786545012994), ('yen', 148.1371471914964), ('lei', 148.13415892158045)]
20909
['crude'] [('ship', 26.644168598971586), ('crude', 26.604730226260575), ('sugar', 26.537871384950343), ('coffee', 26.47908141170493), ('livestock', 26.467529826576886), ('veg-oil', 26.467437465417465), ('carcass', 26.465554419856634), ('trade', 26.46507090474098), ('copper', 26.458503613305414), ('barley', 26.45713506392248)]
20911
['coconut-oil', 'palmkernel', 'palm-oil', 'veg-oil', 'soybean', 'oilseed'] [('veg-oil', 67.67521062835236), ('palm-oil', 67.47527304644554), ('coconut-oil', 67.40593030920556), ('oilseed', 67.38261870958381), ('livestock', 67.37537880631862), ('carcass', 67.37260933346886), ('soybean', 67.35314394741107), ('cotton'

21473
['bop'] [('trade', 53.37293445304855), ('bop', 53.30849803351277), ('ship', 53.27609347380298), ('dlr', 53.25973230484313), ('iron-steel', 53.248058963087004), ('alum', 53.24270309923095), ('yen', 53.23997767444981), ('reserves', 53.23959753826586), ('jobs', 53.238225848291904), ('carcass', 53.23510426437745)]
21484
['crude'] [('ship', 14.924898450504429), ('crude', 14.911912094922842), ('coffee', 14.825433955642803), ('money-supply', 14.824935930244852), ('oilseed', 14.823567302855235), ('veg-oil', 14.823014494089339), ('gold', 14.822837504655105), ('corn', 14.822407358105702), ('livestock', 14.82135660228447), ('dlr', 14.82128147643296)]
21486
['ship', 'propane'] [('crude', 148.07598766247486), ('nat-gas', 147.62950934356346), ('ship', 147.6029023314426), ('propane', 147.53506572079672), ('barley', 147.48369124499973), ('palm-oil', 147.4752707409693), ('heat', 147.4472931965914), ('rapeseed', 147.4449532495734), ('meal-feed', 147.44217916403306), ('veg-oil', 147.44130135937962)