In [1]:
import os
import glob
from codecs import encode, decode
from pymongo import MongoClient
import re
from bson.code import Code
from nltk.corpus import stopwords
import math

In [2]:
client = MongoClient('localhost', 27017)
db = client.test
# total number of documents in train/test set
n = 60

In [3]:
client.list_database_names()

['admin', 'config', 'local', 'test']

In [4]:
print(db.list_collection_names())

['TotalCounts', 'WordCounts', 'train', 'test']


In [19]:
def listdir_no_hidden(path):
    return glob.glob(os.path.join(path, '*'))


# def create_content(filename):

#     with open(filename) as f:
#         text = []
#         for line in f:
#             for word in line.strip().split():
#                 word = decode(word.strip(), 'latin2', 'ignore')
#                 text.append(word)
#     return text
def create_content(filename):
    text = []
    with open(filename, 'r', encoding='latin2') as f:
        for line in f:
            for word in line.strip().split():
                text.append(word)
    return text


def create_preprocessed_content(filename):
    """
    :param filename:
    :return: preprocessed text (string)
    """

    # with open(filename) as f:
    #     text = []
    #     for line in f:
    #         new_line = text_to_words(line)
    #         for word in new_line.strip().split():
    #             word = decode(word.strip(), 'latin2', 'ignore')
    #             text.append(word)
    text= []
    with open(filename, 'r', encoding= 'latin2') as f:
        for line in f: 
            new_line = text_to_words(line)
            for word in new_line.strip().split():
                text.append(word)
    return text



def text_to_words(raw_text):
    """
    :param raw text:
    :return: string of words with removed stop words
    """
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    # Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    # Join the words back into one string separated by space,
    # and return the result.
    return " ".join(meaningful_words)

In [20]:
def create_train_collection():  # create test collection with 2 class : X and Y
    
    db.train.drop()
    for file in listdir_no_hidden("../data/train"):
        my_dict = {}
        class_x = 0
        class_y = 0
        my_dict['content'] = create_preprocessed_content(file)
        # my_dict['content'] = create_preprocessed_content(file)
        if "fortnow" in file:
            class_x = 1
        else:
            class_y = 1
        my_dict['classX'] = class_x  #need to turn into 4 classes
        my_dict['classY'] = class_y
        db.train.insert_one(my_dict)


def create_test_collection():           # create test collection with 2 class : X and Y

    db.test.drop()
    for file in listdir_no_hidden("../data/test"):
        my_dict = {}
        class_x = 0
        class_y = 0
        my_dict['content'] = create_preprocessed_content(file)
        # my_dict['content'] = create_preprocessed_content(file)
        if "fortnow" in file:
            class_x = 1
        else:
            class_y = 1
        my_dict['classX'] = class_x #need to turn into 4 classes
        my_dict['classY'] = class_y
        my_dict['predclassX'] = 0
        my_dict['predclassY'] = 0
        db.test.insert_one(my_dict)

In [21]:
def map_reduce_total_counts():
    db.TotalCounts.drop()
    pipeline_one = [
            { #mapper
            "$project": {
                "clX": {
                    "$cond": {
                        "if": { "$eq": ["$classX", 1] },
                        "then": { "$size": "$content" },
                        "else": 0
                    }
                },
                "clY": {
                    "$cond": {
                        "if": { "$ne": ["$classX", 1] },
                        "then": { "$size": "$content" },
                        "else": 0
                    }
                },
                "V": { "$size": "$content" }
            }
        },
        { #reducer
            "$group": {
                "_id": "doc",
                "clX": { "$sum": "$clX" },
                "clY": { "$sum": "$clY" },
                "V": { "$sum": "$V" }
            }
        },
        { #output
            "$out": "TotalCounts"
        }
        ]

    db.train.aggregate(pipeline_one)

In [22]:
def map_reduce_word_counts():
    db.WordCounts.drop()
    pipeline_two = [
        # Step 1: Unwind the content array to create a document for each word
    { 
        "$unwind": "$content" 
    },
    
    # Step 2: Project the required fields with the exact structure
    {
        "$project": {
            "_id": 0,
            "word": "$content",
            "value": {
                "classX": "$classX",
                "classY": "$classY"
            }
        }
    },
    
    # Step 3: Group by word (equivalent to the reduce phase)
    {
        "$group": {
            "_id": "$word",
            "classX": { "$sum": "$value.classX" },
            "classY": { "$sum": "$value.classY" }
        }
    },
    
    # Step 4: Reshape the output to match the original format
    {
        "$project": {
            "_id": 1,
            "value": {
                "classX": "$classX",
                "classY": "$classY"
            }
        }
    },
    
    # Step 5: Write to output collection
    {
        "$out": "WordCounts"
    }
    ]

    db.train.aggregate(pipeline_two)

In [23]:
def map_reduce():
    map_reduce_total_counts()
    map_reduce_word_counts()
    vocabulary = db.TotalCounts.find_one()['V']

    class_x = db.TotalCounts.find_one()['clX']
    class_y = db.TotalCounts.find_one()['clY']

    docs = db.train.find()
    test_docs = db.test.find()
    
    return vocabulary, class_x, class_y, docs, test_docs




In [24]:
def naive_bayes_classifier():
    vocabulary, class_x, class_y, docs, test_docs = map_reduce()
    n = 60.0
    n_class_x = 30.0
    n_class_y = 30.0
    denominator_x = class_x + vocabulary
    denominator_y = class_y + vocabulary
    probability_class_x = math.log10(n_class_x / n)
    probability_class_y = math.log10(n_class_y / n)
    for doc in test_docs:
        sum_x = 0.0
        sum_y = 0.0
        for word in doc['content']:
            dict_two = db.WordCounts.find_one({"_id": word})
            if dict_two is not None:
                word_class_x = dict_two['value']['classX']
                word_class_y = dict_two['value']['classY']
            else:
                word_class_x = 0.0
                word_class_y = 0.0
            sum_x += math.log10((word_class_x + 1.0) / denominator_x)
            sum_y += math.log10((word_class_y + 1.0) / denominator_y)
        x = sum_x + probability_class_x
        y = sum_y + probability_class_y
        if x > y:
            db.test.update_one(doc, {'$set': {'predClassX': 1, 'predClassY': 0}})
        else:
            db.test.find_one_and_update(doc, {'$set': {'predClassX': 0, 'predClassY': 1}})


In [47]:
def confusion_matrix():
    naive_bayes_classifier()
    db.test.aggregate([
        {
        "$project": {
            "emit": {
                "key": "doc",
                "value": {
                    "a": {
                        "$cond": [
                            {"$and": [
                                {"$eq": ["$classX", 1]},
                                {"$eq": ["$predClassX", 1]}
                            ]},
                            1,
                            0
                        ]
                    },
                    "b": {
                        "$cond": [
                            {"$and": [
                                {"$eq": ["$classX", 1]},
                                {"$eq": ["$predClassX", 0]}
                            ]},
                            1,
                            0
                        ]
                    },
                    "c": {
                        "$cond": [
                            {"$and": [
                                {"$eq": ["$classX", 0]},
                                {"$eq": ["$predClassX", 1]}
                            ]},
                            1,
                            0
                        ]
                    },
                    "d": {
                        "$cond": [
                            {"$and": [
                                {"$eq": ["$classX", 0]},
                                {"$eq": ["$predClassX", 0]}
                            ]},
                            1,
                            0
                        ]
                    }
                }
            }
        }
    },
    
    {
    "$group": {
        "_id": "$emit.key",
        "value": {
            "$accumulator": {
                "init": "function() { return {a: 0, b: 0, c: 0, d: 0}; }",
                "accumulate": "function(state, value) { return {a: state.a + value.a, b: state.b + value.b, c: state.c + value.c, d: state.d + value.d}; }",
                "accumulateArgs": ["$emit.value"],
                "merge": "function(state1, state2) { return {a: state1.a + state2.a, b: state1.b + state2.b, c: state1.c + state2.c, d: state1.d + state2.d}; }",
                "lang": "js"
                }
            }
        }
        },
        {
            "$out": "Results"
        }
    ])
    calculated_confusion_matrix = db.Results.find_one()['value']
    return calculated_confusion_matrix
    
    

In [48]:
create_test_collection()

In [49]:
create_train_collection()

In [50]:
db.WordCounts.find_one()


{'_id': 'raz', 'value': {'classX': 3, 'classY': 0}}

In [51]:
a = confusion_matrix()
print(a)

{'a': 26.0, 'b': 4.0, 'c': 3.0, 'd': 27.0}
