In [1]:
import os
import glob
from codecs import encode, decode
from pymongo import MongoClient
import re
from bson.code import Code
from nltk.corpus import stopwords
import math
import pandas as pd 
import numpy as np 
import aggregation_pipeline


ImportError: cannot import name 'SON' from 'bson' (c:\Users\Anh\.conda\envs\data\Lib\site-packages\bson\__init__.py)

# Connect to MongoDB

In [2]:
# Open a client against the MongoDB server on localhost and select the project database.
client = MongoClient(host='localhost', port=27017)
db = client['ag_news_classification']


In [3]:
# Show the databases visible to this client (the cell displays the returned list).
client.list_database_names()

['admin', 'ag_news_classification', 'config', 'local', 'test']

In [4]:
# Print the collections currently present in the selected database.
print(db.list_collection_names())

['TotalCounts', 'WordCounts', 'Results', 'test', 'train']


# Preprocess Input Data


In [5]:
def listdir_no_hidden(path):
    """
    List the entries of a directory, skipping hidden (dot-prefixed) ones.

    :param path: directory to list
    :return: list of paths matching ``<path>/*``
    """
    # glob's "*" does not match names that start with a dot.
    pattern = os.path.join(path, '*')
    matches = glob.glob(pattern)
    return matches

def create_content(filename):
    """
    Read a latin2-encoded text file and return its whitespace-separated tokens.

    :param filename: path of the file to read
    :return: list of tokens in file order (no preprocessing applied)
    """
    with open(filename, 'r', encoding='latin2') as handle:
        # str.split() without arguments already ignores leading/trailing whitespace.
        return [token for row in handle for token in row.split()]


def create_preprocessed_content(filename):
    """
    Read a latin2-encoded text file and clean each line with ``text_to_words``.

    :param filename: path of the file to read
    :return: list of cleaned tokens for the whole file, in file order
    """
    tokens = []
    with open(filename, 'r', encoding='latin2') as handle:
        for raw_line in handle:
            cleaned = text_to_words(raw_line)
            tokens.extend(cleaned.split())
    return tokens



def text_to_words(raw_text):
    """
    Normalise raw text into a space-separated string of lowercase words.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the
    result is lower-cased, and English stop words are removed.

    :param raw_text: text to clean (string)
    :return: the remaining words joined by single spaces
    """
    # Build the stop-word set once and cache it on the function object:
    # this function is called once per document/line, and re-reading the
    # NLTK word list on every call is pure overhead.
    stops = getattr(text_to_words, "_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        text_to_words._stops = stops
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
    # Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # Drop stop words and join the rest back into one string
    return " ".join(w for w in words if w not in stops)

# Create Train and Test Collection


In [6]:
def create_train_collection(path= "../ag_news_data/train"):
    """
    Rebuild ``db.train`` from the files under ``path`` (two-class variant: X and Y).

    The collection is dropped first. One document is inserted per file, with
    the preprocessed tokens in ``content``. Files whose path contains
    ``"fortnow"`` get ``classX = 1``; every other file gets ``classY = 1``.

    :param path: directory holding the training files
    """
    db.train.drop()
    for file_path in listdir_no_hidden(path):
        is_class_x = "fortnow" in file_path
        record = {
            'content': create_preprocessed_content(file_path),
            'classX': 1 if is_class_x else 0,  # need to turn into 4 classes
            'classY': 0 if is_class_x else 1,
        }
        db.train.insert_one(record)


def create_test_collection(path= "../ag_news_data/test"):
    """
    Rebuild ``db.test`` from the files under ``path`` (two-class variant: X and Y).

    The collection is dropped first. One document is inserted per file, with
    the preprocessed tokens in ``content``, the true label in ``classX`` /
    ``classY`` (files whose path contains ``"fortnow"`` are class X) and both
    prediction fields initialised to 0.

    :param path: directory holding the test files
    """
    db.test.drop()
    for file_path in listdir_no_hidden(path):
        is_class_x = "fortnow" in file_path
        record = {
            'content': create_preprocessed_content(file_path),
            'classX': int(is_class_x),  # need to turn into 4 classes
            'classY': int(not is_class_x),
            'predclassX': 0,
            'predclassY': 0,
        }
        db.test.insert_one(record)

In [7]:
def create_train_collection_dataframe(path= "../ag_news_data/train.csv"):
    """
    Rebuild ``db.train`` from a CSV file with ``text`` and ``label`` columns.

    The collection is dropped, then one document per CSV row is inserted:

    * ``content`` -- token list produced by ``text_to_words``
    * ``class0`` .. ``class3`` -- one-hot encoding of the integer ``label``

    :param path: path of the training CSV
    :raises IndexError: if a label is outside the four supported classes
    """
    df = pd.read_csv(path)
    db.train.drop()
    res = []
    # Iterate the two needed columns directly; DataFrame.iterrows() builds a
    # full Series per row, which is needlessly slow for a large CSV.
    for text, label in zip(df["text"], df["label"]):
        classes = [0, 0, 0, 0]
        classes[label] = 1
        res.append({
            'content': text_to_words(text).split(),
            'class0': classes[0],
            'class1': classes[1],
            'class2': classes[2],
            'class3': classes[3],
        })
    # insert_many() rejects an empty document list, so skip it for an empty CSV.
    if res:
        db.train.insert_many(res)

def create_test_collection_dataframe(path= "../ag_news_data/test.csv"):
    """
    Rebuild ``db.test`` from a CSV file with ``text`` and ``label`` columns.

    The collection is dropped, then one document per CSV row is inserted:

    * ``content`` -- token list produced by ``text_to_words``
    * ``class0`` .. ``class3`` -- one-hot encoding of the integer ``label``
    * ``predClass0`` .. ``predClass3`` -- prediction slots, initialised to 0

    :param path: path of the test CSV
    :raises IndexError: if a label is outside the four supported classes
    """
    df = pd.read_csv(path)
    db.test.drop()
    res = []
    # Iterate the two needed columns directly; DataFrame.iterrows() builds a
    # full Series per row, which is needlessly slow for a large CSV.
    for text, label in zip(df["text"], df["label"]):
        classes = [0, 0, 0, 0]
        classes[label] = 1
        my_dict = {'content': text_to_words(text).split()}
        for k in range(4):
            my_dict['class%d' % k] = classes[k]
            # Must be spelled "predClass" (capital C): that is the field name
            # naive_bayes_classifier() later $set-s on these documents.
            my_dict['predClass%d' % k] = 0
        res.append(my_dict)
    # insert_many() rejects an empty document list, so skip it for an empty CSV.
    if res:
        db.test.insert_many(res)

# Map Reduce for Statistics


In [8]:
def map_reduce_total_counts():
    """
    Drop ``db.TotalCounts`` and run the total-counts aggregation over ``db.train``.

    The pipeline comes from ``aggregation_pipeline.get_total_counts_pipeline()``;
    presumably it writes its result back into ``TotalCounts`` -- the pipeline
    definition is not part of this notebook, so verify there.
    """
    db.TotalCounts.drop()
    db.train.aggregate(aggregation_pipeline.get_total_counts_pipeline())

In [9]:
def map_reduce_word_counts():
    """
    Drop ``db.WordCounts`` and run the word-counts aggregation over ``db.train``.

    The pipeline comes from ``aggregation_pipeline.get_word_counts_pipeline()``;
    presumably it writes its result back into ``WordCounts`` -- the pipeline
    definition is not part of this notebook, so verify there.
    """
    db.WordCounts.drop()
    db.train.aggregate(aggregation_pipeline.get_word_counts_pipeline())

# Map Reduce for Test Dataset Classification


In [10]:
def map_reduce():
    """
    Refresh the statistics collections and gather what the classifier needs.

    Runs ``map_reduce_total_counts()`` and ``map_reduce_word_counts()``, then
    reads one document from ``db.TotalCounts``.

    :return: tuple ``(V, cl0, cl1, cl2, cl3, train_cursor, test_cursor)`` where
        the first five values are the ``V`` / ``cl0`` .. ``cl3`` fields of the
        ``TotalCounts`` document and the cursors come from ``db.train.find()``
        and ``db.test.find()``
    """
    map_reduce_total_counts()
    map_reduce_word_counts()

    # Fetch the totals document once instead of issuing one query per field.
    totals = db.TotalCounts.find_one()
    vocabulary = totals['V']
    class_0 = totals['cl0']
    class_1 = totals['cl1']
    class_2 = totals['cl2']
    class_3 = totals['cl3']

    docs = db.train.find()
    test_docs = db.test.find()

    return vocabulary, class_0, class_1, class_2, class_3, docs, test_docs




<img src="../image/naive_bayes.png">

In [11]:
def naive_bayes_classifier():
    """
    Classify every document in ``db.test`` with a four-class Naive Bayes model.

    For each test document the score of class k is

        log10 P(C_k) + sum over words of log10((count_k(word) + 1) / (cl_k + V))

    where ``count_k`` comes from ``db.WordCounts`` (0 for unseen words) and
    ``cl_k`` / ``V`` come from ``map_reduce()``. The winning class is written
    back to the document as one-hot ``predClass0`` .. ``predClass3`` fields.
    """
    ## P(x),  P(C_k)
    vocabulary_len, class_0, class_1, class_2, class_3, _train_docs, test_docs = map_reduce()
    # NOTE(review): the sample counts are hard-coded (120000 documents, 30000
    # per class); they must be updated if a different training set is used.
    n = 120000.0
    classes_vocab_len = np.array([class_0, class_1, class_2, class_3])
    classes_sample_num = np.array([30000.0, 30000.0, 30000.0, 30000.0])
    denominator = classes_vocab_len + vocabulary_len    # smooth P(x)
    log_prior = np.log10(classes_sample_num / n)        # log P(C_k)

    class_keys = ('class0', 'class1', 'class2', 'class3')
    # Per-word log-likelihood vectors, memoised so each distinct word costs a
    # single WordCounts lookup instead of one query per occurrence.
    word_log_likelihood = {}

    for doc in test_docs:
        label_prob = np.array([0.0, 0.0, 0.0, 0.0])
        for word in doc['content']:
            if word not in word_log_likelihood:
                entry = db.WordCounts.find_one({"_id": word})
                if entry is not None:
                    word_class = [entry['value'][key] for key in class_keys]
                else:
                    word_class = [0.0, 0.0, 0.0, 0.0]
                word_log_likelihood[word] = np.log10((np.array(word_class) + 1.0) / denominator)
            label_prob += word_log_likelihood[word]  # log P(x|C_k)

        label_prob += log_prior  # log(P(C_k) * P(x|C_k))

        label = int(np.argmax(label_prob))  # pred label
        prediction = {'predClass%d' % k: int(k == label) for k in range(4)}
        # Address the document by its _id rather than using the whole document
        # (including the token array) as the query filter.
        db.test.update_one({'_id': doc['_id']}, {'$set': prediction})


In [12]:
def statistics_result():
    """
    Classify the test set, aggregate the outcome and return the result summary.

    Runs ``naive_bayes_classifier()``, then the pipeline from
    ``aggregation_pipeline.get_statistics_result_pipeline()`` over ``db.test``.

    :return: the ``value`` field of one document read from ``db.Results``
    """
    naive_bayes_classifier()
    db.test.aggregate(aggregation_pipeline.get_statistics_result_pipeline())
    summary_doc = db.Results.find_one()
    return summary_doc['value']
    
    

In [14]:
# Rebuild the `test` collection from the default CSV path.
create_test_collection_dataframe()


In [15]:
# Rebuild the `train` collection from the default CSV path.
create_train_collection_dataframe()


In [16]:
# Classify the test collection and print the summary that statistics_result()
# reads back from db.Results.
a = statistics_result()
print(a)

{'class0_true': 7226.0, 'class0_false': 374.0, 'class1_true': 7468.0, 'class1_false': 132.0, 'class2_true': 7078.0, 'class2_false': 522.0, 'class3_true': 7130.0, 'class3_false': 470.0}


# Inference

In [None]:
def inference(text: str):
    """
    Predict the news category of a single piece of text.

    Uses the statistics already stored in ``db.TotalCounts`` and
    ``db.WordCounts`` (they are not recomputed here) and the same scoring as
    ``naive_bayes_classifier``: log10 prior plus the sum of add-one smoothed
    log10 word likelihoods.

    :param text: raw text to classify
    :return: one of ``"World"``, ``"Sports"``, ``"Business"``, ``"Sci_Tech"``
    """
    # get necessary statistics data -- one query instead of one per field
    totals = db.TotalCounts.find_one()
    vocabulary_len = totals['V']

    # NOTE(review): the sample counts are hard-coded (120000 documents, 30000
    # per class); they must be updated if a different training set is used.
    n = 120000.0
    classes_vocab_len = np.array([totals['cl0'], totals['cl1'], totals['cl2'], totals['cl3']])
    classes_sample_num = np.array([30000.0, 30000.0, 30000.0, 30000.0])
    denominator = classes_vocab_len + vocabulary_len    # smooth P(x)
    probability_class = classes_sample_num / n        # P(C_k)

    # Process input text
    words = text_to_words(text).strip().split()

    class_keys = ('class0', 'class1', 'class2', 'class3')
    # Memoise per-word log-likelihoods so repeated words are looked up once.
    word_log_likelihood = {}

    # P(C_k) * P(x|C_k)
    label_prob = np.array([0.0, 0.0, 0.0, 0.0])
    for word in words:
        if word not in word_log_likelihood:
            entry = db.WordCounts.find_one({"_id": word})
            if entry is not None:
                word_class = [entry['value'][key] for key in class_keys]
            else:
                word_class = [0.0, 0.0, 0.0, 0.0]
            word_log_likelihood[word] = np.log10((np.array(word_class) + 1.0) / denominator)
        label_prob += word_log_likelihood[word]  # P(x|C_k)

    label_prob += np.log10(probability_class)  # P(C_k) * P(x|C_k)

    label = int(np.argmax(label_prob))  # pred label, as a plain int for the lookup
    index_to_label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci_Tech"}
    return index_to_label[label]


In [17]:
# Sample news snippet used as input for `inference` below.
text = 'Comets, Asteroids and Planets around a Nearby Star (SPACE.com) SPACE.com - A nearby star thought to harbor comets and asteroids now appears to be home to planets, too. The presumed worlds are smaller than Jupiter and could be as tiny as Pluto, new observations suggest.'


In [18]:
# Classify the sample text and print the predicted category name.
print(inference(text))

Sci_Tech
