In [None]:
%load_ext autoreload
%autoreload 2

### imports and utilities

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
# from std lib
import re, string
from collections import Counter

# from third party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.special import softmax
from scipy.stats import norm
from scipy.stats import entropy as calculate_entropy


from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

#### Utilities

In [None]:
def get_wordnet_pos(word, use_pos):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    Parameters
    ----------
    word : str
        A single token.
    use_pos : bool
        When falsy, tagging is skipped and the noun tag ``'n'`` is returned.

    Returns
    -------
    str
        ``'n'`` when `use_pos` is falsy; otherwise the ``wordnet`` constant
        selected by the first letter of the tag that ``pos_tag`` assigns to
        `word`, falling back to ``wordnet.NOUN`` for any other tag.
    """
    if not use_pos:
        return 'n'

    # Only the first letter of the tag is kept, and it is upper-cased, so the
    # lookup keys must be upper-case too. The adverb key used to be a
    # lower-case "r", which could never match, so adverbs fell back to NOUN.
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# normalise a raw document: strip punctuation, tokenize and lemmatize
def clean_doc(doc, use_pos=False):
    ''' lower-case a document, blank out punctuation, tokenize and lemmatize it

    Every character of `string.punctuation` is replaced by a space before
    tokenizing. `use_pos` is forwarded to `get_wordnet_pos` for each token.

    Returns a tuple `(status, doc, word_count)`:
      * status     - False when the cleaned text is empty or only whitespace
      * doc        - the lemmatized tokens joined by single spaces
      * word_count - number of tokens produced by the tokenizer
    '''
    punctuation_to_space = str.maketrans({symbol: ' ' for symbol in string.punctuation})
    tokens = word_tokenize(doc.lower().translate(punctuation_to_space))

    lemmas = []
    for token in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(token, use_pos)))
    cleaned = " ".join(lemmas)

    status = bool(cleaned) and not cleaned.isspace()
    return status, cleaned, len(tokens)

def calculate_sparsity(matrix):
    """Return the fraction of entries of `matrix` that are zero.

    Parameters
    ----------
    matrix : array-like
        Anything `np.count_nonzero` and `np.shape` accept (ndarray, DataFrame, ...).

    Returns
    -------
    float
        ``(number of zero entries) / (total number of entries)``;
        ``nan`` when `matrix` has no entries at all.
    """
    non_zero = np.count_nonzero(matrix)
    # np.product was removed in NumPy 2.0; np.prod is the supported spelling.
    total_val = np.prod(np.shape(matrix))
    if total_val == 0:
        # No entries: the ratio is undefined, so return nan explicitly
        # instead of relying on a 0/0 division.
        return float("nan")
    sparsity = (total_val - non_zero) / total_val
    return sparsity

def calculate_word_inference_weight(word_word_pr_distr):
    """Turn the per-column entropy of `word_word_pr_distr` into a weight per column.

    The entropy of every column is computed (axis 0) and rescaled as
    ``(max_entropy - entropy) / max_entropy``, so the column with the largest
    entropy gets weight 0. The result is a Series indexed by the columns of
    `word_word_pr_distr`.
    """
    column_entropy = calculate_entropy(word_word_pr_distr, axis=0)
    peak_entropy = column_entropy.max()
    weights = (peak_entropy - column_entropy) / peak_entropy
    # Wrap in a one-column frame and take column 0 to get a Series labelled by word.
    return pd.DataFrame(data=weights, index=word_word_pr_distr.columns)[0]

### load dataset

In [None]:
# name of the corpus loaded by this cell
dataset = "newsgroup"

# forwarded as `shuffle` to the fetcher below
randomize = False

# the newsgroups requested from the fetcher
categories = ['rec.autos', 'talk.politics.mideast', 'alt.atheism', 'sci.space']

all_docs = fetch_20newsgroups(
    subset='train',
    shuffle=randomize,
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# unpack the fetched result; `all_docs` is rebound last because the two
# assignments above it still read attributes from the fetched object
old_labels = all_docs.target
categories = all_docs.target_names
all_docs = all_docs.data

In [None]:
# dataset = "bbc"

# data = pd.read_csv('bbcsport.csv')

# all_docs = data["text"].to_list()
# old_labels = data["topic"].to_list()
# categories = classes = np.unique(data["topic"]).tolist()

### clean dataset

In [None]:
# number of documents to keep per category
datasize = 40
# bounds on len(doc) of the cleaned document; doc is a string here, so these
# are character counts, not token counts (set to None to disable a bound)
min_document_length = 160
max_document_length = 256


index = -1
docs, labels, label_indices = [], [], []

# sizes[k] = number of documents accepted so far for category k
sizes = [0]*len(categories)
target_total = len(categories)*datasize

with tqdm(total=target_total) as pbar:
    # Stop when every category is full OR the corpus is exhausted. Without the
    # second condition the loop indexed past the end of old_labels and raised
    # IndexError when too few documents passed the filters. A shortfall is
    # still visible afterwards in `sizes`.
    while sum(sizes) < target_total and index + 1 < len(all_docs):
        index += 1
        label_index = old_labels[index]

        # this category already has its quota
        if sizes[label_index] == datasize:
            continue

        doc = all_docs[index]
        status, doc, word_count = clean_doc(doc, True)

        # cleaned document is empty / whitespace only
        if not status:
            continue

        if min_document_length is not None and len(doc) < min_document_length:
            continue

        if max_document_length is not None and len(doc) > max_document_length:
            continue

        label_indices.append(label_index)
        labels.append(categories[label_index])

        docs.append(doc)
        sizes[label_index] += 1
        pbar.update(1)

labels = np.array(labels)
label_indices = np.array(label_indices)

In [None]:
doc_index = 3
print(f"Topic: {labels[doc_index]}\n{'='*50}\n{docs[doc_index][:512]}")

In [None]:
print(sizes)
assert min(sizes) == max(sizes) == datasize

### Split data

In [None]:
# Fixed seed: the split is the only random step here, so pinning it makes
# Restart & Run All reproduce the same train/test sets and the same scores.
x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size=.3, random_state=42)

In [None]:
print(f"there are {len(docs)} total docs, {len(y_train)} train and {len(y_test)} test")

### Initialize Vectorizer

In [None]:
# "tfidf" selects TfidfVectorizer; any other value selects CountVectorizer
vectorizer_type = "not-tfidf"
# set to "english" to have the vectorizer drop its built-in English stop words
stop_words = None

# initialize the vectorizer
if vectorizer_type == "tfidf":
    vectorizer = TfidfVectorizer(stop_words=stop_words)
else:
    vectorizer = CountVectorizer(stop_words=stop_words)

# fit it to the training documents only
vectorizer.fit(x_train)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# list(...) first so the array is built from plain str objects, as before
vocabulary = np.array(list(feature_names))
print("word_count is", len(vocabulary))

### Prepare Dataset

In [None]:
# dense document-term arrays for both splits, produced by the fitted vectorizer
train_doc_vectors, test_doc_vectors = (
    vectorizer.transform(split_docs).toarray() for split_docs in (x_train, x_test)
)

# same arrays as frames, one column per vocabulary word
wdf_train = pd.DataFrame(train_doc_vectors, columns=vocabulary)
wdf_test = pd.DataFrame(test_doc_vectors, columns=vocabulary)

In [None]:
wdf_train.head()

In [None]:
print(f"train_word_doc_freq matrix sparsity = {calculate_sparsity(wdf_train):.4f}")

### Word Word Co-Occurrence Probability

In [None]:
# additive constant applied to both numerator and denominator of pxy below;
# with 0 it has no effect
alpha = 0
# working copy so wdf_train itself is never modified by this cell
wdf_train_prime = wdf_train.copy()

# copy of the training frame with the labels attached as an extra column
# NOTE(review): wdt_train is not read again in the visible cells; confirm it is still needed
wdt_train = wdf_train_prime.copy()
wdt_train["__labels__"] = y_train

# column sums of the training frame: one total per vocabulary word
word_doc_count = wdf_train_prime.sum(0)
# square word x word frame, filled one column per word by the loop below
word_word_pr_distr = pd.DataFrame(data=0.0, columns=vocabulary, index=vocabulary)

for word in tqdm(vocabulary):
    # keep only the documents (rows) in which `word` occurs, sum every column over
    # those rows, and divide by the total of `word` itself
    pxy = (wdf_train_prime[wdf_train_prime[word] > 0].sum(0) + alpha) / (word_doc_count[word] + alpha)
    # rescale each entry by total(word) / total(row word) and store it as column `word`
    # NOTE(review): assumes no column total is 0, otherwise this divides by zero
    word_word_pr_distr[word] = pxy * (word_doc_count[word] / word_doc_count)

print(f"word_word_pr_distr shape = {word_word_pr_distr.shape}")

In [None]:
word_word_pr_distr.head()

In [None]:
print(f"word_word_pr_distr matrix sparsity = {calculate_sparsity(word_word_pr_distr):.4f}")

In [None]:
word = "space"
given_word = "science"
word_word_pr_distr[word][given_word]

### Word Word Co-Occurrence Prime Probability

In [None]:
num_of_iterations = 1

In [None]:
def max_weight(x, pbar, word_word_pr_distr_prime):
    """Weight every row of the matrix by `x` and return the column-wise maximum.

    Parameters
    ----------
    x : pandas.Series
        One weight per row label of `word_word_pr_distr_prime`.
    pbar : object with ``update(n)``
        Progress bar; advanced by one on every call.
    word_word_pr_distr_prime : pandas.DataFrame
        Matrix whose rows are multiplied by the matching entry of `x`.

    Returns
    -------
    pandas.Series
        For each column ``j``, ``max_k x[k] * matrix[k, j]``.
    """
    pbar.update(1)
    # mul(..., axis=0) aligns `x` on the row labels, the same alignment the
    # former per-column `x*y` lambda performed, but in one vectorized call
    # instead of one Python call per column.
    return word_word_pr_distr_prime.mul(x, axis=0).max(0)

def sum_weight(x, pbar, word_word_pr_distr_prime):
    """Weight every row of the matrix by `x` and return the column-wise sum.

    Parameters
    ----------
    x : pandas.Series
        One weight per row label of `word_word_pr_distr_prime`.
    pbar : object with ``update(n)``
        Progress bar; advanced by one on every call.
    word_word_pr_distr_prime : pandas.DataFrame
        Matrix whose rows are multiplied by the matching entry of `x`.

    Returns
    -------
    pandas.Series
        For each column ``j``, ``sum_k x[k] * matrix[k, j]``.
    """
    pbar.update(1)
    # mul(..., axis=0) aligns `x` on the row labels, the same alignment the
    # former per-column `x*y` lambda performed, but in one vectorized call
    # instead of one Python call per column.
    return word_word_pr_distr_prime.mul(x, axis=0).sum(0)

In [None]:
# row-combination rule used for the propagation below
func = max_weight
# func = sum_weight

# start from a copy so word_word_pr_distr itself stays untouched
word_word_pr_distr_prime = word_word_pr_distr.copy()
with tqdm(total=len(vocabulary)*num_of_iterations) as pbar:
    for _ in range(num_of_iterations):
        # every row is combined with the whole current matrix; with max_weight,
        # entry (i, j) becomes max over k of M[i, k] * M[k, j]. The matrix passed in
        # `args` is the one from before this assignment, so all rows see the same input.
        word_word_pr_distr_prime = word_word_pr_distr_prime.apply(func, axis=1, args=(pbar, word_word_pr_distr_prime))

print(f"word_word_pr_distr_prime shape = {word_word_pr_distr_prime.shape}")

In [None]:
word_word_pr_distr_prime.head()

In [None]:
# rescale so that the smallest strictly positive entry becomes 1
# NOTE(review): if the matrix has no positive entry, min_value is NaN and the
# integer cast below will fail
min_value = word_word_pr_distr_prime[word_word_pr_distr_prime > 0].min().min()
word_word_pr_distr_prime_scale = word_word_pr_distr_prime / min_value

# round to the nearest integer, then cast to int
word_word_pr_distr_prime_scale = np.rint(word_word_pr_distr_prime_scale).astype(int)

word_word_pr_distr_prime_scale.head()

### Modified word word pr distr properties

In [None]:
# sparsity, mean of the column means, and std of the column means of the original matrix
wwds = calculate_sparsity(word_word_pr_distr)
wwdm = word_word_pr_distr.mean().mean()
# was `.mean().mean()`, which printed the mean again in the std column;
# now computed the same way as wwdpstd below
wwdstd = word_word_pr_distr.mean().std()

# same three statistics for the propagated matrix
wwdps = calculate_sparsity(word_word_pr_distr_prime)
wwdpm = word_word_pr_distr_prime.mean().mean()
wwdpstd = word_word_pr_distr_prime.mean().std()

print("                                           | sparsity | mean   | std")
print("-----------------------------------------------------------------------")
print(f"word_word_pr_distr matrix sparsity         | {wwds:.4f}   | {wwdm:.4f} | {wwdstd:.4f}")
print(f"word_word_pr_distr_prime matrix sparsity   | {wwdps:.4f}   | {wwdpm:.4f} | {wwdpstd:.4f}")

##### Word Entropy

In [None]:
word_inference_weight = calculate_word_inference_weight(word_word_pr_distr)
word_inference_weight_prime = calculate_word_inference_weight(word_word_pr_distr_prime)

# word_inference_weight.head()

In [None]:
words = ["science", "space", "religion"]

print(f'{"word":16s} | {"weight":6s} | {"prime_weight":6s}')
print("-----------------------------------------------------")
for w1 in words:
    print(f"{w1:16s} | {word_inference_weight[w1]:.4f} | {word_inference_weight_prime[w1]:.4f}")

#### word word relation comparison with modified word_word_co matrix

In [None]:
word = "space"
given_word1 = "science"
given_word2 = "politics"
print(f"{word:>16s} | {given_word1:16s}", word_word_pr_distr[word][given_word1], word_word_pr_distr_prime[word][given_word1])
print(f"{word:>16s} | {given_word2:16s}", word_word_pr_distr[word][given_word2], word_word_pr_distr_prime[word][given_word2])

In [None]:
doc_index1 = 1
doc_index2 = 10

doc_indices = [doc_index1, doc_index2]

for doc_index in doc_indices[:3]:
    print(f"Topic: {y_train[doc_index]}\n{'='*50}\n{x_train[doc_index][:512]}")
    print()

In [None]:
np.log1p

In [None]:
def get_wdf_prime(wdf, func, word_weights=None):
    """Apply `func` to every row of `wdf` together with a word-weight matrix.

    Parameters
    ----------
    wdf : pandas.DataFrame
        One row per document.
    func : callable
        Called as ``func(row, pbar, word_weights)`` for each row, e.g.
        `sum_weight` or `max_weight`.
    word_weights : pandas.DataFrame, optional
        Matrix handed to `func`. Defaults to the notebook-level
        `word_word_pr_distr_prime_scale`, which was previously the only
        (implicit) choice.

    Returns
    -------
    The result of ``wdf.apply(func, axis=1, ...)``.
    """
    if word_weights is None:
        # backward-compatible default: read the notebook global at call time
        word_weights = word_word_pr_distr_prime_scale

    # one progress tick per document; `func` is responsible for advancing the bar
    with tqdm(total=wdf.shape[0]) as pbar:
        wdf_prime = wdf.apply(func, axis=1, args=(pbar, word_weights))

    return wdf_prime

func = sum_weight
# func = max_weight

wdf_train_prime = get_wdf_prime(wdf_train, func)
wdf_test_prime = get_wdf_prime(wdf_test, func)

print(f"wdf_train_prime shape = {wdf_train_prime.shape}")
print(f"wdf_test_prime shape = {wdf_test_prime.shape}")

In [None]:
# wdf_train.head()

In [None]:
wdf_train_prime.head()

In [None]:
from sklearn.naive_bayes import MultinomialNB as NB
# from sklearn.naive_bayes import GaussianNB as NB
# from sklearn.naive_bayes import BernoulliNB as NB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

le = LabelEncoder()
le.fit(y_train)

list(le.classes_)

In [None]:
nb = NB()
nb.fit(wdf_train, le.transform(y_train))

In [None]:
def evaluate_nb(clf, X, y):
    """Print the accuracy and the classification report of `clf` on `(X, y)`.

    `y` holds the label names; it is encoded with the notebook-level
    LabelEncoder `le` before being compared with `clf.predict(X)`.
    Nothing is returned.
    """
    y_pred = clf.predict(X)
    y_true = le.transform(y)

    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    report = classification_report(y_true, y_pred)

    print(f"accuracy = {accuracy_pct:.2f}%\n")
    print("classification report\n" + "=" * 60)
    print(report)
    # confusion_matrix(y_true, y_pred) can be printed here as well if needed

In [None]:
# evaluate_nb(nb, wdf_train, y_train)

In [None]:
evaluate_nb(nb, wdf_test, y_test)

In [None]:
nb_prime = NB()
nb_prime.fit(wdf_train_prime, le.transform(y_train))

In [None]:
# evaluate_nb(nb_prime, wdf_train_prime, y_train)

In [None]:
evaluate_nb(nb_prime, wdf_test, y_test)

In [None]:
evaluate_nb(nb_prime, wdf_test_prime, y_test)