In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes, so edits to imported .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### imports and utilities

In [None]:
!pip install nltk scikit-learn matplotlib scipy -q

In [None]:
import nltk
# Download the NLTK resources by name: the 'punkt' tokenizer models, the WordNet
# corpus and the averaged-perceptron part-of-speech tagger.
# NOTE(review): presumably these are needed by clean_doc from utils — confirm.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
from collections import Counter
from utils import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.special import softmax
from scipy.stats import norm
from scipy.stats import entropy as calculate_entropy

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### load dataset

In [None]:
dataset = "newsgroup"

# whether the loader should shuffle the documents it returns
randomize = False

# the newsgroups to retrieve
categories = ['rec.autos', 'talk.politics.mideast', 'alt.atheism', 'sci.space']

# fetch the training subset with headers, footers and quoted replies stripped
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=randomize,
    remove=('headers', 'footers', 'quotes'),
)

# unpack raw texts, integer targets and target names; `categories` is rebound to
# the names reported by the loader so that target integers index into it
all_docs = newsgroups.data
old_labels = newsgroups.target
categories = newsgroups.target_names

In [None]:
# dataset = "bbc"

# data = pd.read_csv('bbcsport.csv')

# all_docs = data["text"].to_list()
# old_labels = data["topic"].to_list()
# categories = classes = np.unique(data["topic"]).tolist()

### clean dataset

In [None]:
datasize = 90                # documents to keep for each category
min_document_length = 256    # smallest accepted len(doc) after cleaning (None disables the check)
max_document_length = 512    # largest accepted len(doc) after cleaning (None disables the check)

docs, labels, label_indices = [], [], []
sizes = [0]*len(categories)  # accepted documents so far, per category index
target_total = len(categories)*datasize

with tqdm(total=target_total) as pbar:
    # Walk the corpus once, in order. Bounding the loop by the corpus itself avoids
    # indexing past its end when too few documents pass the filters.
    for index, label_index in enumerate(old_labels):
        if sum(sizes) == target_total:
            break

        # this category already reached its quota
        if sizes[label_index] == datasize:
            continue

        # clean_doc is defined outside this notebook; a falsy status rejects the document
        # (word_count is returned but not used here)
        status, doc, word_count = clean_doc(all_docs[index], True)
        if not status:
            continue

        # drop documents whose cleaned length falls outside the configured bounds
        if min_document_length is not None and len(doc) < min_document_length:
            continue
        if max_document_length is not None and len(doc) > max_document_length:
            continue

        label_indices.append(label_index)
        labels.append(categories[label_index])
        docs.append(doc)

        sizes[label_index] += 1
        pbar.update(1)

# fail with an explicit message instead of an IndexError when the quotas cannot be met
if sum(sizes) != target_total:
    raise ValueError(
        f"only collected {sizes} documents per category, expected {datasize} each; "
        "lower datasize or relax the document length limits"
    )

labels = np.array(labels)
label_indices = np.array(label_indices)

In [None]:
# Preview one cleaned document (position `index` in docs) together with its topic label.
index = 3
# The slice caps the printed document at its first 512 items.
print(f"Topic: {labels[index]}\n{'='*50}\n{docs[index][:512]}")

In [None]:
# Show the per-category counts, then verify that every category holds exactly `datasize` documents.
print(sizes)
smallest, largest = min(sizes), max(sizes)
assert smallest == largest == datasize

## Split data

In [None]:
# Hold out 30% of the documents for testing. A fixed random_state makes the shuffled
# split identical on every run, so the accuracies reported below are reproducible.
x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size=.3, random_state=42)

In [None]:
# Report the total number of documents and how many landed in each split.
print(f"there are {len(docs)} total docs, {len(y_train)} train and {len(y_test)} test")

### Initialize Vectorizer

In [None]:
# initialize the count vectorizer and learn its vocabulary from the training documents only
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# dtype=str keeps the array a unicode array regardless of which method supplied the names
vocabulary = np.array(feature_names, dtype=str)
print("word_count is", len(vocabulary))

### Prepare Dataset

In [None]:
# turn both splits into dense word-count matrices using the fitted vectorizer
train_doc_vectors, test_doc_vectors = (
    vectorizer.transform(split_docs).toarray() for split_docs in (x_train, x_test)
)

In [None]:
# when True, the normalized frequencies are replaced by 0/1 presence indicators
binary = False

# one row per document, one column per vocabulary word
word_freq_in_doc_train = pd.DataFrame(train_doc_vectors, columns=vocabulary)
word_freq_in_doc_test = pd.DataFrame(test_doc_vectors, columns=vocabulary)

# divide every row by its own total so each document's frequencies are relative
word_doc_frequency_norm_train = word_freq_in_doc_train.div(word_freq_in_doc_train.sum(axis=1), axis=0)
word_doc_frequency_norm_test = word_freq_in_doc_test.div(word_freq_in_doc_test.sum(axis=1), axis=0)

# final per-document word tables used by the rest of the notebook
wdf_train, wdf_test = (
    (norm_frame > 0).astype(int) if binary else norm_frame.copy()
    for norm_frame in (word_doc_frequency_norm_train, word_doc_frequency_norm_test)
)

## Word Topic Probability Distr

In [None]:
# labelled copies of the per-document word tables
wdt_train, wdt_test = wdf_train.copy(), wdf_test.copy()
wdt_train["__labels__"] = y_train
wdt_test["__labels__"] = y_test

# column totals over the training documents, and number of training documents per topic
word_doc_count = wdf_train.sum(axis=0)
topic_doc_count = pd.Series(Counter(y_train), name=0)

# rows are vocabulary words, columns are topics
word_topic_pr_distr = pd.DataFrame(data=0.0, columns=categories, index=vocabulary)

for category in tqdm(categories):
    in_category = wdt_train["__labels__"] == category
    n_topic_docs = topic_doc_count[category]

    # per-word total inside the topic, averaged over the topic's documents ...
    pxy = wdf_train[in_category].sum(axis=0) / n_topic_docs
    # ... then rescaled by the topic size relative to the word's overall total
    word_topic_pr_distr[category] = pxy * (n_topic_docs / word_doc_count)

print(f"word_topic_pr_distr shape = {word_topic_pr_distr.shape}")

In [None]:
# Display the first rows of the word-by-topic table (rows: vocabulary words, columns: categories).
word_topic_pr_distr.head()

In [None]:
# Pick a topic by its position in `categories`.
topic_index = 3
topic = categories[topic_index]

# Show the ten words with the highest value in that topic's column.
print(f"Topwords: Topic - {topic}\n{'='*50}")
word_topic_pr_distr[topic].sort_values(ascending=False).head(10)

In [None]:
word = "space"

# two side-by-side panels on one figure
fig, (ax11, ax12) = plt.subplots(1, 2, figsize=(15, 5), tight_layout=True)

# left panel: the word's value under every topic
ax11.bar(word_topic_pr_distr.columns, word_topic_pr_distr.loc[word])
ax11.set_title(f"{word} against TOPICS relation")

# right panel: fraction of each topic's training documents that contain the word
labels_with_word = wdt_train["__labels__"][wdt_train[word] > 0]
word_topic_freq = Counter(labels_with_word)
freqs = [
    word_topic_freq[topic_name] / topic_doc_count[topic_name]
    for topic_name in word_topic_pr_distr.columns
]

ax12.bar(word_topic_pr_distr.columns, freqs)
ax12.set_title(f"{word} against Words Frequency")

plt.show()

### Quick Topic Inference

In [None]:
# how many of the document's most frequent words to list further down
num_of_topwords = 8

# training document to inspect
doc_index = 6
print(f"Document: {doc_index}, Topic: {y_train[doc_index]}\n{'='*50}\n{x_train[doc_index][:512]}\n")

In [None]:
# Row `doc_index` of the training table is the document's word-frequency vector.
doc_vector = wdf_train.loc[doc_index]
# infer_topic is not defined in this notebook (presumably it comes from utils). Its second
# return value is used as the predicted topic; the first is summed per topic in a later plot.
# NOTE(review): confirm the exact meaning of the first return value.
doc_word_topic_pr_distr, predicted_doc_topic = infer_topic(categories, doc_vector, word_topic_pr_distr)

# Compare the stored label of the document with the prediction.
print(f"Inference: Document {doc_index}\n{'='*50}")
print(f"actual = {wdt_train.loc[doc_index]['__labels__']}, predicted = {predicted_doc_topic}\n")

In [None]:
# List the `num_of_topwords` words with the highest frequency value in the selected training document.
print(f"Topwords: Document {doc_index}\n{'='*50}")
print(wdf_train.loc[doc_index].sort_values(ascending=False).head(num_of_topwords))

In [None]:
# Bar chart with one bar per category: the column sums of the first value returned by infer_topic.
plt.title(f"Document {doc_index} against WORDS-TOPICS")
plt.bar(categories, doc_word_topic_pr_distr.sum(0))
plt.show()

### Topic Word Distr - Train Accuracy

In [None]:
# count how many training documents get their own label back from infer_topic
score_train = 0
n_train_docs = len(y_train)

for doc_index in tqdm(range(n_train_docs)):
    doc_vector = wdf_train.loc[doc_index]
    _, predicted_doc_topic = infer_topic(categories, doc_vector, word_topic_pr_distr)

    actual_doc_topic = wdt_train.loc[doc_index, '__labels__']
    mark = actual_doc_topic == predicted_doc_topic
    score_train += mark

# express the hit count as a percentage of the training set
train_accuracy = score_train * 100 / n_train_docs
print(f"train accuracy = {train_accuracy: .2f}%")

### Topic Word Distr - Test Accuracy

In [None]:
# count how many held-out documents get their own label back from infer_topic
score_test = 0
n_test_docs = len(y_test)

for doc_index in tqdm(range(n_test_docs)):
    doc_vector = wdf_test.loc[doc_index]
    _, predicted_doc_topic = infer_topic(categories, doc_vector, word_topic_pr_distr)

    actual_doc_topic = wdt_test.loc[doc_index, '__labels__']
    mark = actual_doc_topic == predicted_doc_topic
    score_test += mark

# express the hit count as a percentage of the test set
test_accuracy = score_test * 100 / n_test_docs
print(f"test accuracy = {test_accuracy: .2f}%")

### Overall Accuracy

In [None]:
# Accuracy over every document of both splits: pool the correct predictions and divide
# by the pooled document count. A plain mean of the two percentages would give the
# smaller test split the same weight as the larger training split.
overall_accuracy = (score_train + score_test) * 100 / (len(y_train) + len(y_test))
print(f"overall_accuracy = {overall_accuracy: .2f}%")

## Word Word Probability Distr

In [None]:
# square table with the vocabulary on both axes; one column is filled per word
word_word_pr_distr = pd.DataFrame(data=0.0, columns=vocabulary, index=vocabulary)

for word in tqdm(vocabulary):
    docs_with_word = wdf_train[word] > 0
    word_total = word_doc_count[word]

    # per-word totals over the training documents that contain `word`, scaled by word_total
    # (alternative kept from the original: (wdf_train.T * wdf_train[word]).sum(1) / word_doc_count[word])
    pxy = wdf_train[docs_with_word].sum(axis=0) / word_total
    word_word_pr_distr[word] = pxy * (word_total / word_doc_count)

print(f"word_word_pr_distr shape = {word_word_pr_distr.shape}")

In [None]:
# Display the first rows of the word-by-word table (vocabulary on both axes).
word_word_pr_distr.head()

In [None]:
# Word whose column of the word-by-word table is inspected.
word = "space"

# Show the ten vocabulary entries with the highest value in that column.
print(f"Topwords: Word - {word}\n{'='*50}")
word_word_pr_distr[word].sort_values(ascending=False).head(10)

In [None]:
word = "space"
base_words = [word, "the", "war", "science"]

# three side-by-side panels on one figure
fig, (ax11, ax12, ax13) = plt.subplots(1, 3, figsize=(15, 5), tight_layout=True)

# panel 1: table values of `word` against each base word
ax11.bar(base_words, word_word_pr_distr.loc[word][base_words])
ax11.set_title(f"{word} against BASE_WORDS relation")

# number of training documents containing each base word, overall and
# restricted to the documents that also contain `word`
docs_with_word = wdt_train[word] > 0
wdc = (wdt_train[base_words] > 0).sum(axis=0)
word_word_freq = (wdt_train.loc[docs_with_word, base_words] > 0).sum(axis=0)

freqs = [word_word_freq[w] for w in base_words]
norm_freqs = [word_word_freq[w] / wdc[w] for w in base_words]

# panel 2: raw co-occurrence counts
ax12.bar(base_words, freqs)
ax12.set_title(f"{word} against BASE_WORDS Frequency")

# panel 3: co-occurrence counts divided by each base word's own document count
ax13.bar(base_words, norm_freqs)
ax13.set_title(f"{word} against BASE_WORDS Normalized Frequency")

plt.show()

## Trial

In [None]:
# entropy of every row of the word-word table, turned into softmax weights
# that are larger for rows with lower entropy
word_word_pr_entropy = calculate_entropy(word_word_pr_distr, axis=1)
entropy_gap = word_word_pr_entropy.max() - word_word_pr_entropy
word_word_pr_entropy_softmax = softmax(entropy_gap)

In [None]:
# Scale every row of the word-word table element-wise by the entropy softmax weights
# (one weight per column). A single broadcast multiply yields the same values as
# multiplying row after row in a Python loop over the whole vocabulary, without the loop.
word_word_pr_distr_prime = word_word_pr_distr.mul(
    np.asarray(word_word_pr_entropy_softmax), axis="columns"
)

# rescale so that the largest entry of the table equals 1
word_word_pr_distr_prime /= word_word_pr_distr_prime.max().max()

In [None]:
word = "space"
base_words = [word, "the", "war", "science"]

# four side-by-side panels on one figure
fig, (ax11, ax12, ax13, ax14) = plt.subplots(1, 4, figsize=(15, 5), tight_layout=True)

print(f'{" "*50}{word} against BASE_WORDS')

# panels 1 and 2: original table versus the entropy-reweighted table
ax11.bar(base_words, word_word_pr_distr.loc[word][base_words])
ax11.set_title("word_word_pr_distr")

ax12.bar(base_words, word_word_pr_distr_prime.loc[word][base_words])
ax12.set_title("word_word_pr_distr_prime")

# number of training documents containing each base word, overall and
# restricted to the documents that also contain `word`
docs_with_word = wdt_train[word] > 0
wdc = (wdt_train[base_words] > 0).sum(axis=0)
word_word_freq = (wdt_train.loc[docs_with_word, base_words] > 0).sum(axis=0)

freqs = [word_word_freq[w] for w in base_words]
norm_freqs = [word_word_freq[w] / wdc[w] for w in base_words]

# panels 3 and 4: raw and normalized co-occurrence counts
ax13.bar(base_words, freqs)
ax13.set_title("Frequency")

ax14.bar(base_words, norm_freqs)
ax14.set_title("Normalized Frequency")

plt.show()

In [None]:
num_of_iterations = 1

# start from the word-word table rescaled so that its largest entry equals 1
word_word_pr_distr_prime = word_word_pr_distr.copy()
word_word_pr_distr_prime /= word_word_pr_distr_prime.max().max()

for n_iter in tqdm(range(num_of_iterations)):
    # keep the previous estimate and rebuild the new one from an all-zero table
    word_word_pr_distr_last_prime = word_word_pr_distr_prime.copy()
    word_word_pr_distr_prime = 0 * word_word_pr_distr_last_prime
    
    if n_iter == 0:
        # the entropy and softmax for reducing relation: computed once, from the first
        # estimate; rows with lower entropy receive a larger weight
        word_word_pr_entropy = calculate_entropy(word_word_pr_distr_last_prime, axis=1)
        word_word_pr_entropy_softmax = softmax(word_word_pr_entropy.max() - word_word_pr_entropy)
        # word_word_pr_entropy_softmax = normalize(np.reshape(word_word_pr_entropy.max() - word_word_pr_entropy, (1, -1))).flatten()

    for word in tqdm(vocabulary):
        denom = 0
        
        # only the strictly positive entries of the word's column contribute
        indices = (word_word_pr_distr_last_prime[word] > 0)
        xv = word_word_pr_distr_last_prime[word][indices]
        wwpes = word_word_pr_entropy_softmax[indices]

        # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
        for index, (wordx, word_pr) in enumerate(xv.items()):
            # weight of the related word: its previous value times its entropy weight
            gamma = word_pr * wwpes[index]
            denom += gamma
            word_word_pr_distr_prime[word] += gamma * word_word_pr_distr_last_prime[wordx]
            
        # turn the accumulated sum into a weighted average of the related words' columns
        word_word_pr_distr_prime[word] /= denom
    # rescale the new estimate so that its largest entry equals 1
    word_word_pr_distr_prime /= word_word_pr_distr_prime.max().max()
    
    # plot the state after this iteration for one probe word
    word = "space"
    base_words = [word, "the", "war", "science"]

    fig = plt.figure(figsize=(15, 5), tight_layout=True)
    ax11 = fig.add_subplot(141)
    ax12 = fig.add_subplot(142)
    ax13 = fig.add_subplot(143)
    ax14 = fig.add_subplot(144)

    print(f'{" "*50}{word} against BASE_WORDS')
    ax11.set_title("word_word_pr_distr")
    ax11.bar(base_words, word_word_pr_distr.loc[word][base_words])

    ax12.set_title("word_word_pr_distr_prime")
    ax12.bar(base_words, word_word_pr_distr_prime.loc[word][base_words])
    
    ax13.set_title("word_word_pr_distr_last_prime")
    ax13.bar(base_words, word_word_pr_distr_last_prime.loc[word][base_words])

    # document counts of the base words, overall and among documents containing `word`
    freqs, norm_freqs = [], []
    wdc = (wdt_train[base_words] > 0).sum(0)
    word_word_freq = (wdt_train[base_words][wdt_train[word] > 0] > 0).sum(0)

    for w in base_words:
        freqs.append(word_word_freq[w])
        norm_freqs.append(word_word_freq[w] / wdc[w])

#     ax14.set_title("Frequency")
#     ax14.bar(base_words, freqs)

    ax14.set_title("Normalized Frequency")
    ax14.bar(base_words, norm_freqs)

    plt.show()

In [None]:
# NOTE(review): this cell originally called `Kmeans()`, a name not defined in any visible
# cell; the class imported from sklearn.cluster is spelled `KMeans`. Confirm that utils
# does not export a `Kmeans` helper before relying on this.
KMeans()