In [1]:
%load_ext autoreload
%autoreload 2

### imports and utilities

In [3]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [151]:
# from std lib
import re, string
from collections import Counter

# from thrid party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.special import softmax
from scipy.stats import norm
from scipy.stats import entropy as calculate_entropy


from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

#### Utilities

In [392]:
def get_wordnet_pos(word, use_pos):
    if not use_pos:
        return 'n'

    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "r":wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# clean out the new line characters from text in docs
def clean_doc(doc, use_pos=False):
    ''' remove unwanter characters line new line '''

    unwanted_chrs = list(string.punctuation)
    # unwanted_chrs = [')', '(', '{', '}', '\t', '\n', '\r', "'", '"', "!", ",", ".", "?", ">", "<", "[", "]"]

    doc = doc.lower()
    for unwanted_chr in unwanted_chrs:
        doc = doc.replace(unwanted_chr, ' ')

    doc = word_tokenize(doc)

    word_count = len(doc)
    doc = " ".join([wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word, use_pos)) for word in doc])

    status = (len(doc) != 0 and not doc.isspace())

    return status, doc, word_count

def calculate_sparsity(matrix):
    non_zero = np.count_nonzero(matrix)
    total_val = np.product(matrix.shape)
    sparsity = (total_val - non_zero) / total_val
    return sparsity

def calculate_word_inference_weight(word_word_pr_distr):
    entropy_values = calculate_entropy(word_word_pr_distr, axis=0)
    entropy_values_norm = (entropy_values.max() - entropy_values) / entropy_values.max()
    entropy_values_norm = pd.DataFrame(data=entropy_values_norm, index=word_word_pr_distr.columns)[0]
    return entropy_values_norm

### load dataset

In [5]:
dataset = "newsgroup"

# total number of samples needed
randomize = False

# retrieve dataset
categories = ['rec.autos', 'talk.politics.mideast', 'alt.atheism', 'sci.space']

all_docs = fetch_20newsgroups(subset='train', shuffle=randomize, remove=('headers', 'footers', 'quotes'), categories=categories)
all_docs, old_labels, categories = all_docs.data, all_docs.target, all_docs.target_names

In [6]:
# dataset = "bbc"

# data = pd.read_csv('bbcsport.csv')

# all_docs = data["text"].to_list()
# old_labels = data["topic"].to_list()
# categories = classes = np.unique(data["topic"]).tolist()

### clean dataset

In [7]:
datasize = 40
min_document_length = 160
max_document_length = 256


index = -1
docs, labels, label_indices = [], [], []

sizes = [0]*len(categories)

with tqdm(total=len(categories)*datasize) as pbar:
    while sum(sizes) < len(categories)*datasize:
        index += 1
        label_index = old_labels[index]
            
        if sizes[label_index] == datasize:
            continue
        
        doc = all_docs[index]
        status, doc, word_count = clean_doc(doc, True)
        
        if not status:
            continue
            
        if min_document_length is not None and len(doc) < min_document_length:
            continue
            
        if max_document_length is not None and len(doc) > max_document_length:
            continue
        
        label_indices.append(label_index)
        labels.append(categories[label_index])
        
        docs.append(doc)
        sizes[label_index] += 1
        pbar.update(1)

labels = np.array(labels)
label_indices = np.array(label_indices)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=160.0), HTML(value='')))




In [8]:
doc_index = 3
print(f"Topic: {labels[doc_index]}\n{'='*50}\n{docs[doc_index][:512]}")

Topic: rec.autos
not to mention my friend s 54 citroen traction avant with the light switch and dimmer integrate in a single stalk off the steer column those dumb french be apparently copying the japanese before the german


In [9]:
print(sizes)
assert min(sizes) == max(sizes) == datasize

[40, 40, 40, 40]


### Split data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size =.3)

In [11]:
print(f"there are {len(docs)} total docs, {len(y_train)} train and {len(y_test)} test")

there are 160 total docs, 112 train and 48 test


### Initialize Vectorizer

In [12]:
vectorizer_type = "not-tfidf"

# initialize the count vectorizer
if vectorizer_type == "tfidf":
    vectorizer = TfidfVectorizer
else:
    vectorizer = CountVectorizer()

# fit it to dataset
vectorizer.fit(x_train)

vocabulary = np.array(vectorizer.get_feature_names())
print("word_count is", len(vocabulary))

word_count is 1483


### Prepare Datatset

In [13]:
# create doc count vectors
train_doc_vectors = vectorizer.transform(x_train).toarray()
test_doc_vectors = vectorizer.transform(x_test).toarray()

wdf_train = pd.DataFrame(train_doc_vectors, columns=vocabulary)
wdf_test = pd.DataFrame(test_doc_vectors, columns=vocabulary)

In [14]:
wdf_train.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0


In [156]:
print(f"train_word_doc_freq matrix sparsity = {calculate_sparsity(wdf_train):.4f}")

train_word_doc_freq matrix sparsity = 0.9790


### Word Word Co-Occurence Probability

In [121]:
alpha = 0
wdf_train_prime = wdf_train.copy()

wdt_train = wdf_train_prime.copy()
wdt_train["__labels__"] = y_train

word_doc_count = wdf_train_prime.sum(0)
word_word_pr_distr = pd.DataFrame(data=0.0, columns=vocabulary, index=vocabulary)

for word in tqdm(vocabulary):
    pxy = (wdf_train_prime[wdf_train_prime[word] > 0].sum(0) + alpha) / (word_doc_count[word] + alpha)
    word_word_pr_distr[word] = pxy * (word_doc_count[word] / word_doc_count)

print(f"word_word_pr_distr shape = {word_word_pr_distr.shape}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1483.0), HTML(value='')))


word_word_pr_distr shape = (1483, 1483)


In [122]:
word_word_pr_distr.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
000,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0
031349,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
10,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
11,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16th,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
print(f"word_word_pr_distr matrix sparsity = {calculate_sparsity(word_word_pr_distr):.4f}")

word_word_pr_distr matrix sparsity = 0.9603


In [135]:
word = "space"
given_word = "science"
word_word_pr_distr[word][given_word]

0.0

### Word Word Co-Occurence Prime Probability

In [401]:
num_of_iterations = 3

In [402]:
def func(x, pbar, word_word_pr_distr_prime):
    pbar.update(1)
    return word_word_pr_distr_prime.apply(lambda y: x*y, axis=0).max(0)

In [403]:
# word_word_pr_distr_prime = word_word_pr_distr.copy()
# with tqdm(total=len(vocabulary)*num_of_iterations) as pbar:
#     for _ in range(num_of_iterations):
#         word_word_pr_distr_prime = word_word_pr_distr_prime.apply(func, axis=1, args=(pbar, word_word_pr_distr_prime))

# print(f"word_word_pr_distr_prime shape = {word_word_pr_distr_prime.shape}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4449.0), HTML(value='')))


word_word_pr_distr_prime shape = (1483, 1483)


In [404]:
word_word_pr_distr_prime.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
000,1.0,0.125,0.5,0.071429,0.071429,0.107143,0.071429,0.1,0.125,0.5,...,0.125,0.0625,0.107143,0.125,0.25,0.05,0.5,0.5,0.05,0.0625
031349,0.125,1.0,0.083333,0.166667,0.142857,0.166667,0.2,0.125,0.083333,0.1,...,0.083333,0.071429,0.166667,0.142857,0.166667,0.142857,1.0,1.0,0.041667,0.1
10,1.0,0.166667,1.0,0.083333,0.125,0.115385,0.083333,0.2,0.166667,1.0,...,0.166667,0.076923,0.125,0.166667,0.5,0.083333,1.0,1.0,0.1,0.083333
11,0.0625,0.075,0.042857,1.0,0.25,0.25,0.1,0.0625,0.055556,0.055556,...,0.125,0.125,0.25,0.107143,0.125,0.083333,0.431472,0.25,0.083333,0.125
16th,0.166667,0.166667,0.142857,0.5,1.0,0.25,0.166667,0.166667,0.125,0.166667,...,0.333333,0.125,0.25,0.333333,0.428571,0.214286,0.666667,0.466667,0.166667,0.133333


### Modified word word pr distr properties

In [405]:
wwds = calculate_sparsity(word_word_pr_distr)
wwdm = word_word_pr_distr.mean().mean()
wwdstd = word_word_pr_distr.mean().mean()

wwdps = calculate_sparsity(word_word_pr_distr_prime)
wwdpm = word_word_pr_distr_prime.mean().mean()
wwdpstd = word_word_pr_distr_prime.mean().mean()

print("                                           | sparsity | mean   | std")
print("-----------------------------------------------------------------------")
print(f"word_word_pr_distr matrix sparsity         | {wwds:.4f}   | {wwdm:.4f} | {wwdstd:.4f}")
print(f"word_word_pr_distr_prime matrix sparsity   | {wwdps:.4f}   | {wwdpm:.4f} | {wwdpstd:.4f}")

                                           | sparsity | mean   | std
-----------------------------------------------------------------------
word_word_pr_distr matrix sparsity         | 0.9603   | 0.0215 | 0.0215
word_word_pr_distr_prime matrix sparsity   | 0.0000   | 0.1748 | 0.1748


##### Word Entropy

In [406]:
word_inference_weight = calculate_word_inference_weight(word_word_pr_distr)
word_inference_weight_prime = calculate_word_inference_weight(word_word_pr_distr_prime)

# word_inference_weight.head()

In [407]:
words = ["the", "be", "science", "space", "god", "religion"]

print(f'{"word":16s} | {"weight":6s} | {"prime_weight":6s}')
print("-----------------------------------------------------")
for w1 in words:
    print(f"{w1:16s} | {word_inference_weight[w1]:.4f} | {word_inference_weight_prime[w1]:.4f}")

word             | weight | prime_weight
-----------------------------------------------------
the              | 0.0000 | 0.0003
be               | 0.0038 | 0.0000
science          | 0.4704 | 0.0300
space            | 0.6439 | 0.0352
god              | 0.5162 | 0.0311
religion         | 0.4124 | 0.0272


#### word word relation comparison with modified word_word_co matrix

In [573]:
word = "politics"
given_word = "race"
word_word_pr_distr[word][given_word], word_word_pr_distr_prime[word][given_word]

(0.0, 0.16666666666666666)

In [409]:
doc_index1 = 1
doc_index2 = 10

doc_indices = [doc_index1, doc_index2]

for doc_index in doc_indices[:3]:
    print(f"Topic: {y_train[doc_index]}\n{'='*50}\n{x_train[doc_index][:512]}")
    print()

Topic: talk.politics.mideast
your ignorance be obvious from your post 1 cyprus be an independent country with turkish greek inhabitant not a greek island like your ignorant post claim 2 the name should be cyprus in english next time read and learn before you post

Topic: rec.autos
well the mgb be currently in production for the english market built by rover it now have a v8 improve suspention and a slightly update body too bad it s only available in gb and would set one of u back about 42 000



In [553]:
ct = Counter()
ctp = Counter()

for given_word, wfx in wdf_train.iloc[doc_index1][wdf_train.iloc[doc_index1] > 0].items():
    if not wfx > 0:
        continue
        
    for word, wfy in wdf_train.iloc[doc_index2][wdf_train.iloc[doc_index2] > 0].items():
        xv = word_word_pr_distr[word][given_word] * word_word_pr_distr[given_word][word]# * word_inference_weight[given_word] * word_inference_weight[word]
        if xv > ct[given_word]:
#             print(given_word, word, xv)
            ct[given_word] = xv
            
        xv = word_word_pr_distr_prime[word][given_word] * word_word_pr_distr_prime[given_word][word]#word_inference_weight_prime[given_word] * word_inference_weight_prime[word]
        if xv > ctp[given_word]:
            print(given_word, word, xv)
            ctp[given_word] = xv

an 000 0.022727272727272728
an about 0.1122994652406417
an be 0.14120904476234425
an the 0.14887406171809842
and 000 0.013214146910221531
and 42 0.024096385542168676
and about 0.09922041105598865
and and 1.0
be 000 0.023579498935647612
be about 0.12433098818182271
be and 0.3418751146718855
be be 1.0
before 000 0.041666666666666664
before 42 0.08333333333333333
before english 0.16666666666666666
claim 000 0.041666666666666664
claim 42 0.08333333333333333
claim english 0.16666666666666666
country 000 0.0625
country 42 0.125
country english 0.25
cyprus 000 0.125
cyprus 42 0.25
cyprus english 0.5
english 000 0.25
english 42 0.5
english english 1.0
from 000 0.009615384615384616
from 42 0.019230769230769232
from about 0.0407239819004525
from and 0.0778498609823911
from be 0.1030847325263569
from it 0.1054945054945055
greek 000 0.08333333333333333
greek 42 0.16666666666666666
greek english 0.3333333333333333
ignorance 000 0.0625
ignorance 42 0.125
ignorance english 0.25
ignorant 000 0.0416666

In [554]:
num_of_topwords = 30

tw = ct.most_common(num_of_topwords)
twp = ctp.most_common(num_of_topwords)

num_of_topwords = len(tw)

print("====================================================")
print(f"printing {num_of_topwords} top co occuring words")
print("====================================================\n")

print(f'{"word":16s} | {"pr":6s} | {"pr_prime":6s}')
print("--------------------------------------------------")

for i in range(num_of_topwords):
    print(f"{tw[i][0][:16]:16s} | {tw[i][1]:.4f} | {twp[i][1]:.4f}")

printing 30 top co occuring words

word             | pr     | pr_prime
--------------------------------------------------
and              | 1.0000 | 1.0000
be               | 1.0000 | 1.0000
english          | 1.0000 | 1.0000
in               | 1.0000 | 1.0000
the              | 1.0000 | 1.0000
cyprus           | 0.5000 | 0.5000
independent      | 0.5000 | 0.5000
inhabitant       | 0.5000 | 0.5000
island           | 0.5000 | 0.5000
learn            | 0.5000 | 0.5000
name             | 0.5000 | 0.5000
turkish          | 0.5000 | 0.5000
you              | 0.3520 | 0.3520
greek            | 0.3333 | 0.3333
not              | 0.3169 | 0.3169
country          | 0.2500 | 0.2500
ignorance        | 0.2500 | 0.2500
next             | 0.2500 | 0.2500
with             | 0.1997 | 0.1997
before           | 0.1667 | 0.1667
claim            | 0.1667 | 0.1667
ignorant         | 0.1667 | 0.1667
obvious          | 0.1667 | 0.1667
your             | 0.1645 | 0.1645
an               | 0.1489 | 0.1489
re

#### Doc doc relation

In [412]:
given_doc_index = 1
print(f"Topic: {y_train[given_doc_index]}\n{'='*50}\n{x_train[given_doc_index][:512]}")
print()

Topic: talk.politics.mideast
your ignorance be obvious from your post 1 cyprus be an independent country with turkish greek inhabitant not a greek island like your ignorant post claim 2 the name should be cyprus in english next time read and learn before you post



In [555]:
given_doc = wdf_train.iloc[given_doc_index][wdf_train.iloc[given_doc_index] > 0]
doc_doc_pr_distr = pd.DataFrame(data=0.0, columns=given_doc.index.tolist(), index=wdf_train.index.tolist())

for doc_index in tqdm(range(len(y_train))):
    doc = wdf_train.iloc[doc_index][wdf_train.iloc[doc_index] > 0]
    
    for given_word, wfx in given_doc.items():
        doc_doc_pr_distr.iloc[doc_index][given_word] = max(
            word_word_pr_distr_prime[word][given_word] * word_word_pr_distr_prime[given_word][word] for word, wfy in doc.items()
#             word_word_pr_distr_prime[word][given_word] * word_inference_weight_prime[word] for word, wfy in doc.items()
#             word_word_pr_distr_prime[word][given_word] * word_inference_weight_prime[word] * word_inference_weight_prime[given_word] for word, wfy in doc.items()
        )
        
print(f"doc_doc_pr_distr shape = {doc_doc_pr_distr.shape}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=112.0), HTML(value='')))


doc_doc_pr_distr shape = (112, 31)


In [556]:
doc_doc_pr_distr.head()

Unnamed: 0,an,and,be,before,claim,country,cyprus,english,from,greek,...,obvious,post,read,should,the,time,turkish,with,you,your
0,1.0,0.465237,1.0,0.111111,1.0,0.166667,0.333333,0.166667,1.0,0.222222,...,0.111111,1.0,0.083333,0.089286,1.0,0.170455,0.333333,0.199705,1.0,0.25
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.141209,0.341875,1.0,0.048387,0.106667,0.060606,0.12,0.06,0.120445,0.090909,...,0.055838,0.131579,0.042453,0.087156,0.669725,0.075359,0.12,0.17132,1.0,1.0
3,0.148874,0.465237,1.0,0.047619,0.050761,0.045455,0.090909,0.045455,0.103085,0.060606,...,0.107143,0.15,0.050459,0.087156,1.0,1.0,0.090909,1.0,0.35199,0.154315
4,0.178977,0.465237,1.0,0.166667,0.166667,0.09375,0.1875,0.09375,1.0,0.125,...,0.105263,1.0,0.050459,0.089286,1.0,0.075359,0.1875,0.199705,1.0,0.25


In [557]:
n_clusters = 2

# cluster the topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(doc_doc_pr_distr)
print("latent_vector, kmeans_inertia = ", kmeans.inertia_)

for ii in range(n_clusters):
    print(Counter(y_train[kmeans.labels_ == ii]))

print()

latent_vector, kmeans_inertia =  145.34738323975643
Counter({'talk.politics.mideast': 6, 'alt.atheism': 4, 'rec.autos': 4, 'sci.space': 2})
Counter({'rec.autos': 28, 'sci.space': 25, 'alt.atheism': 22, 'talk.politics.mideast': 21})



In [561]:
topsize = 100

In [562]:
indices = kmeans.transform(doc_doc_pr_distr).mean(1).argsort(axis=0)[::-1]
indices[:3]

Counter(y_train[indices][:topsize]).most_common()

[('rec.autos', 27),
 ('talk.politics.mideast', 25),
 ('alt.atheism', 25),
 ('sci.space', 23)]

In [563]:
Counter(y_train[doc_doc_pr_distr.mean(1).sort_values(ascending=False).head(topsize).index]).most_common()

[('rec.autos', 29),
 ('talk.politics.mideast', 25),
 ('alt.atheism', 24),
 ('sci.space', 22)]