In [1]:
%load_ext autoreload
%autoreload 2

### imports and utilities

In [3]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\christian\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [151]:
# from std lib
import re, string
from collections import Counter

# from thrid party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.special import softmax
from scipy.stats import norm
from scipy.stats import entropy as calculate_entropy


from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

#### Utilities

In [152]:
def get_wordnet_pos(word, use_pos):
    if not use_pos:
        return 'n'

    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "r":wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# clean out the new line characters from text in docs
def clean_doc(doc, use_pos=False):
    ''' remove unwanter characters line new line '''

    unwanted_chrs = list(string.punctuation)
    # unwanted_chrs = [')', '(', '{', '}', '\t', '\n', '\r', "'", '"', "!", ",", ".", "?", ">", "<", "[", "]"]

    doc = doc.lower()
    for unwanted_chr in unwanted_chrs:
        doc = doc.replace(unwanted_chr, ' ')

    doc = word_tokenize(doc)

    word_count = len(doc)
    doc = " ".join([wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word, use_pos)) for word in doc])

    status = (len(doc) != 0 and not doc.isspace())

    return status, doc, word_count

def calculate_sparsity(matrix):
    non_zero = np.count_nonzero(matrix)
    total_val = np.product(matrix.shape)
    sparsity = (total_val - non_zero) / total_val
    return sparsity



### load dataset

In [5]:
dataset = "newsgroup"

# total number of samples needed
randomize = False

# retrieve dataset
categories = ['rec.autos', 'talk.politics.mideast', 'alt.atheism', 'sci.space']

all_docs = fetch_20newsgroups(subset='train', shuffle=randomize, remove=('headers', 'footers', 'quotes'), categories=categories)
all_docs, old_labels, categories = all_docs.data, all_docs.target, all_docs.target_names

In [6]:
# dataset = "bbc"

# data = pd.read_csv('bbcsport.csv')

# all_docs = data["text"].to_list()
# old_labels = data["topic"].to_list()
# categories = classes = np.unique(data["topic"]).tolist()

### clean dataset

In [7]:
datasize = 40
min_document_length = 160
max_document_length = 256


index = -1
docs, labels, label_indices = [], [], []

sizes = [0]*len(categories)

with tqdm(total=len(categories)*datasize) as pbar:
    while sum(sizes) < len(categories)*datasize:
        index += 1
        label_index = old_labels[index]
            
        if sizes[label_index] == datasize:
            continue
        
        doc = all_docs[index]
        status, doc, word_count = clean_doc(doc, True)
        
        if not status:
            continue
            
        if min_document_length is not None and len(doc) < min_document_length:
            continue
            
        if max_document_length is not None and len(doc) > max_document_length:
            continue
        
        label_indices.append(label_index)
        labels.append(categories[label_index])
        
        docs.append(doc)
        sizes[label_index] += 1
        pbar.update(1)

labels = np.array(labels)
label_indices = np.array(label_indices)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=160.0), HTML(value='')))




In [8]:
doc_index = 3
print(f"Topic: {labels[doc_index]}\n{'='*50}\n{docs[doc_index][:512]}")

Topic: rec.autos
not to mention my friend s 54 citroen traction avant with the light switch and dimmer integrate in a single stalk off the steer column those dumb french be apparently copying the japanese before the german


In [9]:
print(sizes)
assert min(sizes) == max(sizes) == datasize

[40, 40, 40, 40]


### Split data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(docs, labels, test_size =.3)

In [11]:
print(f"there are {len(docs)} total docs, {len(y_train)} train and {len(y_test)} test")

there are 160 total docs, 112 train and 48 test


### Initialize Vectorizer

In [12]:
vectorizer_type = "not-tfidf"

# initialize the count vectorizer
if vectorizer_type == "tfidf":
    vectorizer = TfidfVectorizer
else:
    vectorizer = CountVectorizer()

# fit it to dataset
vectorizer.fit(x_train)

vocabulary = np.array(vectorizer.get_feature_names())
print("word_count is", len(vocabulary))

word_count is 1483


### Prepare Datatset

In [13]:
# create doc count vectors
train_doc_vectors = vectorizer.transform(x_train).toarray()
test_doc_vectors = vectorizer.transform(x_test).toarray()

wdf_train = pd.DataFrame(train_doc_vectors, columns=vocabulary)
wdf_test = pd.DataFrame(test_doc_vectors, columns=vocabulary)

In [14]:
wdf_train.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,3,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0


In [156]:
print(f"train_word_doc_freq matrix sparsity = {calculate_sparsity(wdf_train):.4f}")

train_word_doc_freq matrix sparsity = 0.9790


### Word Word Co-Occurence Probability

In [121]:
alpha = 0
wdf_train_prime = wdf_train.copy()

wdt_train = wdf_train_prime.copy()
wdt_train["__labels__"] = y_train

word_doc_count = wdf_train_prime.sum(0)
word_word_pr_distr = pd.DataFrame(data=0.0, columns=vocabulary, index=vocabulary)

for word in tqdm(vocabulary):
    pxy = (wdf_train_prime[wdf_train_prime[word] > 0].sum(0) + alpha) / (word_doc_count[word] + alpha)
    word_word_pr_distr[word] = pxy * (word_doc_count[word] / word_doc_count)

print(f"word_word_pr_distr shape = {word_word_pr_distr.shape}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1483.0), HTML(value='')))


word_word_pr_distr shape = (1483, 1483)


In [122]:
word_word_pr_distr.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
000,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0
031349,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
10,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
11,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16th,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
print(f"word_word_pr_distr matrix sparsity = {calculate_sparsity(word_word_pr_distr):.4f}")

word_word_pr_distr matrix sparsity = 0.9603


In [135]:
word = "space"
given_word = "science"
word_word_pr_distr[word][given_word]

0.0

### Word Word Co-Occurence Prime Probability

In [168]:
num_of_iterations = 1

In [169]:
def func(x, pbar, word_word_pr_distr_prime):
    pbar.update(1)
    return word_word_pr_distr_prime.apply(lambda y: x*y, axis=0).max(0)

In [170]:
# word_word_pr_distr_prime = word_word_pr_distr.copy()
# with tqdm(total=len(vocabulary)*num_of_iterations) as pbar:
#     for _ in range(num_of_iterations):
#         word_word_pr_distr_prime = word_word_pr_distr_prime.apply(func, axis=1, args=(pbar, word_word_pr_distr_prime))

# print(f"word_word_pr_distr_prime shape = {word_word_pr_distr_prime.shape}")

In [171]:
word_word_pr_distr_prime.head()

Unnamed: 0,000,031349,10,11,16th,1900,1940,1968,1982,1984,...,ya,yeah,year,yes,yo,yorker,you,your,zeuge,zuma
000,1.0,0.065217,0.5,0.035714,0.071429,0.107143,0.071429,0.1,0.071429,0.5,...,0.125,0.038462,0.107143,0.038462,0.25,0.033333,0.5,0.5,0.045455,0.034091
031349,0.068182,1.0,0.068182,0.1,0.142857,0.09375,0.2,0.03125,0.032258,0.090909,...,0.043478,0.03125,0.142857,0.02439,0.111111,0.04,1.0,1.0,0.016129,0.1
10,1.0,0.04,1.0,0.035714,0.066667,0.107143,0.068966,0.2,0.071429,1.0,...,0.166667,0.076923,0.125,0.076923,0.5,0.066667,1.0,1.0,0.090909,0.068182
11,0.035714,0.065217,0.017857,1.0,0.25,0.25,0.1,0.035714,0.035714,0.035714,...,0.021739,0.125,0.25,0.05,0.043478,0.018349,0.431472,0.25,0.071429,0.1
16th,0.142857,0.142857,0.066667,0.5,1.0,0.066667,0.142857,0.058824,0.014286,0.066667,...,0.285714,0.018868,0.142857,0.333333,0.428571,0.066667,0.666667,0.466667,0.125,0.028571


#### Modified word word pr distr properties

In [172]:
wwds = calculate_sparsity(word_word_pr_distr)
wwdm = word_word_pr_distr.mean().mean()
wwdstd = word_word_pr_distr.mean().mean()

wwdps = calculate_sparsity(word_word_pr_distr_prime)
wwdpm = word_word_pr_distr_prime.mean().mean()
wwdpstd = word_word_pr_distr_prime.mean().mean()

print("                                           | sparsity | mean   | std")
print("-----------------------------------------------------------------------")
print(f"word_word_pr_distr matrix sparsity         | {wwds:.4f}   | {wwdm:.4f} | {wwdstd:.4f}")
print(f"word_word_pr_distr_prime matrix sparsity   | {wwdps:.4f}   | {wwdpm:.4f} | {wwdpstd:.4f}")

                                           | sparsity | mean   | std
-----------------------------------------------------------------------
word_word_pr_distr matrix sparsity         | 0.9603   | 0.0215 | 0.0215
word_word_pr_distr_prime matrix sparsity   | 0.0058   | 0.1225 | 0.1225


#### word word relation comparison with modified word_word_co matrix

In [173]:
word = "space"
given_word = "science"
word_word_pr_distr[word][given_word], word_word_pr_distr_prime[word][given_word]

(0.0, 0.1)

In [278]:
doc_index1 = 1
doc_index2 = 90

doc_indices = [doc_index1, doc_index2]

for doc_index in doc_indices[:3]:
    print(f"Topic: {y_train[doc_index]}\n{'='*50}\n{x_train[doc_index][:512]}")
    print()

Topic: talk.politics.mideast
your ignorance be obvious from your post 1 cyprus be an independent country with turkish greek inhabitant not a greek island like your ignorant post claim 2 the name should be cyprus in english next time read and learn before you post

Topic: alt.atheism
no wonder in the light of that you be a probably a theist who try to pas a an agnostic i still remember your post about your daughter sing chrismas carol and your feeling of it well



In [319]:
def calculate_word_inference_weight(word_word_pr_distr):
    entropy_values = calculate_entropy(word_word_pr_distr, axis=0)
    entropy_values_norm = (entropy_values.max() - entropy_values) / entropy_values.max()
    entropy_values_norm = pd.DataFrame(data=entropy_values_norm, index=word_word_pr_distr.columns)[0]
    return entropy_values_norm

word_inference_weight = calculate_word_inference_weight(word_word_pr_distr)
word_inference_weight_prime = calculate_word_inference_weight(word_word_pr_distr_prime)

word_inference_weight.head()

000       0.485069
031349    0.581125
10        0.595881
11        0.499414
16th      0.552060
Name: 0, dtype: float64

In [318]:
words = ["the", "be", "science", "space", "god", "religion"]

print(f'{"word":16s} | {"weight":6s} | {"prime_weight":6s}')
print("-----------------------------------------------------")
for w1 in words:
    print(f"{w1:16s} | {word_inference_weight[w1]:.4f} | {word_inference_weight_prime[w1]:.4f}")

word             | weight | prime_weight
-----------------------------------------------------
the              | 0.9800 | 0.9989
be               | 0.9805 | 0.9989
science          | 0.9993 | 0.9994
space            | 0.9998 | 0.9994
god              | 0.9995 | 0.9992
religion         | 0.9989 | 0.9992


In [310]:
words = ["the", "be", "science", "space", "god", "religion"]

print(f'{"word":16s} | {"weight":6s} | {"prime_weight":6s}')
print("-----------------------------------------------------")
for w1 in words:
    print(f"{w1:16s} | {word_inference_weight[w1]:.4f} | {word_inference_weight_prime[w1]:.4f}")

word             | weight | prime_weight
-----------------------------------------------------
the              | 0.0000 | 0.0059
be               | 0.0151 | 0.0052
science          | 0.6605 | 0.1025
space            | 0.8279 | 0.1046
god              | 0.7022 | 0.0505
religion         | 0.5921 | 0.0605


In [290]:
ct = Counter()
ctp = Counter()

for given_word, wfx in wdf_train.iloc[doc_index1][wdf_train.iloc[doc_index1] > 0].items():
    if not wfx > 0:
        continue
        
    for word, wfy in wdf_train.iloc[doc_index2][wdf_train.iloc[doc_index2] > 0].items():
        xv = word_word_pr_distr[word][given_word] * word_inference_weight[given_word]
        if xv > ct[given_word]:
#             print(given_word, word, xv)
            ct[given_word] = xv
            
        xv = word_word_pr_distr_prime[word][given_word] * word_inference_weight_prime[given_word]
        if xv > ctp[given_word]:
#             print(given_word, word, xv)
            ctp[given_word] = xv

In [291]:
num_of_topwords = 30

tw = ct.most_common(num_of_topwords)
twp = ctp.most_common(num_of_topwords)

num_of_topwords = len(tw)

print("====================================================")
print(f"printing {num_of_topwords} top co occuring words")
print("====================================================\n")

print(f'{"word":16s} | {"pr":6s} | {"pr_prime":6s}')
print("--------------------------------------------------")

for i in range(num_of_topwords):
    print(f"{tw[i][0][:16]:16s} | {tw[i][1]:.4f} | {twp[i][1]:.4f}")

printing 30 top co occuring words

word             | pr     | pr_prime
--------------------------------------------------
cyprus           | 0.7747 | 0.0922
independent      | 0.7747 | 0.0922
inhabitant       | 0.7747 | 0.0922
island           | 0.7747 | 0.0922
learn            | 0.7747 | 0.0922
name             | 0.7747 | 0.0922
turkish          | 0.7747 | 0.0922
ignorant         | 0.6987 | 0.0593
greek            | 0.6887 | 0.0588
ignorance        | 0.6833 | 0.0577
country          | 0.6778 | 0.0530
next             | 0.6713 | 0.0499
english          | 0.6619 | 0.0418
obvious          | 0.6079 | 0.0400
claim            | 0.6039 | 0.0364
before           | 0.6037 | 0.0224
read             | 0.5542 | 0.0199
should           | 0.4495 | 0.0153
from             | 0.3511 | 0.0087
post             | 0.3411 | 0.0077
your             | 0.2966 | 0.0073
an               | 0.2757 | 0.0059
time             | 0.2677 | 0.0052
like             | 0.2445 | 0.0045
not              | 0.2042 | 0.0038
wi

#### Doc doc relation

In [292]:
given_doc_index = 1
print(f"Topic: {y_train[given_doc_index]}\n{'='*50}\n{x_train[given_doc_index][:512]}")
print()

Topic: talk.politics.mideast
your ignorance be obvious from your post 1 cyprus be an independent country with turkish greek inhabitant not a greek island like your ignorant post claim 2 the name should be cyprus in english next time read and learn before you post



In [295]:
given_doc = wdf_train.iloc[given_doc_index][wdf_train.iloc[given_doc_index] > 0]
doc_doc_pr_distr = pd.DataFrame(data=0.0, columns=given_doc.index.tolist(), index=wdf_train.index.tolist())

for doc_index in tqdm(range(len(y_train))):
    doc = wdf_train.iloc[doc_index][wdf_train.iloc[doc_index] > 0]
    
    for given_word, wfx in given_doc.items():
        doc_doc_pr_distr.iloc[doc_index][given_word] = max(
            word_word_pr_distr[word][given_word] * word_inference_weight_prime[given_word] for word, wfy in doc.items()
        )
        
print(f"doc_doc_pr_distr shape = {doc_doc_pr_distr.shape}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=112.0), HTML(value='')))


doc_doc_pr_distr shape = (112, 31)


In [296]:
doc_doc_pr_distr.head()

Unnamed: 0,an,and,be,before,claim,country,cyprus,english,from,greek,...,obvious,post,read,should,the,time,turkish,with,you,your
0,0.003777,0.003152,0.005179,0.036386,0.01993,0.052979,0.092243,0.057707,0.00219,0.059331,...,0.022434,0.008743,0.039952,0.015304,0.005917,0.004476,0.092243,0.0034,0.000639,0.006183
1,0.003777,0.003792,0.005179,0.036386,0.01993,0.052979,0.092243,0.057707,0.00219,0.059331,...,0.022434,0.008743,0.039952,0.015304,0.005917,0.006155,0.092243,0.004139,0.000639,0.007728
2,0.003091,0.002969,0.005179,0.036386,0.01993,0.052979,0.092243,0.057707,0.002022,0.059331,...,0.022434,0.006558,0.039952,0.013117,0.005347,0.004476,0.092243,0.003105,0.000639,0.007728
3,0.003091,0.003152,0.005179,0.036386,0.01993,0.052979,0.092243,0.057707,0.002022,0.059331,...,0.022434,0.008197,0.039952,0.015304,0.005917,0.006155,0.092243,0.004139,0.000521,0.006183
4,0.003091,0.003152,0.005179,0.036386,0.01993,0.052979,0.092243,0.057707,0.00219,0.059331,...,0.022434,0.008743,0.039952,0.015304,0.005917,0.004476,0.092243,0.0034,0.000639,0.006183


In [300]:
topsize =  20
doc_doc_pr_distr.mean(1).sort_values(ascending=False).head(topsize)

1      0.037381
90     0.037281
48     0.037265
75     0.037264
100    0.037257
69     0.037255
26     0.037254
102    0.037246
29     0.037244
85     0.037241
77     0.037241
37     0.037239
99     0.037233
81     0.037233
3      0.037233
60     0.037232
78     0.037228
61     0.037225
32     0.037225
13     0.037224
dtype: float64

In [301]:
y_train[doc_doc_pr_distr.mean(1).sort_values(ascending=False).head(topsize).index]

array(['talk.politics.mideast', 'alt.atheism', 'talk.politics.mideast',
       'talk.politics.mideast', 'alt.atheism', 'rec.autos',
       'talk.politics.mideast', 'alt.atheism', 'talk.politics.mideast',
       'rec.autos', 'talk.politics.mideast', 'alt.atheism',
       'talk.politics.mideast', 'talk.politics.mideast', 'sci.space',
       'rec.autos', 'rec.autos', 'talk.politics.mideast',
       'talk.politics.mideast', 'rec.autos'], dtype='<U21')

In [302]:
n_clusters = 2

# cluster the topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(doc_doc_pr_distr)
print("latent_vector, kmeans_inertia = ", kmeans.inertia_)

for ii in range(n_clusters):
    print(Counter(y_train[kmeans.labels_ == ii]))

print()

latent_vector, kmeans_inertia =  0.004794562966925015
Counter({'rec.autos': 30, 'talk.politics.mideast': 27, 'sci.space': 27, 'alt.atheism': 26})
Counter({'rec.autos': 2})



In [259]:
kmeans.cluster_centers_.shape

(2, 31)

In [692]:
n_clusters = 4

wdf_train_prime = wdf_train.copy()
wdf_test_prime = wdf_test.copy()
    
# cluster the topics
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(wdf_train_prime)
print("wdf_train, kmeans_inertia = ", kmeans.inertia_)

clusters = []
categories_prime = list(range(n_clusters))

y_train_prime = kmeans.predict(wdf_train_prime)
y_test_prime = kmeans.predict(wdf_test_prime)

for ii in range(n_clusters):
    clusters.append(Counter(y_train[y_train_prime == ii]))
    print(clusters[-1])

print()

wdf_train, kmeans_inertia =  4230.109369977496
Counter({'sci.space': 17, 'talk.politics.mideast': 16, 'rec.autos': 16, 'alt.atheism': 12})
Counter({'talk.politics.mideast': 3, 'alt.atheism': 2, 'rec.autos': 1})
Counter({'sci.space': 10, 'rec.autos': 8, 'talk.politics.mideast': 6, 'alt.atheism': 4})
Counter({'alt.atheism': 7, 'talk.politics.mideast': 4, 'sci.space': 4, 'rec.autos': 2})



In [693]:
# y_train[doc_indices]

In [694]:
doc_index1 = 0
print(f"Topic: {y_train[doc_index1]}\n{'='*50}\n{x_train[doc_index1][:512]}")
print()

doc_index2 = 3
print(f"Topic: {y_train[doc_index2]}\n{'='*50}\n{x_train[doc_index2][:512]}")

Topic: rec.autos
not to mention my friend s 54 citroen traction avant with the light switch and dimmer integrate in a single stalk off the steer column those dumb french be apparently copying the japanese before the german

Topic: rec.autos
right in the thirty both buick and packard have two spare mount in well in the front fender of course that be back when the front fender be long enough to provide room there be a couple of other marque that do this a well but memory fade


In [695]:
dfx_li[doc_index1][doc_index2].max(1).sort_values(ascending=False).head(10)

integrate     1.0
in            1.0
and           1.0
apparently    1.0
avant         1.0
be            1.0
before        1.0
citroen       1.0
column        1.0
copying       1.0
dtype: float64

In [696]:
dfx_li[doc_index1][doc_index2].mean(1).sort_values(ascending=False).head(10)

in         0.216949
the        0.187317
be         0.187181
light      0.185714
to         0.184783
and        0.181193
mention    0.171429
steer      0.171429
single     0.171429
off        0.171429
dtype: float64

In [697]:
print(xv[xv < 0].mean(0).mean())
xv[xv < 0].mean(0).sort_values(ascending=False).head(10)

-0.5468186228727564


reuseable     -0.328365
reminder      -0.328365
version       -0.328365
nls           -0.328365
bastard       -0.328365
definitely    -0.328365
thank         -0.328365
suitable      -0.328365
sub           -0.328365
spacelifter   -0.328365
dtype: float64

In [549]:
dfx_li[doc_index1][doc_index2].max(0).sort_values(ascending=False).head(16)

that         1.0
be           1.0
have         1.0
the          1.0
police       1.0
will         1.0
with         1.0
you          0.6
believe      0.5
close        0.5
detection    0.5
detector     0.5
off          0.5
enough       0.5
forget       0.5
frequency    0.5
dtype: float64

In [550]:
dfx_li[doc_index1][doc_index2].max(1).sort_values(ascending=False).head(16)

read          1.0
with          1.0
will          1.0
be            1.0
conception    1.0
concern       1.0
concrete      1.0
creation      1.0
detail        1.0
differ        1.0
during        1.0
fit           1.0
force         1.0
have          1.0
idea          1.0
above         1.0
dtype: float64

In [210]:
wdf_train.iloc[idx2].sort_values(ascending=True)

000         0
pity        0
pimentel    0
pilot       0
piece       0
           ..
to          2
fuel        2
how         2
the         2
be          3
Name: 7, Length: 1467, dtype: int64

In [186]:
dfx.max(0).sort_values(ascending=True).head(15)

memory     0.050847
thirty     0.050847
buick      0.050847
room       0.050847
packard    0.050847
fade       0.050847
fender     0.050847
marque     0.050847
spare      0.067797
course     0.067797
mount      0.067797
front      0.067797
two        0.080000
both       0.107317
when       0.120000
dtype: float64

In [183]:
dfx.sum(0).sort_values(ascending=False).head(15)

the       4.303757
be        3.994595
to        3.977974
and       3.803540
in        3.734835
of        3.445151
that      2.753275
have      2.407538
do        2.102149
this      2.010799
but       1.688663
there     1.236046
well      0.891024
couple    0.836783
long      0.835313
dtype: float64

In [129]:
result = []
doc_index = 3
print(f"Topic: {labels[doc_index]}\n{'='*50}\n{docs[doc_index][:512]}")

width = 0
for i, word in enumerate(wdf_train.columns):
    result.append(gnbs[word].predict_log_proba(wdf_test.loc[[doc_index]])[0])
    if len(result) > width:
        width = len(result)

res = pd.DataFrame(data=result, index=wdf_train.columns).fillna(0)

Topic: rec.autos
not to mention my friend s 54 citroen traction avant with the light switch and dimmer integrate in a single stalk off the steer column those dumb french be apparently copying the japanese before the german


In [134]:
res.sum(1).sort_values()

and     -5.121664e+10
the     -5.065995e+10
mount   -3.989702e+10
spare   -3.989702e+10
issue   -3.711350e+10
             ...     
make    -1.855667e+09
we      -1.855666e+09
at      -1.670100e+09
me      -1.670099e+09
get     -1.113398e+09
Length: 1467, dtype: float64

In [113]:
res[1].sort_values(ascending=True).head(20)

be          -4.082481e+09
the         -7.422644e+08
that        -5.567038e+08
you         -3.711371e+08
of          -1.855654e+08
not         -4.879017e+03
play         0.000000e+00
planetary    0.000000e+00
plane        0.000000e+00
placement    0.000000e+00
place        0.000000e+00
pity         0.000000e+00
pimentel     0.000000e+00
pilot        0.000000e+00
piece        0.000000e+00
pickup       0.000000e+00
phone        0.000000e+00
phobos       0.000000e+00
peter        0.000000e+00
pertains     0.000000e+00
Name: 0, dtype: float64

In [99]:
np.log(0)

  """Entry point for launching an IPython kernel.


-inf

In [27]:
wdf_train_prime.head()

Unnamed: 0,000,031349,10,1000,11,14,16th,1900,1940,1982,...,yeah,year,yes,yo,yorker,you,your,yourself,zeuge,__labels__
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [64]:
kmeans.predict(wdf_train_prime)

array([2, 0, 2, 2, 2, 0, 0, 3, 1, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 3, 0,
       0, 0, 3, 3, 2, 0, 2, 2, 2, 3, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0,
       0, 1, 0, 3, 0, 3, 3, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       3, 0, 0, 3, 2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 1, 2, 0, 0, 3, 3, 0, 0,
       0, 0, 3, 2, 3, 0, 0, 1, 0, 2, 0, 2, 1, 0, 1, 0, 0, 0, 0, 3, 0, 2,
       0, 3])

In [18]:
# the word topic distr
wdt_train = wdf_train_prime.copy()
wdt_test = wdf_test_prime.copy()

wdt_test["__labels__"] = y_test_prime
wdt_train["__labels__"] = y_train_prime

word_doc_count = wdf_train_prime.sum(0)
topic_doc_count = pd.DataFrame(Counter(y_train_prime), index=[0]).T[0]
word_topic_pr_distr = pd.DataFrame(data=0.0, columns=categories_prime, index=vocabulary)

for category in tqdm(categories_prime):
    pxy = wdf_train_prime[wdt_train["__labels__"] == category].sum(0) / topic_doc_count[category]
    word_topic_pr_distr[category] = pxy * (topic_doc_count[category] / word_doc_count)

print(f"word_topic_pr_distr shape = {word_topic_pr_distr.shape}")

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


word_topic_pr_distr shape = (1467, 4)


In [19]:
word_topic_pr_distr.head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
31349,1.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0
1000,1.0,0.0,0.0,0.0
11,0.0,0.0,1.0,0.0


In [62]:
word_topic_entropy = calculate_entropy(word_topic_pr_distr, axis=1)
word_topic_entropy_norm = (word_topic_entropy.max() - word_topic_entropy) / word_topic_entropy.max()

order_factor = word_topic_entropy_norm #* (word_doc_count / word_doc_count.max())

order_indices = order_factor.argsort()[::-1]
word_topic_pr_distr.iloc[order_indices].head(10)

Unnamed: 0,0,1,2,3
zeuge,1.0,0.0,0.0,0.0
ftp,0.0,0.0,1.0,0.0
full,0.0,1.0,0.0,0.0
fundraise,0.0,0.0,1.0,0.0
funny,1.0,0.0,0.0,0.0
further,1.0,0.0,0.0,0.0
fusi,1.0,0.0,0.0,0.0
gain,1.0,0.0,0.0,0.0
gallon,0.0,0.0,0.0,1.0
game,0.0,1.0,0.0,0.0


In [61]:
word = "space"
calculate_entropy(word_topic_pr_distr.loc[word]), npword_doc_count[word]

(0.6730116670092565, 5)

In [60]:
word_doc_count.mean

<bound method Series.mean of 000          2
031349       1
10           1
1000         1
11           2
            ..
yorker       1
you         87
your        23
yourself     1
zeuge        1
Length: 1467, dtype: int64>

In [20]:
# the entropy and softma for reducing relation
word_topic_pr_entropy = calculate_entropy(word_topic_pr_distr, axis=1)

# the norm freq of the word in topic
word_topic_pr_freq_weight = word_topic_pr_distr.sum(1)


# word_topic_pr_infl = 1 - softmax(word_topic_pr_entropy)

# influence
# wdf_train_prime *= word_topic_pr_infl

In [21]:
wdf_train_prime.max().max()

7

In [22]:
word_topic_pr_freq_weight.max()

1.0

In [23]:
word_topic_pr_entropy

array([0.       , 0.       , 0.       , ..., 1.0554635, 0.       ,
       0.       ])