In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### imports

In [2]:
print(__doc__)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

Automatically created module for IPython interactive environment


### utilities

In [70]:
def sigmoid(x):
    return 1 / (1 + (np.e**-x))
                
def clean_documents(docs):
    def clean_doc(doc):
        ''' remove unwanter characters line new line '''

        unwanted_chrs = [')', '(', '{', '}', '\t', '\n', '\r', "'", '"', "!"]
        for unwanted_chr in unwanted_chrs:
            doc = doc.replace(unwanted_chr, ' ')

        return doc.strip()
    
    return [clean_doc(doc) for doc in docs]

def build_topic_word_distr(topics, word_topic_cos, words, topic_word_window_width, word_doc_frequency):
    topic_word_distr = pd.DataFrame(data=0.0, columns=topics, index=words)

    for topic in tqdm(range(len(topics))):
        word_topic_co = word_topic_cos[topic]
        word_word_co = pd.DataFrame(data=0.0, columns=word_topic_co[:topic_word_window_width].index, index=words)

        for index, (top_word, corelation) in enumerate(word_topic_co.items()):
            if index == topic_word_window_width:
                break

            word_word_frequency = corelation * word_doc_binary_freqency[word_doc_binary_freqency[top_word] > 0].sum(0)
            trust_factor = sigmoid(word_doc_frequency)

            word_word_co[top_word] = (word_word_frequency * trust_factor) / word_doc_frequency
        topic_word_distr[topics[topic]] = word_word_co.max(1)
    return topic_word_distr

def infer_topic(topics, doc_vector, topic_word_distr):
    doc_topic_word_distr = topic_word_distr.copy()
    doc_word_freq_norm = (doc_vector > 0).astype(int)
#     doc_word_freq_norm = doc_vector / doc_vector.sum() if doc_vector.sum() else 0

    for topic in topics:
        doc_topic_word_distr[topic] *= doc_word_freq_norm
    
    return doc_topic_word_distr, np.max(doc_topic_word_distr).idxmax()

### load dataset

In [71]:
# total number of samples needed
datasize = 1000
randomize = False

# retrieve dataset
docs = fetch_20newsgroups(subset='train', shuffle=randomize, remove=('headers', 'footers', 'quotes'))
docs, old_labels, classes = docs.data[:datasize], docs.target[:datasize], docs.target_names

In [72]:
# the actual labels as np array
old_labels = np.array(old_labels)
labels = np.zeros(old_labels.shape, dtype=int)

# the new classes
label_classes = list(set([x.split('.')[0] for x in classes]))

# restructuring classes  from 19 to less
for label, cl in enumerate(classes):
    labels[old_labels == label] = label_classes.index(cl.split('.')[0])

In [73]:
print(f"there are {len(docs)} docs and {len(label_classes)} classes: {label_classes}")

there are 1000 docs and 7 classes: ['comp', 'talk', 'sci', 'soc', 'alt', 'rec', 'misc']


### clean dataset

In [74]:
# clean out the new line characters from text in docs
clean_docs = clean_documents(docs)
clean_docs[0]

'morgan and guzman will have era s 1 run higher than last year, and  the cubs will be idiots and not pitch harkey as much as hibbard.  castillo won t be good  i think he s a stud pitcher'

### count words

In [75]:
# initialize the count vectorizer
count_vectorizer = CountVectorizer()
# count_vectorizer = TfidfVectorizer()

# fit it to dataset
count_vectorizer.fit(clean_docs)

# create dataset
word_count = pd.DataFrame(count_vectorizer.vocabulary_, index=[0])

print("word_count shape is", word_count.shape)

word_count shape is (1, 19476)


In [76]:
word_count.head()

Unnamed: 0,morgan,and,guzman,will,have,era,run,higher,than,last,...,optilink,molested,w4wg,lastdrive,refund,lurch,conical,cornea,skysweepers,skies
0,12098,2746,8904,19022,9063,7336,15421,9215,17511,10791,...,12940,12042,18740,10792,14779,11250,5281,5505,16267,16249


### Prepare Datatset

In [77]:
# create doc count vectors
doc_vectors = count_vectorizer.transform(clean_docs).toarray()

train_doc_vectors, test_doc_vectors, train_labels, test_labels = train_test_split(doc_vectors, labels, test_size=.33, random_state=42)
print(f"{len(train_labels)} train_docs, {len(test_labels)} test docs")

670 train_docs, 330 test docs


In [78]:
document_word_frequency = pd.DataFrame(train_doc_vectors, columns=count_vectorizer.get_feature_names())
document_word_binary_frequency = (document_word_frequency > 0).astype('int')

document_word_frequency["__labels__"] = train_labels
document_word_binary_frequency["__labels__"] = train_labels

print("document_word_frequency shape is", document_word_frequency.shape)

document_word_frequency shape is (670, 19477)


In [79]:
print(f"there are {len(clean_docs)} docs and {len(label_classes)} classes")

there are 1000 docs and 7 classes


In [80]:
document_word_frequency.head()

Unnamed: 0,00,000,0000,00000000,00000000b,00000001,00000001b,00000010,00000010b,00000011,...,zoom,zoomed,zooming,zubov,zum,zupancic,zx,zx900a,zzz,__labels__
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [81]:
document_word_binary_frequency.head()

Unnamed: 0,00,000,0000,00000000,00000000b,00000001,00000001b,00000010,00000010b,00000011,...,zoom,zoomed,zooming,zubov,zum,zupancic,zx,zx900a,zzz,__labels__
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


### Cherry pick dataset

In [82]:
# trim the data to words that spread round the corpus

#reduce freq in doc to bin value of 1 or 0
word_doc_binary_freqency = document_word_binary_frequency.drop(["__labels__"], axis='columns')

#the sum vertically of bin freq
word_doc_frequency = word_doc_binary_freqency.sum(0) 

### Topic and word corelation

In [83]:
word_topic_cos = []
for topic, label in enumerate(label_classes):
    word_topic_frequency = word_doc_binary_freqency[document_word_frequency['__labels__'] == topic].sum(0)
    trust_factor = sigmoid(word_doc_frequency)
    
    word_topic_co = (word_topic_frequency * trust_factor) / word_doc_frequency
    word_topic_co = word_topic_co[word_topic_co > 0.5].sort_values(ascending=False)
    
    word_topic_cos.append(word_topic_co)
    print(f"topic {topic} has {word_topic_co.shape} skew words")

topic 0 has (2108,) skew words
topic 1 has (4005,) skew words
topic 2 has (2502,) skew words
topic 3 has (694,) skew words
topic 4 has (217,) skew words
topic 5 has (2444,) skew words
topic 6 has (352,) skew words


In [84]:
label_classes

['comp', 'talk', 'sci', 'soc', 'alt', 'rec', 'misc']

In [85]:
topic_index = label_classes.index('sci')
word_topic_cos[topic_index][word_topic_cos[topic_index] > 0].sort_values(ascending=False)

encryption    0.999998
clipper       0.999994
escrow        0.999665
lunar         0.999089
orbit         0.997527
                ...   
step          0.555487
sort          0.550000
safety        0.545445
science       0.538460
cause         0.526316
Length: 2502, dtype: float64

### Building topic model

In [86]:
converged = False
last_accuracy = last_max_accuracy = 0
window_size = 100
window_step = window_base_step = 100
decay_factor = 10

while not converged:
    print("Building Topic_word_distr_prime...")
    topic_word_distr_prime = build_topic_word_distr(label_classes, word_topic_cos, word_doc_binary_freqency.columns, window_size, word_doc_frequency)

    score = 0
    print("Evaluating Topic Model...")
    for doc_index in tqdm(range(len(train_labels))):
        doc_vector = train_doc_vectors[doc_index]
        doc_topic_word_distr, doc_topic = infer_topic(label_classes, doc_vector, topic_word_distr_prime)
        score += int(doc_topic == label_classes[train_labels[doc_index]])

    accuracy = score / (doc_index + 1)
    print(f"==> topic_word_distr_prime has shape {topic_word_distr_prime.shape} from window_size {window_size} and window_step {window_step} accuracy is {accuracy*100:.2f}%\n")
    
    if abs(accuracy - last_max_accuracy) < .001:
        print("accuracy low", abs(accuracy - last_max_accuracy))
        converged = True
        
    elif accuracy >= last_max_accuracy:
        window_size += window_step
        last_max_accuracy = accuracy
    
    else:
        if last_accuracy == last_max_accuracy:
            window_size -= window_step
            window_step = int(window_step / decay_factor)
            
            if not window_step:
                print("window decayed!!")
                converged = False
        window_size += window_step
        
    last_accuracy = accuracy

Building Topic_word_distr_prime...


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


==> topic_word_distr_prime has shape (19476, 7) from window_size 100 and window_step 100 accuracy is 76.12%

Building Topic_word_distr_prime...


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


==> topic_word_distr_prime has shape (19476, 7) from window_size 200 and window_step 100 accuracy is 82.54%

Building Topic_word_distr_prime...


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


==> topic_word_distr_prime has shape (19476, 7) from window_size 300 and window_step 100 accuracy is 85.67%

Building Topic_word_distr_prime...


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


==> topic_word_distr_prime has shape (19476, 7) from window_size 400 and window_step 100 accuracy is 85.52%

Building Topic_word_distr_prime...


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


==> topic_word_distr_prime has shape (19476, 7) from window_size 310 and window_step 10 accuracy is 85.67%

accuracy low 0.0


### Testing Topic Model

In [87]:
score = 0
print("Evaluating Topic Model...")
for doc_index in tqdm(range(len(test_labels))):
    doc_vector = test_doc_vectors[doc_index]
    doc_topic_word_distr, doc_topic = infer_topic(label_classes, doc_vector, topic_word_distr_prime)
    score += int(doc_topic == label_classes[test_labels[doc_index]])
    
    if score:
        continue
    
#     print(clean_docs[len(train_labels)+doc_index])
#     print("{:8s} {:16s} {:6s}".format("topic", "word", "relation"))
#     print("="*40)
#     for label, word in doc_topic_word_distr.idxmax().items():
#         print("{:8s} {:16s} {:.4f}".format(label, word, doc_topic_word_distr[label][word]))

#     print(f"\nthe topic predicted is ==> '{np.max(doc_topic_word_distr).idxmax()}'")
#     print(f"the actual topic is ==> '{label_classes[labels[doc_index]]}'")

accuracy = score / (doc_index + 1)
print(f"==> topic_word_distr has shape {topic_word_distr_prime.shape} from window_size {window_size} and window_step {window_step} test-accuracy is {accuracy*100:.2f}%\n")

Evaluating Topic Model...


HBox(children=(FloatProgress(value=0.0, max=330.0), HTML(value='')))


==> topic_word_distr has shape (19476, 7) from window_size 310 and window_step 10 test-accuracy is 62.73%

