In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

### imports

In [2]:
print(__doc__)

import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import entropy
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

Automatically created module for IPython interactive environment


### utilities

In [33]:
def sigmoid(x):
    return 1 / (1 + (np.e**-x))
                
def clean_documents(docs):
    def clean_doc(doc):
        ''' remove unwanter characters line new line '''

        unwanted_chrs = [')', '(', '{', '}', '\t', '\n', '\r', "'", '"', "!"]
        for unwanted_chr in unwanted_chrs:
            doc = doc.replace(unwanted_chr, ' ')

        return doc.strip()
    
    return [clean_doc(doc) for doc in docs]

def build_topic_word_distr(topics, word_topic_cos, words, topic_word_window_width, word_doc_frequency):
    topic_word_distr = pd.DataFrame(data=0.0, columns=topics, index=words)

    for topic, word_topic_co in enumerate(word_topic_cos):
        word_word_co = pd.DataFrame(data=0.0, columns=word_topic_co[:topic_word_window_width].index, index=words)

        for index, (top_word, corelation) in enumerate(word_topic_co.items()):
            if index == topic_word_window_width:
                break

            word_word_frequency = corelation * word_doc_binary_freqency[word_doc_binary_freqency[top_word] > 0].sum(0)
            trust_factor = sigmoid(word_doc_frequency)

            word_word_co[top_word] = (word_word_frequency * trust_factor) / word_doc_frequency
        topic_word_distr[topics[topic]] = word_word_co.max(1)
    return topic_word_distr

def infer_topic(topics, doc_vector, topic_word_distr):
    doc_topic_word_distr = topic_word_distr.copy()
    doc_word_freq_norm = doc_vector / doc_vector.sum() if doc_vector.sum() else 0

    for topic in topics:
        doc_topic_word_distr[topic] *= doc_word_freq_norm
        
    return np.max(doc_topic_word_distr).idxmax()

### load dataset

In [4]:
# total number of samples needed
datasize = 1000

# retrieve dataset
docs = fetch_20newsgroups(subset='train', shuffle=False, remove=('headers', 'footers', 'quotes'))
docs, old_labels, classes = docs.data[:datasize], docs.target[:datasize], docs.target_names

In [5]:
# the actual labels as np array
old_labels = np.array(old_labels)
labels = np.zeros(old_labels.shape, dtype=int)

# the new classes
label_classes = list(set([x.split('.')[0] for x in classes]))

# restructuring classes  from 19 to less
for label, cl in enumerate(classes):
    labels[old_labels == label] = label_classes.index(cl.split('.')[0])

In [6]:
print(f"there are {len(docs)} docs and {len(label_classes)} classes: {label_classes}")

there are 1000 docs and 7 classes: ['comp', 'misc', 'alt', 'soc', 'sci', 'rec', 'talk']


### clean dataset

In [7]:
# clean out the new line characters from text in docs
clean_docs = clean_documents(docs)
clean_docs[0]

'morgan and guzman will have era s 1 run higher than last year, and  the cubs will be idiots and not pitch harkey as much as hibbard.  castillo won t be good  i think he s a stud pitcher'

### count words

In [8]:
# initialize the count vectorizer
count_vectorizer = CountVectorizer()

# fit it to dataset
count_vectorizer.fit(clean_docs)

# create dataset
word_count = pd.DataFrame(count_vectorizer.vocabulary_, index=[0])

print("word_count shape is", word_count.shape)

word_count shape is (1, 19476)


In [9]:
word_count.head()

Unnamed: 0,morgan,and,guzman,will,have,era,run,higher,than,last,...,optilink,molested,w4wg,lastdrive,refund,lurch,conical,cornea,skysweepers,skies
0,12098,2746,8904,19022,9063,7336,15421,9215,17511,10791,...,12940,12042,18740,10792,14779,11250,5281,5505,16267,16249


### Prepare Datatset

In [10]:
# create doc count vectors
doc_vectors = count_vectorizer.transform(clean_docs).toarray()

document_word_frequency = pd.DataFrame(doc_vectors, columns=count_vectorizer.get_feature_names())
document_word_binary_frequency = (document_word_frequency > 0).astype('int')

document_word_frequency["__labels__"] = labels
document_word_binary_frequency["__labels__"] = labels

print("document_word_frequency shape is", document_word_frequency.shape)

document_word_frequency shape is (1000, 19477)


In [11]:
print(f"there are {len(clean_docs)} docs and {len(label_classes)} classes")

there are 1000 docs and 7 classes


In [12]:
document_word_frequency.head()

Unnamed: 0,00,000,0000,00000000,00000000b,00000001,00000001b,00000010,00000010b,00000011,...,zoom,zoomed,zooming,zubov,zum,zupancic,zx,zx900a,zzz,__labels__
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [13]:
document_word_binary_frequency.head()

Unnamed: 0,00,000,0000,00000000,00000000b,00000001,00000001b,00000010,00000010b,00000011,...,zoom,zoomed,zooming,zubov,zum,zupancic,zx,zx900a,zzz,__labels__
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


### Cherry pick dataset

In [14]:
# trim the data to words that spread round the corpus

#reduce freq in doc to bin value of 1 or 0
word_doc_binary_freqency = document_word_binary_frequency.drop(["__labels__"], axis='columns')

#the sum vertically of bin freq
word_doc_frequency = word_doc_binary_freqency.sum(0) 

### Topic and word corelation

In [15]:
word_topic_cos = []
for topic, label in enumerate(label_classes):
    word_topic_frequency = word_doc_binary_freqency[document_word_frequency['__labels__'] == topic].sum(0)
    trust_factor = sigmoid(word_doc_frequency)
    
    word_topic_co = (word_topic_frequency * trust_factor) / word_doc_frequency
    word_topic_co = word_topic_co[word_topic_co > 0.5].sort_values(ascending=False)
    
    word_topic_cos.append(word_topic_co)
    print(f"topic {topic} has {word_topic_co.shape} skew words")

topic 0 has (2828,) skew words
topic 1 has (410,) skew words
topic 2 has (280,) skew words
topic 3 has (725,) skew words
topic 4 has (3275,) skew words
topic 5 has (3320,) skew words
topic 6 has (4516,) skew words


In [16]:
label_classes

['comp', 'misc', 'alt', 'soc', 'sci', 'rec', 'talk']

In [19]:
topic_index = label_classes.index('sci')
word_topic_cos[topic_index][word_topic_cos[topic_index] > 0].sort_values(ascending=False)

encryption    1.000000
clipper       1.000000
escrow        0.999877
lunar         0.999665
sci           0.999089
                ...   
gain          0.545445
bits          0.545445
technical     0.533333
higher        0.529412
space         0.515152
Length: 3275, dtype: float64

### Building topic model

In [34]:
window_size = 1
topic_word_distr = build_topic_word_distr(label_classes, word_topic_cos, word_doc_binary_freqency.columns, window_size, word_doc_frequency)

score = 0
for doc_index, doc_vector in tqdm_notbook(enumerate(doc_vectors)):
    doc_topic = infer_topic(label_classes, doc_vector, topic_word_distr)
    score += int(doc_topic == label_classes[labels[doc_index]])

accuracy = score / (doc_index + 1)
print(f"topic_word_distr has shape {topic_word_distr.shape} from window_size {topic_word_window_width} accuracy is {accuracy}")

topic_word_distr has shape (19476, 7)
accuracy is 0.442


In [38]:
# from tqdm.notebook import tqdm
import tqdm

ModuleNotFoundError: No module named 'tqdm'

In [29]:
doc_topic

'rec'