In [1]:
document_fname="/home/hank/Backup_data/data/finalex_reuters-cleaned-document_without_zeros.txt"

### Reading documents & Calculating term frequency

In [2]:
from collections import Counter, defaultdict
import sys
from utils import get_process_memory

M_term_doc = defaultdict(lambda: {})
with open(document_fname, encoding='utf-8') as f:
    for d, doc in enumerate(f):
        tf = Counter(doc.split())
        for t, freq in tf.items():
            M_term_doc[t][d] = freq
        if d % 1000 == 0: 
            sys.stdout.write('\r inserting ... %d docs, mem= %.3f Gb' %(d+1, get_process_memory()))

 inserting ... 203001 docs, mem= 2.508 Gb

### Importing BOC models

In [2]:
import glob

word2concept_fname=[ef for ef in glob.glob("/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c*.csv")]
print(word2concept_fname)
print(len(word2concept_fname))

['/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d300_w8_mf50_c100.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d200_w8_mf50_c200.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d300_w8_mf50_c300.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d200_w8_mf50_c100.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d100_w8_mf50_c300.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d200_w8_mf50_c300.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d100_w8_mf50_c100.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d100_w8_mf50_c200.csv', '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d300_w8_mf50_c200.csv']
9


### Method for calculating co-occurence between two terms within same document

In [4]:
def cooccurrence(w1, w2):
    docs1 = M_term_doc.get(w1, {})
    docs2 = M_term_doc.get(w2, {})
    cooccurrence = 0
    for d1, tf_d1w1 in docs1.items():
        tf_d1w2 = docs2.get(d1, 0)
        if not tf_d1w2:
            continue
        cooccurrence += max(tf_d1w1, tf_d1w2)
    return cooccurrence

### Methods for calculating sparsity score

In [13]:
norm = lambda x, p: 0 if (not x or p == 0) else pow(sum(v ** p for v in x.values()), 1/p)
sparsity = lambda x, m, n, km_over_kn: (km_over_kn - (norm(x,m)/norm(x,n))) / (0.000000000000000001 + km_over_kn - 1)

### Iteratively calculate sparsity score for words in all of the BOC models

In [14]:
for es in word2concept_fname:
    outputname=es.split("/")[-1][:-4]+"_sparsity.csv"
    print(outputname)
    concept_to_words = defaultdict(lambda: [])
    with open(es, encoding='utf-8') as f:
        for row in f:
            cols = row.strip().split(',')
            concept = int(cols[-1])
            words = ','.join(cols[:-1])
            concept_to_words[concept].append(words)
    M_cooccurrence = defaultdict(lambda: {})
    for concept, words in concept_to_words.items():
        #print('concept= %d (%d words) ... ' % (concept, len(words)), end='')
        for w1 in words:
            for w2 in words:
                if w1 <= w2: continue
                cooc = cooccurrence(w1, w2)
                M_cooccurrence[w1][w2] = cooc
                M_cooccurrence[w2][w1] = cooc
        #print('done. mem= %.3f Gb' % get_process_memory())
    word_to_sparsity = {}

    m = 1
    n = 2

    i_words = 0
    n_words = sum((len(words) for words in concept_to_words.values()))

    for concept, words in concept_to_words.items():
        k = len(words)
        km_over_kn = pow(k, 1/m) / pow(k, 1/n)

        for word in words:
            i_words += 1
            if i_words % 100 == 0:
                args = (i_words, n_words, get_process_memory())
                sys.stdout.write('\r computing sparsity ... %d words in %d. mem= %.3f Gb' % args)

            cooccurrence_vector = M_cooccurrence.get(word, {})

            if not cooccurrence_vector:
                continue

            if len(cooccurrence_vector) == 1:
                word_to_sparsity[word] = 1
                continue
            try:
                word_to_sparsity[word] = sparsity(cooccurrence_vector, m, n, km_over_kn)
            except ZeroDivisionError:
                print("...Zero Division Error!")
                word_to_sparsity[word] = -1
                continue
    print('\ndone')
    get_sparsity = lambda w:word_to_sparsity[w]
    get_tf = lambda w:sum(M_term_doc[w].values())
    get_df = lambda w:len(M_term_doc[w])
    
    with open(outputname, "w") as f:
        for concept, words in concept_to_words.items():
            topk_words = sorted(words, key=lambda x:word_to_sparsity[x])
            for w in topk_words:
                f.write('%d, %s, %.3f, %d, %d\n' % (concept, w, get_sparsity(w), get_tf(w), get_df(w)))
    print('....%s created' % outputname)

w2c_d200_w8_mf50_c200_sparsity.csv
 computing sparsity ... 40000 words in 40069. mem= 3.391 Gb
done
....w2c_d200_w8_mf50_c200_sparsity.csv created
w2c_d300_w8_mf50_c300_sparsity.csv
 computing sparsity ... 35000 words in 40069. mem= 3.391 Gb...Zero Division Error!
 computing sparsity ... 40000 words in 40069. mem= 3.391 Gb...Zero Division Error!

done
....w2c_d300_w8_mf50_c300_sparsity.csv created
w2c_d200_w8_mf50_c100_sparsity.csv
 computing sparsity ... 40000 words in 40069. mem= 3.410 Gb
done
....w2c_d200_w8_mf50_c100_sparsity.csv created
w2c_d100_w8_mf50_c300_sparsity.csv
 computing sparsity ... 6400 words in 40069. mem= 3.409 Gb...Zero Division Error!
 computing sparsity ... 9100 words in 40069. mem= 3.409 Gb...Zero Division Error!
 computing sparsity ... 9400 words in 40069. mem= 3.409 Gb...Zero Division Error!
...Zero Division Error!
...Zero Division Error!
 computing sparsity ... 11500 words in 40069. mem= 3.409 Gb...Zero Division Error!
 computing sparsity ... 13400 words in 4