In [6]:
# Path of the input corpus. The loading cell below opens it as UTF-8 text and
# treats every line as one document.
document_fname="/home/hank/Backup_data/data/finalex_reuters-cleaned-document_without_zeros.txt"

### Reading documents & Calculating term frequency

In [8]:
from collections import Counter, defaultdict
import sys
from utils import get_process_memory

# Inverted index: term -> {document index -> count of the term in that document}.
M_term_doc = defaultdict(dict)
with open(document_fname, encoding='utf-8') as f:
    # Every line of the input file is one document, tokenised on whitespace.
    for d, doc in enumerate(f):
        tf = Counter(doc.split())
        for t, freq in tf.items():
            M_term_doc[t][d] = freq
        if d % 1000 != 0:
            continue
        # Progress line every 1000 documents; '\r' rewrites it in place.
        sys.stdout.write('\r inserting ... %d docs, mem= %.3f Gb' %(d+1, get_process_memory()))

 inserting ... 203001 docs, mem= 2.509 Gb

### Importing BOC models

In [15]:
import glob

# Word-to-concept CSV files to process. A single model is pinned here; the
# commented-out line is the alternative that picks up every w2c*.csv via glob.
#word2concept_fname=[ef for ef in glob.glob("/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c*.csv")]
word2concept_fname = [
    '/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d200_w8_mf50_c200.csv',
]
print(word2concept_fname)

['/home/hank/Desktop/projects/auto_concept_labeling_POC/trained_results/w2c_d200_w8_mf50_c200.csv']


### Method for calculating co-occurrence between two terms within the same document

In [10]:
def cooccurrence(w1, w2):
    """Return the co-occurrence weight of two terms over the documents they share.

    Both terms are looked up in the global ``M_term_doc`` index
    (term -> {document: term frequency}). For every document of ``w1`` in which
    ``w2`` also has a non-zero frequency, the larger of the two frequencies is
    added to the total.

    Args:
        w1: First term; its documents drive the iteration.
        w2: Second term.

    Returns:
        The accumulated weight, or 0 when either term is unknown or the two
        terms never appear in the same document.
    """
    postings_1 = M_term_doc.get(w1, {})
    postings_2 = M_term_doc.get(w2, {})
    return sum(
        max(freq_1, postings_2[doc_id])
        for doc_id, freq_1 in postings_1.items()
        # A missing or zero frequency for w2 means the document is not shared.
        if postings_2.get(doc_id, 0)
    )

In [13]:
def get_tf(w):
    """Return the total frequency of term ``w`` summed over all documents.

    Reads the global ``M_term_doc`` index (term -> {document: term frequency}).
    The lookup goes through ``.get`` so that asking about an unknown term
    returns 0 without inserting an empty entry into the ``defaultdict`` index.

    Args:
        w: Term to look up.

    Returns:
        Sum of the per-document frequencies of ``w``; 0 for an unseen term.
    """
    return sum(M_term_doc.get(w, {}).values())

### Iteratively calculate PMI score for words within their respective concept

In [18]:
import math

# For every word-to-concept model file: read the concept assignments, compute
# pairwise co-occurrence between words of the same concept, score each word,
# and write one "<model>_pmi.csv" file into the current directory.
for es in word2concept_fname:
    # Output name = model file name with its 4-character ".csv" suffix replaced.
    outputname=es.split("/")[-1][:-4]+"_pmi.csv"
    print(outputname)

    # --- 1. Read the word -> concept assignments -----------------------------
    # The last column of a row is the integer concept id; everything before it
    # is re-joined so that a word containing commas stays intact.
    concept_to_words = defaultdict(list)
    with open(es, encoding='utf-8') as f:
        for row in f:
            row = row.strip()
            if not row:
                # Skip blank lines instead of failing in int('').
                continue
            cols = row.split(',')
            concept = int(cols[-1])
            word = ','.join(cols[:-1])
            concept_to_words[concept].append(word)

    # --- 2. Pairwise co-occurrence between words of the same concept ---------
    M_cooccurrence = defaultdict(dict)
    for concept, words in concept_to_words.items():
        for w1 in words:
            for w2 in words:
                # Visit each unordered pair once and never pair a word with itself.
                if w1 <= w2:
                    continue
                cooc = cooccurrence(w1, w2)
                M_cooccurrence[w1][w2] = cooc
                M_cooccurrence[w2][w1] = cooc
    print('done. mem= %.3f Gb' % get_process_memory())

    # --- 3. Score every word against the other words of its concept ----------
    # NOTE(review): the score sums log(cooc / (tf1 * tf2)) over raw counts, with
    # no corpus-size normalisation -- confirm this is the intended PMI variant.
    word_to_pmi = {}
    i_words = 0
    n_words = sum(len(words) for words in concept_to_words.values())

    for concept, words in concept_to_words.items():
        for word in words:
            i_words += 1
            if i_words % 100 == 0:
                args = (i_words, n_words, get_process_memory())
                sys.stdout.write('\r computing PMI ... %d words in %d. mem= %.3f Gb' % args)

            tf_word = get_tf(word)  # invariant over the inner loop
            pmi = 0
            # .items() is required: iterating the dict itself yields only the keys.
            for word2, cooc in M_cooccurrence.get(word, {}).items():
                if not cooc:
                    # math.log(0) raises ValueError, so pairs that never co-occur
                    # are skipped. NOTE(review): they therefore contribute 0;
                    # confirm that skipping (rather than smoothing) is wanted.
                    continue
                pmi += math.log(cooc/(tf_word*get_tf(word2)))

            # Always record a score (0 for a word with no partners) so the
            # sort and write step below never hits a missing key.
            word_to_pmi[word] = pmi

    print('\ndone')

    # --- 4. Write "concept, word, score, tf, df" rows ------------------------
    with open(outputname, "w", encoding='utf-8') as f:
        for concept, words in concept_to_words.items():
            # NOTE(review): the sort is ascending (lowest score first); pass
            # reverse=True if the highest-scoring words should come first.
            topk_words = sorted(words, key=lambda x: word_to_pmi[x])
            for w in topk_words:
                # No get_df helper is defined in the visible notebook, so the
                # document frequency is taken here as the number of documents
                # containing w. NOTE(review): confirm this matches the intended
                # definition.
                df = len(M_term_doc.get(w, {}))
                f.write('%d, %s, %.3f, %d, %d\n' % (concept, w, word_to_pmi[w], get_tf(w), df))
    print('....%s created' % outputname)

w2c_d200_w8_mf50_c200pmi.csv
done. mem= 2.754 Gb
done. mem= 2.754 Gb
done. mem= 2.754 Gb
done. mem= 2.754 Gb
done. mem= 2.754 Gb
done. mem= 2.754 Gb


KeyboardInterrupt: 