In [6]:
#run preprocessing
%run preprocessing.ipynb

### Calculate TF-IDF using a self-implemented method

In [17]:

# obtain term-frequency matrix
import numpy as np
import os
import json
import time
import pandas as pd

def tf_idf_processing(raw_doc_path):
    """Build a TF-IDF matrix from the JSON-lines documents under ``raw_doc_path``.

    Every file yielded by ``os.walk`` is treated as one document. Each line of
    a file is parsed as a JSON object; only objects whose ``'type'`` equals
    ``'paragraph'`` contribute words. Their ``'content'`` is lower-cased and
    tokenised with ``preprocessing_spacy``, which is not defined in this cell
    (presumably provided by the ``%run preprocessing.ipynb`` cell -- confirm).

    Processing is best-effort: if reading or parsing a file fails, a message
    is printed, the words collected from that file so far are kept and the
    walk continues with the next file.

    Parameters
    ----------
    raw_doc_path : str
        Root directory that is walked recursively.

    Returns
    -------
    raw_tf_idf : numpy.ndarray
        Float array of shape ``(n_words, n_documents)``. Entry ``[w, d]`` is
        ``tf * idf`` with ``tf = count(w, d) / total_words(d)`` and
        ``idf = ln(n_documents / documents_containing(w))``. Documents that
        yielded no words get an all-zero column.
    word2index : dict
        Maps each word to its row index.
    index2document : dict
        Maps each column index to the file name (without directory part).
    """
    word2index = {}
    index2document = {}
    # One list of word indices per document; list position == matrix column.
    # Documents are tracked by position rather than by file name so that
    # equally named files in different sub-directories keep separate columns.
    document_word_ids = []

    for root, _dirs, files in os.walk(raw_doc_path):
        for f in files:
            print('.', end='')
            index2document[len(document_word_ids)] = f
            word_ids = []
            document_word_ids.append(word_ids)
            with open(os.path.join(root, f)) as fs:
                try:
                    for line in fs:
                        # each line is a JSON object; only paragraphs are indexed
                        obj = json.loads(line)
                        if obj['type'] != 'paragraph':
                            continue
                        for w in preprocessing_spacy(obj['content'].lower()):
                            # first sighting of a word reserves the next free row
                            word_ids.append(word2index.setdefault(w, len(word2index)))
                except Exception as err:
                    # Exception (not a bare except) keeps Ctrl-C / SystemExit working
                    print('Error while processing: ', f, repr(err))

    # raw word-count matrix: rows are words, columns are documents
    n_words, n_docs = len(word2index), len(document_word_ids)
    w_f_matrix = np.zeros((n_words, n_docs))
    for col, word_ids in enumerate(document_word_ids):
        for row in word_ids:
            w_f_matrix[row, col] += 1

    # normalised term frequency; empty documents stay at 0 instead of becoming
    # NaN (NaN counts as "non-zero" and would inflate every document frequency)
    doc_lengths = w_f_matrix.sum(axis=0)
    t_f = np.divide(
        w_f_matrix,
        doc_lengths,
        out=np.zeros_like(w_f_matrix),
        where=doc_lengths > 0,
    )

    # inverse document frequency; every indexed word occurs in >= 1 document,
    # so the denominator is never zero
    doc_freq = np.count_nonzero(w_f_matrix, axis=1)
    inv_doc_freq = np.log(n_docs / doc_freq)

    raw_tf_idf = t_f * inv_doc_freq.reshape(-1, 1)
    return raw_tf_idf, word2index, index2document

In [18]:
def find_top_k_words(tf_idf, word2index, index2document, k):
    """Return, per document, the ``k`` words with the highest TF-IDF score.

    Parameters
    ----------
    tf_idf : numpy.ndarray
        2-D array indexed as ``[word_index, document_index]``.
    word2index : dict
        Maps each word to its row index in ``tf_idf``.
    index2document : dict
        Maps each column index of ``tf_idf`` to a file name.
    k : int
        Number of words to report per document. Values larger than the
        vocabulary are clamped to the vocabulary size; values ``<= 0`` yield
        empty columns.

    Returns
    -------
    pandas.DataFrame
        One column per document, named after the file name up to its first
        ``'.'``. Rows hold the selected words in ascending score order, so the
        highest-scoring word is in the last row.
    """
    n_words, n_docs = tf_idf.shape
    # argpartition rejects k > n_words, and a "[-0:]" slice would select
    # every word instead of none, so clamp k to a usable range first
    top_n = max(0, min(k, n_words))

    # invert the vocabulary once instead of scanning it for every lookup
    index2word = {idx: word for word, idx in word2index.items()}

    document_freq_words = {}
    for i in range(n_docs):
        column = tf_idf[:, i]
        if top_n:
            # indices of the top_n largest scores, then ordered ascending
            top = np.argpartition(column, -top_n)[-top_n:]
            top = top[np.argsort(column[top])]
        else:
            top = []
        filename = index2document[i].split('.')[0]
        document_freq_words[filename] = [index2word[int(ind)] for ind in top]

    # built once, after the loop; also well-defined when there are no documents
    return pd.DataFrame(document_freq_words)

In [32]:
# %%time
# raw_doc_path = '/home/bit/ma0/LabShare/data/three-companies/json'
# raw_tf_idf, word2index, index2document = tf_idf_processing(raw_doc_path)

### Find most frequent keywords using the self-implemented TF-IDF method

In [33]:
# find_top_k_words(raw_tf_idf, word2index, index2document, 10)

### Calculate TF-IDF on preprocessed documents, to find most frequent words

In [38]:
import pickle

def tf_idf_PrerocessedDoc(processed_docu_path, document_limit):
    """Build a TF-IDF matrix from pickled, already tokenised documents.

    Every file yielded by ``os.walk`` is treated as one document and is
    expected to unpickle to an iterable of words. Processing is best-effort:
    if a file cannot be unpickled or iterated, a message is printed, whatever
    was collected from it so far is kept and the walk continues.

    Parameters
    ----------
    processed_docu_path : str
        Root directory that is walked recursively.
    document_limit : int
        Maximum number of files to read across the whole directory tree. The
        counter is decremented per file and the walk stops when it reaches
        exactly 0, so a value ``<= 0`` never triggers and all files are read.

    Returns
    -------
    new_tf_idf : numpy.ndarray
        Float array of shape ``(n_words, n_documents)``. Entry ``[w, d]`` is
        ``tf * idf`` with ``tf = count(w, d) / total_words(d)``. Documents
        without words get an all-zero column.
    word2index : dict
        Maps each word to its row index.
    index2document : dict
        Maps each column index to the file name (without directory part).
    inv_doc_freq : numpy.ndarray
        1-D array of length ``n_words`` holding
        ``ln(n_documents / documents_containing(w))``.
    """
    word2index = {}
    index2document = {}
    # One list of word indices per document; list position == matrix column.
    # Documents are tracked by position rather than by file name so that
    # equally named files in different sub-directories keep separate columns.
    document_word_ids = []

    remaining = document_limit
    limit_reached = False
    for root, _dirs, files in os.walk(processed_docu_path):
        for f in files:
            print('.', end='')
            remaining -= 1
            index2document[len(document_word_ids)] = f
            word_ids = []
            document_word_ids.append(word_ids)
            with open(os.path.join(root, f), 'rb') as fs:
                try:
                    # SECURITY: pickle.load can execute arbitrary code; only
                    # point this function at corpus files you produced yourself.
                    words = pickle.load(fs)
                    for w in words:
                        # first sighting of a word reserves the next free row
                        word_ids.append(word2index.setdefault(w, len(word2index)))
                except Exception as err:
                    # Exception (not a bare except) keeps Ctrl-C / SystemExit working
                    print('Error while processing: ', f, repr(err))
            if remaining == 0:
                limit_reached = True
                break
        if limit_reached:
            # also leave the directory walk; a lone inner break would let the
            # next directory be processed with the limit already spent
            break

    # raw word-count matrix: rows are words, columns are documents
    n_words, n_docs = len(word2index), len(document_word_ids)
    w_f_matrix = np.zeros((n_words, n_docs))
    for col, word_ids in enumerate(document_word_ids):
        for row in word_ids:
            w_f_matrix[row, col] += 1

    # normalised term frequency; empty documents stay at 0 instead of becoming
    # NaN (NaN counts as "non-zero" and would inflate every document frequency)
    doc_lengths = w_f_matrix.sum(axis=0)
    t_f = np.divide(
        w_f_matrix,
        doc_lengths,
        out=np.zeros_like(w_f_matrix),
        where=doc_lengths > 0,
    )

    # inverse document frequency; every indexed word occurs in >= 1 document,
    # so the denominator is never zero
    doc_freq = np.count_nonzero(w_f_matrix, axis=1)
    inv_doc_freq = np.log(n_docs / doc_freq)

    new_tf_idf = t_f * inv_doc_freq.reshape(-1, 1)
    return new_tf_idf, word2index, index2document, inv_doc_freq

In [39]:
# %%time
# processed_doc_path = './LabShare/data/chui_ma/spacy_corpus'
# new_tf_idf, new_word2index, new_index2document, inv_doc_freq = tf_idf_PrerocessedDoc(processed_doc_path, 1000)

In [40]:
# %%time
# find_top_k_words(new_tf_idf, new_word2index, new_index2document, 10)