### Calculate TF-IDF from scrach -- complete code

In [48]:
#run preprocessing
%run preprocessing.ipynb

In [39]:
%%time
# obtain term-frequency matrix
import numpy as np
import os
import json
import time
import pandas as pd
word2index = {}
document2index = {}
index2document = {}
document_word_vectors = {}
w_cnt = 0
d_cnt = 0
for root, dirs, files in os.walk('sample'):
    for f in files:
        document_word_vectors[f] = []
        document2index[f] = d_cnt
        index2document[d_cnt] = f
        d_cnt+=1
        with open(root+'/'+f) as fs:
            start_time = time.time()
            try:
                for line in fs:
                    #loads json file, preprocess the content
                    obj = json.loads(line)
                    textType = obj['type']
                    if textType == 'paragraph':
#                         words = nltkPreprocessing(obj['content'].lower())
                        words = spacyPreprocessing(obj['content'].lower())
                        for w in words:
                            if w not in word2index:
                                #reserve word_index dict, for referening later on
                                word2index[w] = w_cnt
                                w_cnt+=1
                            document_word_vectors[f].append(word2index[w])
            except:
                print ('error in: ',f)
            end_time = time.time()
#         print(f, 'running time: ', end_time-start_time)

#create word_frequency matrix                        
w_f_matrix = np.zeros((len(word2index),len(document2index)))
for doc in document_word_vectors:
    i = document2index[doc]
    for j in document_word_vectors[doc]:
        w_f_matrix[j,i]+=1 
        
# obtain normalized term-frequency matrix
t_f = np.copy(w_f_matrix)
sum_f = np.zeros(len(document2index))
for i in range(len(document2index)):
    sum_f[i] = np.sum(t_f[:,i])
t_f = np.divide(t_f,sum_f)  

# obtaining tf-idf matrix
inv_doc_freq = np.count_nonzero(t_f,axis=1)
def normalize(a,x):
    return np.log(x/a)
norm = np.vectorize(normalize)
inv_doc_freq = norm(inv_doc_freq,len(document2index))
raw_tf_idf = np.multiply(t_f,inv_doc_freq.reshape(-1,1))


def find_top_k_words(tf_idf, k):
    document_freq_words = {}
    for i in range(tf_idf.shape[1]):
        freq_words = []
        #get indices of k-maximum values in numpy column
        index = np.argpartition(tf_idf[:, i], -k)[-k:]
        index = index[np.argsort(tf_idf[:, i][index])]
        for ind in index:
            #find frequent words with coresponding index
            freq_words.append(list(word2index.keys())[list(word2index.values()).index(ind)])
        filename = index2document[i].split('.')[0]
        document_freq_words[filename] = freq_words
        df = pd.DataFrame(document_freq_words)
    return df

CPU times: user 11min 27s, sys: 32.1 s, total: 11min 59s
Wall time: 1min 29s


In [46]:
find_top_k_words(raw_tf_idf, 15)

Unnamed: 0,BMW-AnnualReport-2017,BVB-AnnualReport-2015,BVB-AnnualReport-2016,BVB-AnnualReport-2017,CarlZeissMeditec-AnnualReport-2017,CarlZeissMeditec-AnnualReport-2015,CarlZeissMeditec-AnnualReport-2016,BMW-AnnualReport-2016,BMW-AnnualReport-2015
0,weltweit,geschäftsführungsgmbh,champions,champions,microsurgery,chirurgische,chirurgische,weltweit,auto
1,motorrad,ziff,signal,kgaa,geräte,sbu,oraya,motorrad,werk
2,roce,kgaa,kgaa,spieler,sbu,ophthalmologie,aaren,roce,mobilität
3,next,signal,sportlich,signal,ole,therapeutics,microsurgery,next,d
4,mobilität,sportlich,haftend,league,aaren,patienten,geschäftseinheit,mobilität,x
5,rollsroyce,haftend,eur,sportlich,ärzte,mikrochirurgie,ophthalmologie,rollsroyce,vorzugsaktien
6,mio,mannschaft,iduna,haftend,thompson,geschäftseinheit,devices,mio,modelle
7,vorzugsaktien,league,saison,bvb,devices,diagnose,ophthalmic,vorzugsaktien,fahrzeug
8,motorräder,iduna,fc,uefa,veracity,jena,sbu,motorräder,motorräder
9,fahrzeug,uefa,bvb,iduna,ophthalmic,oraya,patienten,fahrzeug,fahrzeuge


### Calculate TF-IDF using Scikit-Learn

In [26]:
%%time
def documentProcessing():
    dictionary = set()
    documents_list = []
    files_list = []
    for root, dirs, files in os.walk('sample'):
        for f in files:
            with open(root+'/'+f) as fs:
                files_list.append(f)
                words_list = ''
                try:
                    for line in fs:
                        #loads json file, preprocess the content
                        obj = json.loads(line)
                        textType = obj['type']
                        if textType == 'paragraph':
#                             words = nltkPreprocessing(obj['content'].lower())
                            words = spacyPreprocessing(obj['content'])
                            for word in words:
                                dictionary.add(word)
                                words_list += word + ' '
                except:
                    print (f)
            documents_list.append(words_list)
    return dictionary, documents_list, files_list

dictionary, documents_list, files_list = documentProcessing()

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

sklearn_vector = TfidfVectorizer(vocabulary=dictionary)
sklearn_tf_idf = sklearn_vector.fit_transform(documents_list)

def sklearn_find_top_k_words(sklearn_vector, tf_idf, k, filenames):
    document_freq_words = {}
    corpus = sklearn_vector.get_feature_names()
    for i in range(tf_idf.shape[1]):
        freq_words = []
        #get indices of k-maximum values in numpy column
        index = np.argpartition(tf_idf[:, i], -k)[-k:]
        for ind in index:
            freq_words.append(corpus[ind])
        document_freq_words[filenames[i].split('.')[0]] = freq_words
        df = pd.DataFrame(document_freq_words)
    return df

CPU times: user 11min 27s, sys: 30.9 s, total: 11min 58s
Wall time: 1min 29s


In [35]:
sk_tf_idf = np.transpose(sklearn_tf_idf)
sklearn_find_top_k_words(sklearn_vector, sk_tf_idf.toarray(), 10, files_list)

Unnamed: 0,BMW-AnnualReport-2017,BVB-AnnualReport-2015,BVB-AnnualReport-2016,BVB-AnnualReport-2017,CarlZeissMeditec-AnnualReport-2017,CarlZeissMeditec-AnnualReport-2015,CarlZeissMeditec-AnnualReport-2016,BMW-AnnualReport-2016,BMW-AnnualReport-2015
0,vorstands,höhe,vorjahr,bvb,patienten,wesentlich,wesentlich,vorstands,vorstand
1,vorstand,kgaa,uefa,höhe,geschäftsjahr,unternehmen,geschäftsjahr,vorstand,vorstands
2,risiken,vorjahr,league,juni,konzerns,mio,mio,risiken,risiken
3,unternehmen,teur,kgaa,vorjahr,mio,geschäftsjahr,zeiss,unternehmen,unternehmen
4,höhe,eur,dortmund,sportlich,vorjahr,vj,unternehmen,höhe,automobile
5,wesentlich,gmbh,sportlich,teur,tsd,konzern,carl,wesentlich,höhe
6,automobile,borussia,bvb,eur,konzern,carl,konzern,automobile,group
7,group,dortmund,gmbh,gmbh,carl,tsd,vj,group,mio
8,mio,saison,teur,borussia,meditec,meditec,meditec,mio,bmw
9,bmw,sportlich,borussia,dortmund,zeiss,zeiss,tsd,bmw,aufsichtsrat
