# Calculate TF-IDF

In [None]:
# Execute preprocessing.ipynb so that the names it defines become available to the
# cells below.
# NOTE(review): no active code in the cells visible here calls a helper from that
# notebook by name - confirm this cell is still required.
%run preprocessing.ipynb

### Read files and construct term frequency matrix

In [57]:
%%time
# obtain term-frequency matrix
import numpy as np
import os
import json
import spacy
import string
import time
# Bookkeeping shared with the later cells:
#   word2index            lemma -> row index of the term-frequency matrix
#   document2index        file name -> column index
#   index2document        column index -> file name
#   document_word_vectors file name -> list of row indices, one entry per kept token
word2index = {}
document2index = {}
index2document = {}
document_word_vectors = {}
w_cnt = 0
d_cnt = 0
# NOTE(review): the 'de' shortcut is only resolved by older spaCy releases; newer
# ones need the full package name (e.g. 'de_core_news_sm') - confirm against the
# installed version before changing it.
nlp = spacy.load('de')
# Built once instead of once per input line: deletes ASCII digits and punctuation.
strip_translator = str.maketrans('', '', string.digits + string.punctuation)
for root, dirs, files in os.walk('sample'):
    for f in files:
        document_word_vectors[f] = []
        document2index[f] = d_cnt
        index2document[d_cnt] = f
        d_cnt+=1
        # Explicit encoding so that umlauts and '€' are decoded the same way on every
        # platform (assumes the report files are UTF-8, the standard JSON encoding).
        with open(os.path.join(root, f), encoding='utf-8') as fs:
            print('processing: ', f)
            start_time = time.time()
            try:
                for line in fs:
                    # every line holds one JSON object; only paragraphs are indexed
                    obj = json.loads(line)
                    if obj['type'] != 'paragraph':
                        continue
                    text = obj['content'].lower().translate(strip_translator)
                    for w in nlp(text):
                        if w.is_stop or w.is_punct or w.is_digit or w.is_space:
                            continue
                        lemma = w.lemma_
                        if lemma not in word2index:
                            word2index[lemma] = w_cnt
                            w_cnt+=1
                        document_word_vectors[f].append(word2index[lemma])
            except Exception as err:
                # Best effort: a malformed line abandons the rest of this file but not
                # the whole run. Unlike a bare `except`, this lets KeyboardInterrupt
                # through and reports what actually went wrong.
                print(f, '- skipped remainder of file:', repr(err))
            end_time = time.time()
            print('running time: ', end_time-start_time)

# create word_frequency matrix: rows are vocabulary terms, columns are documents
w_f_matrix = np.zeros((len(word2index),len(document2index)))
for doc, word_ids in document_word_vectors.items():
    # bincount tallies how often every row index occurs in this document
    counts = np.bincount(np.asarray(word_ids, dtype=np.intp), minlength=len(word2index))
    w_f_matrix[:, document2index[doc]] = counts

processing:  BMW-AnnualReport-2017.json
running time:  12.872901439666748
processing:  BVB-AnnualReport-2015.json
running time:  7.554543733596802
processing:  BVB-AnnualReport-2016.json
running time:  8.048486471176147
processing:  BVB-AnnualReport-2017.json
running time:  9.21908950805664
processing:  CarlZeissMeditec-AnnualReport-2017.json
running time:  8.141857147216797
processing:  CarlZeissMeditec-AnnualReport-2015.json
running time:  7.0159618854522705
processing:  CarlZeissMeditec-AnnualReport-2016.json
running time:  7.3509156703948975
processing:  BMW-AnnualReport-2016.json
running time:  12.51590609550476
processing:  BMW-AnnualReport-2015.json
running time:  14.098376989364624
CPU times: user 11min 6s, sys: 30.4 s, total: 11min 36s
Wall time: 1min 27s


### Obtain normalized term-frequency matrix

In [58]:
# obtain normalized term-frequency matrix: every column (document) is divided by
# its total token count
sum_f = w_f_matrix.sum(axis=0)
# A document that contributed no tokens (e.g. its file could not be parsed) has a
# zero total; its column is left at 0 instead of turning into NaN through 0/0.
t_f = np.divide(w_f_matrix, sum_f, out=np.zeros_like(w_f_matrix), where=sum_f != 0)

### Obtain tf-idf matrix

In [59]:
# obtaining tf-idf matrix
def normalize(a,x):
    """Return the inverse document frequency log(x / a).

    a -- number of documents containing the term (scalar or array)
    x -- total number of documents
    """
    return np.log(x/a)
# Element-wise wrapper kept only so that code relying on the name keeps working;
# `normalize` already broadcasts over arrays, so it is applied directly below.
norm = np.vectorize(normalize)
# document frequency: in how many documents (columns) each term has a non-zero tf
doc_freq = np.count_nonzero(t_f,axis=1)
# Clamp to 1 so that a term occurring in no document cannot cause a division by
# zero; such a row is all zeros in t_f, so its tf-idf stays 0 either way.
inv_doc_freq = normalize(np.maximum(doc_freq, 1), len(document2index))
# scale every row (term) of the tf matrix by that term's idf
tf_idf = np.multiply(t_f,inv_doc_freq.reshape(-1,1))

In [60]:
# (rows, columns) = (vocabulary terms, documents)
tf_idf.shape

(20113, 9)

In [61]:
# Display the tf-idf matrix (NumPy abbreviates large arrays in the repr)
tf_idf

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.93019491e-04, 1.51762151e-04, 8.25878619e-05, ...,
        0.00000000e+00, 3.93019491e-04, 4.05164129e-04],
       [1.70923424e-04, 9.42872417e-05, 0.00000000e+00, ...,
        0.00000000e+00, 1.70923424e-04, 1.84595824e-04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.27312447e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.27312447e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.27312447e-05]])

### Find the words with the highest tf-idf score in each document

In [103]:
#https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array

def find_top_k_words(tf_idf, k, doc_names=None, vocab=None):
    """Print, for every document, its k terms with the highest tf-idf score.

    tf_idf    -- 2-D array, rows are terms and columns are documents
    k         -- number of terms to print per document; values below 0 are
                 treated as 0 and values above the number of rows are capped
    doc_names -- mapping column index -> document name
                 (defaults to the notebook-level `index2document`)
    vocab     -- mapping word -> row index
                 (defaults to the notebook-level `word2index`)

    The terms of each document are listed in ascending score order, i.e. the
    highest-scoring term comes last. Nothing is returned.
    """
    doc_names = index2document if doc_names is None else doc_names
    vocab = word2index if vocab is None else vocab

    # Invert the vocabulary once instead of rebuilding two vocabulary-sized
    # lists for every single word that is looked up.
    index2word = {}
    for word, idx in vocab.items():
        index2word.setdefault(idx, word)

    # `[-0:]` would select the whole column and k > rows makes argpartition
    # raise, so clamp k to the valid range first.
    k = max(0, min(k, tf_idf.shape[0]))

    for i in range(tf_idf.shape[1]):
        column = tf_idf[:, i]
        if k == 0:
            index = []
        else:
            #get indices of k-maximum values in numpy column
            index = np.argpartition(column, -k)[-k:]
            index = index[np.argsort(column[index])]
        print('Most frequent words in: ', doc_names[i])
        #find frequent words with corresponding index
        print([index2word[int(ind)] for ind in index])
               

In [104]:
# Print, for every document, its 10 highest tf-idf terms (highest-scoring term last)
find_top_k_words(tf_idf, 10)

Most frequent words in:  BMW-AnnualReport-2017.json
['mio', 'motorräder', 'vorzugsaktien', 'fahrzeug', '€', 'group', 'fahrzeuge', 'mini', 'automobile', 'bmw']
Most frequent words in:  BVB-AnnualReport-2015.json
['haftend', 'mannschaft', 'league', 'iduna', 'uefa', 'saison', 'eur', 'borussia', 'teur', 'dortmund']
Most frequent words in:  BVB-AnnualReport-2016.json
['haftend', 'iduna', 'saison', 'bvb', 'fc', 'league', 'uefa', 'teur', 'borussia', 'dortmund']
Most frequent words in:  BVB-AnnualReport-2017.json
['sportlich', 'haftend', 'bvb', 'uefa', 'iduna', 'saison', 'eur', 'borussia', 'teur', 'dortmund']
Most frequent words in:  CarlZeissMeditec-AnnualReport-2017.json
['thompson', 'devices', '€', 'veracity', 'ophthalmic', 'patienten', 'tsd', 'meditec', 'carl', 'zeiss']
Most frequent words in:  CarlZeissMeditec-AnnualReport-2015.json
['geschäftseinheit', '€', 'diagnose', 'jena', 'oraya', 'vj', 'tsd', 'meditec', 'carl', 'zeiss']
Most frequent words in:  CarlZeissMeditec-AnnualReport-2016.js

#### Calculate the cosine similarity between u and v, i.e. one minus the cosine distance, which is defined as
$1 - \frac{u \cdot v}{||u||_2 ||v||_2}$ 

In [67]:
#defining the similarity function
from scipy import spatial
import numpy as np
def similarity(u,v):
    """Return the cosine similarity of u and v (1 minus their cosine distance).

    u, v -- array-likes with the same number of elements; any shape is accepted
            and flattened, so a (1, n) row vector can be compared with an (n,)
            vector (recent SciPy releases reject non-1-D input).

    If either vector is all zeros the cosine is undefined; 0.0 is returned in
    that case instead of NaN so that callers can still sort the results.
    """
    u = np.ravel(u)
    v = np.ravel(v)
    if not u.any() or not v.any():
        return 0.0
    return 1 - spatial.distance.cosine(u, v)

In [113]:
# preprocessing the query vector
q = 'bmw dortmund zeiss zeiss'
query = []   # row indices of the query words, one entry per occurrence
q_v = np.zeros(len(word2index))
for w in q.split():
    # NOTE(review): the query words are looked up verbatim, whereas the vocabulary
    # holds lower-cased lemmas - confirm whether the query should be lemmatized too.
    if w not in word2index:
        # a word that never occurred in the corpus cannot match any document;
        # skip it instead of aborting the cell with a KeyError
        print('ignoring query term not in vocabulary: ', w)
        continue
    q_v[word2index[w]]+=1
    query.append(word2index[w])
sum_ = np.sum(q_v)
def normalize_query(i,sum_):
    """Return the smoothed term weight 0.5 + 0.5 * i / sum_ (i may be an array)."""
    return 0.5+(0.5*i)/sum_
# Element-wise wrapper kept only so that code relying on the name keeps working;
# `normalize_query` already broadcasts over arrays.
norm_q = np.vectorize(normalize_query)
# NOTE(review): the 0.5 baseline is applied to every vocabulary entry, including
# words that are not part of the query - confirm that this is intended.
if sum_ > 0:
    q_v = normalize_query(q_v,sum_)
# else: no query word is known; q_v stays all zeros instead of becoming NaN (0/0)

# weighting the query by inverse document frequency
q_v = np.multiply(q_v,inv_doc_freq)

### Find most relevant documents according to the query

In [105]:
def find_top_k(k,doc_sim,index2document):
    """Print the names of the k documents with the highest similarity, best first.

    k              -- number of documents to print; k <= 0 prints nothing and a k
                      larger than the number of documents prints all of them
    doc_sim        -- mapping document index -> similarity score
    index2document -- mapping document index -> document name
    """
    ranked = sorted(doc_sim.items(), key=lambda item: item[1], reverse=True)
    # Slicing replaces the count-down loop, which never hit its `k == 0` exit
    # for k <= 0 and therefore printed every document.
    for doc, _ in ranked[:max(k, 0)]:
        print (index2document[doc])

#### q is mapped to the k-concept space
$\hat{q} = q^{T} U_k \Sigma_k^{-1}$

In [87]:
# latent semantic indexing (LSI)
from numpy import linalg
#query: indices (rows of the tf-idf matrix) of the words that appear in the query
#k: dimension of the latent space
def latent_semantic_indexing(query, tf_idf, k, topk, index2document):
    """Rank documents against a query in a k-dimensional latent space and print the best.

    query          -- iterable of row indices of tf_idf, one entry per occurrence
                      of a query word
    tf_idf         -- 2-D array, rows are terms and columns are documents
    k              -- requested number of latent dimensions; capped at the
                      numerical rank of tf_idf
    topk           -- number of document names to print
    index2document -- mapping column index -> document name

    The query is folded into the latent space as q^T U_k Sigma_k^{-1} and
    compared by cosine similarity with each document's latent vector. The
    result is printed through find_top_k; nothing is returned.

    Raises ValueError if no latent dimension is usable (k < 1 or an all-zero matrix).
    """
    # Thin SVD: only min(terms, documents) singular vectors can ever be used, so
    # the full terms x terms U matrix is never materialised.
    u, s, vt = linalg.svd(tf_idf, full_matrices=False)

    # Dividing by a (numerically) zero singular value would blow up, so cap k at
    # the numerical rank, using the same tolerance rule as numpy.linalg.matrix_rank.
    if s.size and s[0] > 0:
        tol = s[0] * max(tf_idf.shape) * np.finfo(s.dtype).eps
        usable = int(np.count_nonzero(s > tol))
    else:
        usable = 0
    k = min(k, usable)
    if k < 1:
        raise ValueError('k must be at least 1 and tf_idf must have non-zero rank')

    u_k = u[:, :k]
    s_k = s[:k]
    doc_vectors = vt[:k, :]   # column i is document i in the latent space

    # raw term counts of the query, sized from the matrix that is actually passed in
    q_v = np.zeros(tf_idf.shape[0])
    for term in query:
        q_v[term] += 1

    # Sigma_k is diagonal, so multiplying by its inverse is an element-wise
    # division; q_k stays 1-D, which is what the cosine computation expects.
    q_k = np.matmul(q_v, u_k) / s_k

    doc_similarity = {}
    for i in range(doc_vectors.shape[1]):
        doc_similarity[i] = similarity(q_k, doc_vectors[:, i])
    find_top_k(topk, doc_similarity, index2document)

In [93]:
# Rank the documents against `query` in a 3-dimensional latent space and print the 5 best matches
latent_semantic_indexing(query, tf_idf, 3, 5, index2document)

CarlZeissMeditec-AnnualReport-2017.json
CarlZeissMeditec-AnnualReport-2015.json
CarlZeissMeditec-AnnualReport-2016.json
BMW-AnnualReport-2015.json
BMW-AnnualReport-2017.json
