In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ufal.morphodita import *

In [7]:
# Load the cleaned articles (pipe-separated file) and give the columns
# explicit names.
articles_path = '../clustering/articles_cleaned.csv'
column_names = ['id','query','url', 'headline', 'paragraphs']

data = pd.read_csv(articles_path, sep='|')
data.columns = column_names

# Missing article bodies become empty strings so later text processing
# always receives a str.
data = data.assign(paragraphs=data['paragraphs'].fillna(''))

In [8]:
# Drop articles whose body consists of exactly one non-breaking space.
# The explicit .copy() makes the result an independent frame, so columns
# added to `data` afterwards do not trigger pandas' SettingWithCopyWarning.
data = data[data['paragraphs'] != '\xa0'].copy()

In [10]:
# Placeholder 'lemmas' column: one empty list per article.
# Each row gets its OWN list; `n * [[]]` would put the very same list object
# in every row, so mutating one cell would silently change all of them.
lemma_column = [[] for _ in range(data.shape[0])]
data['lemmas'] = lemma_column

In [14]:
# TF-IDF vectorizer over unigrams only.
tfidf = TfidfVectorizer(ngram_range=(1,1))

# Load the MorphoDiTa tagger model. Tagger.load signals failure by returning
# a null tagger instead of raising, so check it here and fail with a clear
# message rather than with an AttributeError in a later cell.
tagger_path = '../../morphodita/czech-morfflex-pdt-161115.tagger'
tagger = Tagger.load(tagger_path)
if not tagger:
    raise RuntimeError("Cannot load MorphoDiTa tagger from '%s'" % tagger_path)

# Reusable container that the tokenizer fills with the word forms of one sentence.
forms = Forms()

In [34]:
from string import punctuation
import re

In [37]:
# Lemmatize every article body and store the result as one space-separated
# string per article in data['lemmas'].
#
# The lemma strings are collected in a list and assigned as a whole column
# afterwards: writing to the `row` yielded by DataFrame.iterrows() is not
# guaranteed to reach the DataFrame, because the row may be a copy.

# The tokenizer and the output containers are reusable across articles, so
# they are created once instead of once per row.
tokenizer = tagger.newTokenizer()
if tokenizer is None:
    raise RuntimeError('No tokenizer is defined for the loaded tagger model')
lemmas = TaggedLemmas()
tokens = TokenRanges()

# re.match anchors at the start, so this matches lemmas that BEGIN with a digit.
starts_with_digits = re.compile("[0-9]+")

lemma_texts = []
for paragraph in data['paragraphs']:
    tokenizer.setText(paragraph)
    allLemmas = []
    # Tag sentence by sentence; `forms` and `tokens` are refilled on every call.
    while tokenizer.nextSentence(forms, tokens):
        tagger.tag(forms, lemmas)
        allLemmas.extend([l.lemma for l in lemmas])
    # Drop lemmas that are substrings of string.punctuation (single punctuation
    # marks, and also the empty string) and lemmas that start with a digit.
    lemma_texts.append(
        ' '.join(
            l for l in allLemmas
            if l not in punctuation and not starts_with_digits.match(l)
        )
    )

# Same order as the rows of `data`, so a plain positional assignment is correct.
data['lemmas'] = lemma_texts

In [42]:
# Keep only the articles whose lemma string is non-empty.
has_lemmas = data['lemmas'].ne('')
data = data.loc[has_lemmas]

In [44]:
# data['lemmas']

In [45]:
# Learn the vocabulary and IDF weights from the lemma strings and turn them
# into a document-term matrix (one row per article) in a single step.
lemma_corpus = data['lemmas']
tfidf_matrix = tfidf.fit_transform(lemma_corpus)

In [46]:
# Pairwise dot products between all article vectors. With a single argument
# linear_kernel compares the matrix against itself.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (TfidfVectorizer's default `norm`) - confirm if that changes.
cosine_sim = linear_kernel(tfidf_matrix)

In [47]:
# Lookup table: article URL -> index label of that article in `data`.
# NOTE(review): these are index LABELS. Rows were filtered out of `data`
# without resetting the index, so labels differ from row positions, while
# cosine_sim is addressed by position. This Series is not used in the cells
# visible here - verify before indexing cosine_sim with its values.
indices = pd.Series(data=data.index, index=data['url'])

In [48]:
# Row positions (not index labels) of the seed articles; they are used below
# to select rows of cosine_sim.
indexes = [1, 3, 5, 7, 8]

In [49]:
# One single-column frame per seed article, holding that article's similarity
# to every article; the frames are then placed side by side.
seed_rows = cosine_sim[indexes]
dfs = [pd.DataFrame(scores) for scores in seed_rows]
sim_scores = pd.concat(dfs, axis=1)

In [50]:
# Row-wise mean across the seed columns: one averaged similarity per article.
average_scores = sim_scores.mean(axis='columns')

In [51]:
# average_scores is a Series (the result of DataFrame.mean), so it has no
# columns; assigning to `.columns` would only attach a stray attribute.
# Label the values by naming the Series instead.
average_scores.name = 'score'

In [52]:
# Order the articles from the highest averaged similarity to the lowest.
average_scores = average_scores.sort_values(ascending=False)

In [53]:
# Row positions of the 10 articles with the highest averaged similarity.
# NOTE(review): the seed articles are not excluded, so they can appear in
# this top 10 themselves - confirm whether that is intended.
top_scores = average_scores.nlargest(n=10)
article_indices = top_scores.index

In [54]:
# Show the selected row positions.
article_indices

Int64Index([8, 3, 5, 10, 1, 12, 14, 7, 15, 3873], dtype='int64')

In [55]:
# Look the selected articles up by POSITION (iloc) and show their URLs; the
# labels printed on the left are the original index labels of `data`.
recommended_urls = data['url'].iloc[article_indices]
recommended_urls

8       https://eurozpravy.cz/pocasi/pocasi-v-cr/23965...
3       https://eurozpravy.cz/pocasi/pocasi-v-cr/23974...
5       https://eurozpravy.cz/pocasi/pocasi-v-cr/23935...
11      https://eurozpravy.cz/pocasi/pocasi-v-cr/23942...
1       https://eurozpravy.cz/pocasi/pocasi-v-cr/23958...
14      https://eurozpravy.cz/pocasi/pocasi-v-cr/23928...
16      https://moneymag.cz/aktuality/10769-predpoved-...
7       https://globe24.cz/pocasi/63694-predpoved-poca...
17      https://globe24.cz/pocasi/63721-predpoved-poca...
4132    http://tn.nova.cz/clanek/martin-na-bilem-koni-...
Name: url, dtype: object