## This notebook is for following along (and modifying and testing) the code as shown in the book "Blueprints for Text Analytics Using Python"

## Chapter 5

In [1]:
sentences = ["It was the best of times", "it was the worst of times", "it was the age of wisdom", 
            "it was the age of foolishness"]

In [2]:
tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

In [3]:
tokenized_sentences

[['It', 'was', 'the', 'best', 'of', 'times'],
 ['it', 'was', 'the', 'worst', 'of', 'times'],
 ['it', 'was', 'the', 'age', 'of', 'wisdom'],
 ['it', 'was', 'the', 'age', 'of', 'foolishness']]

In [4]:
vocabulary = set([w for s in tokenized_sentences for w in s])

In [5]:
vocabulary

{'It',
 'age',
 'best',
 'foolishness',
 'it',
 'of',
 'the',
 'times',
 'was',
 'wisdom',
 'worst'}

In [12]:
import pandas as pd

In [13]:
pd.DataFrame([[w, i] for i, w in enumerate(vocabulary)])

Unnamed: 0,0,1
0,age,0
1,times,1
2,it,2
3,the,3
4,best,4
5,was,5
6,of,6
7,worst,7
8,foolishness,8
9,It,9


In [14]:
# Time to now do one-hot encoding

In [19]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

In [20]:
onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

In [21]:
onehot

[[0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0],
 [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0],
 [1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1],
 [1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0]]

In [22]:
for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0]: It was the best of times
[0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0]: it was the worst of times
[1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1]: it was the age of wisdom
[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0]: it was the age of foolishness


In [23]:
# What if introduce a document not in here?

In [26]:
onehot_encode("the age of wisdom is the best of times".split()), "the age of wisdom is the best of times"

([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1], 'the age of wisdom is the best of times')

In [27]:
# if no word in the vocabulary is shown
onehot_encode("Is the football season now maybe".split()), "Is the football season now maybe"

([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], 'Is the football season now maybe')

### Document-Term Matrix

In [28]:
pd.DataFrame(onehot, columns=vocabulary)

Unnamed: 0,age,times,it,the,best,was,of,worst,foolishness,It,wisdom
0,0,1,0,1,1,1,1,0,0,1,0
1,0,1,1,1,0,1,1,1,0,0,0
2,1,0,1,1,0,1,1,0,0,0,1
3,1,0,1,1,0,1,1,0,1,0,0


In [34]:
# First two sentences
sim = [onehot[0][i] & onehot[1][i] for i in range(0, len(vocabulary))]
print(sim)

[0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0]


In [35]:
print(sum(sim))

4


In [37]:
# Scalar dot product way
import numpy as np
np.dot(onehot[0], onehot[1])

4

In [39]:
# Similarity Matrix 
np.dot(onehot, np.transpose(onehot)), sentences

(array([[6, 4, 3, 3],
        [4, 6, 4, 4],
        [3, 4, 6, 5],
        [3, 4, 5, 6]]),
 ['It was the best of times',
  'it was the worst of times',
  'it was the age of wisdom',
  'it was the age of foolishness'])

In [41]:
# Diagnoal above is itself which you can see by:
np.dot(onehot[0], onehot[0])

6

In [43]:
# Doing this encoding in scikit-learn with multiple words like here
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()
lb.fit([vocabulary])


MultiLabelBinarizer()

In [46]:
lb.transform(tokenized_sentences)

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

## Bag of Words Model

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [48]:
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                             "Mary also likes to watch football games."]

In [49]:
cv.fit(more_sentences)

CountVectorizer()

In [55]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
               encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1,
               ngram_range=(1,1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
               tokenizer=None, vocabulary=None)

CountVectorizer()

In [56]:
print(cv.get_feature_names())

['age', 'also', 'best', 'foolishness', 'football', 'games', 'it', 'john', 'likes', 'mary', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']


In [57]:
# Transforming documents to vectors
dt = cv.transform(more_sentences)

In [58]:
dt

<6x20 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [59]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [60]:
# To calculate similarities, dot product no longer works when length varies
# Euclidean distance fails as dimensions increase

from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [61]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.833333,0.666667,0.666667,0.0,0.0
1,0.833333,1.0,0.666667,0.666667,0.0,0.0
2,0.666667,0.666667,1.0,0.833333,0.0,0.0
3,0.666667,0.666667,0.833333,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.524142
5,0.0,0.0,0.0,0.0,0.524142,1.0


## TF-IDF

In [63]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)


In [65]:
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names()) # notice scaled down for 'it' but not for 'wisdom'

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305609,0.501208,0.250604,0.611219,0.0,0.0,0.0,0.250604,0.305609,0.0,0.250604,0.0,0.0
5,0.0,0.419233,0.0,0.0,0.419233,0.419233,0.0,0.0,0.343777,0.343777,0.0,0.0,0.0,0.0,0.343777,0.0,0.0,0.343777,0.0,0.0


In [67]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43076
5,0.0,0.0,0.0,0.0,0.43076,1.0


In [68]:
# Download ABC Dataset from Kaggle -- https://www.kaggle.com/therohk/million-headlines/data

In [107]:
headlines = pd.read_csv('abcnews-date-text.csv', parse_dates=['publish_date'])
print(len(headlines))
headlines.head()

1226258


Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [70]:
!head abcnews-date-text.csv

publish_date,headline_text
20030219,aba decides against community broadcasting licence
20030219,act fire witnesses must be aware of defamation
20030219,a g calls for infrastructure protection summit
20030219,air nz staff in aust strike for pay rise
20030219,air nz strike to affect australian travellers
20030219,ambitious olsson wins triple jump
20030219,antic delighted with record breaking barca
20030219,aussie qualifier stosur wastes four memphis match
20030219,aust addresses un security council over iraq


In [71]:
# Now, create tfidf vector
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [72]:
dt

<1226258x104691 sparse matrix of type '<class 'numpy.float64'>'
	with 7933451 stored elements in Compressed Sparse Row format>

In [73]:
%%time
cosine_similarity(dt[0:1000], dt[0:1000]) # doing for 1000 since that'll run faster than 10k in the book

CPU times: user 5.56 ms, sys: 3.28 ms, total: 8.84 ms
Wall time: 50 ms


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [79]:
# run if not installed already
# !pip install spacy

In [80]:
# Stop words
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
print(len(stopwords))
tfidf = TfidfVectorizer(stop_words=stopwords)

326


In [81]:
dt = tfidf.fit_transform(headlines["headline_text"])
dt



<1226258x104411 sparse matrix of type '<class 'numpy.float64'>'
	with 6394835 stored elements in Compressed Sparse Row format>

In [83]:
# % reduced
(7933451-6394853)/(7933451.0)*100, "%"

(19.39380478936594, '%')

In [84]:
# Words have to appear at least twice
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1226258x63385 sparse matrix of type '<class 'numpy.float64'>'
	with 6353809 stored elements in Compressed Sparse Row format>

In [85]:
# You can also filter via word appearing certain %
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=0.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1226258x6921 sparse matrix of type '<class 'numpy.float64'>'
	with 5447786 stored elements in Compressed Sparse Row format>

In [132]:
# Also eliminating words that appear too frequently -- like 10%
tfidf = TfidfVectorizer(stop_words=stopwords, max_df=0.1)
dt = tfidf.fit_transform(headlines['headline_text'])
dt

<1226258x104411 sparse matrix of type '<class 'numpy.float64'>'
	with 6394835 stored elements in Compressed Sparse Row format>

In [102]:
# Run this if below cell doesn't work
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [101]:
# Linguistic analysis
import spacy

nlp = spacy.load("en_core_web_sm")
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

In [137]:
headlines_5000 = headlines[0:5000]
headlines_5000.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [138]:
# This takes extremely long so doing for only 5000
for i, row in headlines_5000.iterrows():
    doc = nlp(str(row["headline_text"]))
    headlines_5000.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
    headlines_5000.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [141]:
headlines_5000

Unnamed: 0,publish_date,headline_text,lemmas,nav
0,2003-02-19,aba decides against community broadcasting lic...,aba decide against community broadcasting licence,aba decide community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation,act fire witness must be aware of defamation,act fire witness be aware defamation
2,2003-02-19,a g calls for infrastructure protection summit,a g call for infrastructure protection summit,g call infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise,air nz staff in aust strike for pay rise,air nz staff aust strike pay rise
4,2003-02-19,air nz strike to affect australian travellers,air nz strike to affect australian traveller,air nz strike affect australian traveller
...,...,...,...,...
4995,2003-03-14,slater stars for blues on day one,slater star for blue on day one,slater star blue day
4996,2003-03-14,sopranos filming delayed over contract dispute,sopranos filming delay over contract dispute,sopranos filming delay contract dispute
4997,2003-03-14,souris outlines regional roads funding,souris outline regional road fund,souris outline regional road fund
4998,2003-03-14,south east water licensees to pay levy,south east water licensees to pay levy,south east water licensees pay levy


In [143]:
tfidf_5000 = TfidfVectorizer(stop_words=stopwords)
dt_5000 = tfidf_5000.fit_transform(headlines_5000["lemmas"].map(str))
dt_5000



<5000x5569 sparse matrix of type '<class 'numpy.float64'>'
	with 24853 stored elements in Compressed Sparse Row format>

In [144]:
pd.DataFrame(dt_5000.toarray(), columns=tfidf_5000.get_feature_names())

Unnamed: 0,10,100,10000,100000,100th,1035,106,108,11,110,...,zanetti,zealand,zellweger,ziege,zim,zimbabwe,zimbabwean,zimmerman,zone,zoning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
tfidf_5000 = TfidfVectorizer(stop_words=stopwords)
dt_5000 = tfidf_5000.fit_transform(headlines_5000["nav"].map(str))
dt_5000



<5000x5435 sparse matrix of type '<class 'numpy.float64'>'
	with 24321 stored elements in Compressed Sparse Row format>

In [146]:
top_10000 = pd.read_csv("https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt", header=None)

In [154]:
tfidf_5000 = TfidfVectorizer(stop_words=set(top_10000.iloc[:,0].values))
dt_5000 = tfidf_5000.fit_transform(headlines_5000["nav"].map(str))
dt_5000

<5000x2692 sparse matrix of type '<class 'numpy.float64'>'
	with 5722 stored elements in Compressed Sparse Row format>

In [155]:
# % change
(24321-5722)/(24321.0)*100, "%"

(76.47300686649398, '%')

In [158]:
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

(1226258, 629481)
76644720


In [159]:
# Tri-gram
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

(1226258, 845569)
82521160


In [161]:
tfidf_5000 = TfidfVectorizer(ngram_range=(1,2), stop_words=set(top_10000.iloc[:,0].values))
dt_5000 = tfidf_5000.fit_transform(headlines_5000["nav"].map(str))
dt_5000

<5000x4697 sparse matrix of type '<class 'numpy.float64'>'
	with 7829 stored elements in Compressed Sparse Row format>

## Syntactic Similarity

In [162]:
stopwords.add("test") # test headlines
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2,
                       norm='l2')

In [163]:
dt = tfidf.fit_transform(headlines["headline_text"])



In [164]:
made_up = tfidf.transform(["austrailia and new zealand discuss optimal apple size"])

# cosine similarity
sim = cosine_similarity(made_up, dt)

In [165]:
headlines.iloc[np.argmax(sim)]

publish_date     2015-06-04 00:00:00
headline_text       new zealand wool
Name: 956081, dtype: object

In [166]:
made_up_2 = tfidf.transform(["Gas prices rising and unemployment soars high"])

sim_2 = cosine_similarity(made_up_2, dt)
headlines.iloc[np.argmax(sim_2)]

publish_date       2008-10-16 00:00:00
headline_text    uk unemployment soars
Name: 417655, dtype: object

In [170]:
%%time
# Finding similar headlines
np.dot(dt[0:10000], np.transpose(dt[0:10000]))

CPU times: user 29 ms, sys: 19.9 ms, total: 48.9 ms
Wall time: 47.7 ms


<10000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1818430 stored elements in Compressed Sparse Row format>

In [171]:
dt.shape[0]

1226258

In [172]:
%%time
batch = 10000
max_sim = 0.0
max_a = None
max_b = None

for a in range(0, dt.shape[0], batch):
    for b in range(0, a+batch, batch):
        r = np.dot(dt[a:a+batch],np.transpose(dt[b:b+batch]))
        # eliminate identical vectors by setting their similarity to np.nan which gets sorted out
        r[r > 0.9999] = np.nan
        sim = r.max()
        if sim > max_sim:
            # argmax returns a single value which we have to map to the two dimensions
            (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)
            # adjust offsets in ocrpus (this is a submatrix)
            max_a += a
            max_b += b
            max_sim = sim

CPU times: user 9min 11s, sys: 14.2 s, total: 9min 26s
Wall time: 9min 26s


In [173]:
print(headlines.iloc[max_a])
print(headlines.iloc[max_b])

publish_date                                2014-09-18 00:00:00
headline_text    vline fails to meet punctuality targets report
Name: 903760, dtype: object
publish_date                         2008-02-15 00:00:00
headline_text    vline fails to meet punctuality targets
Name: 364007, dtype: object


In [174]:
tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)
dt_word = tfidf_word.fit_transform(headlines["headline_text"])

In [177]:
# for vocabulary similarity
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r,0)

In [178]:
r

array([[0.        , 0.00554026, 0.00074709, ..., 0.0013774 , 0.00181772,
        0.00531874],
       [0.00554026, 0.        , 0.00068462, ..., 0.00113481, 0.00167518,
        0.0046172 ],
       [0.00074709, 0.00068462, 0.        , ..., 0.00036739, 0.        ,
        0.00209499],
       ...,
       [0.0013774 , 0.00113481, 0.00036739, ..., 0.        , 0.00347424,
        0.        ],
       [0.00181772, 0.00167518, 0.        , ..., 0.00347424, 0.        ,
        0.        ],
       [0.00531874, 0.0046172 , 0.00209499, ..., 0.        , 0.        ,
        0.        ]])

In [179]:
voc = tfidf_word.get_feature_names()
size = r.shape[0] # quadratic
size

1281

In [181]:
voc

['10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '18',
 '19',
 '20',
 '2013',
 '2014',
 '2015',
 '2016',
 '30',
 '50',
 'abbott',
 'abc',
 'aboriginal',
 'abuse',
 'accc',
 'access',
 'accident',
 'accused',
 'act',
 'action',
 'address',
 'adelaide',
 'admits',
 'afghan',
 'afghanistan',
 'afl',
 'africa',
 'aged',
 'agreement',
 'ahead',
 'aid',
 'aims',
 'air',
 'airport',
 'al',
 'alcohol',
 'alert',
 'alice',
 'allegations',
 'alleged',
 'alp',
 'ambulance',
 'amid',
 'analysis',
 'andrew',
 'anger',
 'angry',
 'animal',
 'anniversary',
 'announces',
 'anti',
 'anzac',
 'apologises',
 'appeal',
 'approval',
 'april',
 'armed',
 'army',
 'arrest',
 'arrested',
 'arrests',
 'art',
 'asbestos',
 'ashes',
 'asia',
 'asian',
 'asked',
 'assault',
 'asylum',
 'attack',
 'attacks',
 'attempted',
 'august',
 'aussie',
 'aussies',
 'aust',
 'australia',
 'australian',
 'australians',
 'australias',
 'authorities',
 'award',
 'awards',
 'away',
 'baby',
 'backs',
 'bad',
 'baghdad'

In [182]:
for index in np.argsort(r.flatten())[::-1][0:40]:
    a = int(index/size)
    b = index%size
    if a > b: # avoid repetitions
        print('"%s" related to "%s"' % (voc[a], voc[b]))

"kong" related to "hong"
"sri" related to "lanka"
"covid" related to "19"
"seekers" related to "asylum"
"springs" related to "alice"
"trump" related to "donald"
"hour" related to "country"
"pleads" related to "guilty"
"hill" related to "broken"
"vs" related to "summary"
"violence" related to "domestic"
"climate" related to "change"
"royal" related to "commission"
"care" related to "aged"
"driving" related to "drink"
"gold" related to "coast"
"wall" related to "street"
"mental" related to "health"
"scott" related to "morrison"
"north" related to "korea"
