In [1]:
# NLP exercise: build bags-of-words and term frequencies, then measure the similarity of texts through their word vectors.
# The corpus is four manually collected articles: three short Daily Mail articles about the Duchess of Cambridge and her recent activities,
# and one unrelated article on coronavirus deaths.
# Expected: high topic similarity among the first three texts and low similarity between the fourth and the others.

# The result down below is:

# Same topic:
# ------------------
# Similarity of texts (1, 2) is 0.4896257193424013
# Similarity of texts (1, 3) is 0.5677055366699043
# Similarity of texts (2, 3) is 0.6053098409985186

# Between topics
# ------------------
# Similarity of texts (1, 4) is 0.11484741046963563
# Similarity of texts (2, 4) is 0.0969307223278006
# Similarity of texts (3, 4) is 0.13043618289729517


In [None]:
import itertools
import math
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
# Download the NLTK 'stopwords' resource; the stop-word list is read later via nltk.corpus.stopwords
nltk.download('stopwords')

In [52]:
# Read a text file containing the articles. Every article starts with a line
# beginning with "#" (followed by a link to that article); the lines up to the
# next "#" line form the article body.

texts = list()
# None until the first "#" header has been seen. A dedicated name is used so the
# imported `string` module is not shadowed by the accumulator.
article_lines = None

with open('texts.txt', 'r') as text_file:
    for line in text_file:
        if line.startswith("#"):
            # A header closes the article collected so far (if any) and opens a new one.
            if article_lines is not None:
                texts.append(''.join(article_lines))
            article_lines = []
        elif article_lines is not None:
            article_lines.append(line)
        # Lines before the first header belong to no article and are skipped.

# Flush the last article; nothing is appended when the file had no header at all.
if article_lines is not None:
    texts.append(''.join(article_lines))

assert len(texts) != 0, "no articles found in texts.txt"

In [39]:
# Split every article into a list of Treebank tokens

tokenizer = TreebankWordTokenizer()
tokens = [tokenizer.tokenize(article) for article in texts]

# one token list per article, and none of them may be empty
assert len(tokens) == len(texts)
assert all(len(doc_tokens) != 0 for doc_tokens in tokens)
    

In [40]:
# Keep only purely alphabetic tokens (dropping numbers and punctuation) and lower-case them

clean_tokens = [
    # isalpha() is only true when every character is a letter, so a kept token
    # can never carry an apostrophe; no extra quote stripping is needed.
    [token.lower() for token in doc_tokens if token.isalpha()]
    for doc_tokens in tokens
]

# one cleaned list per text, and cleaning must not have emptied any of them
assert len(clean_tokens) == len(texts)
assert all(len(doc_tokens) != 0 for doc_tokens in clean_tokens)
            

In [42]:
# Remove English stop words from the cleaned tokens

stop_words = nltk.corpus.stopwords.words('english')
stop_word_set = set(stop_words)  # set membership is constant-time, unlike scanning the list

clean_no_stop_tokens = [
    [token for token in doc_tokens if token not in stop_word_set]
    for doc_tokens in clean_tokens
]

# The filtering must yield exactly one list per input document (the previous
# check compared the number of documents with the token count of the last
# document, which says nothing about the result) ...
assert len(clean_no_stop_tokens) == len(clean_tokens)
# ... and no document may end up empty after stop-word removal.
assert all(len(doc_tokens) != 0 for doc_tokens in clean_no_stop_tokens)

In [43]:
# Random eye-check: display the first 20 cleaned, stop-word-free tokens of the first text

clean_no_stop_tokens[0][:20]

['giovanna',
 'fletcher',
 'revealed',
 'duchess',
 'cambridge',
 'easy',
 'like',
 'mum',
 'met',
 'motherhood',
 'kate',
 'middleton',
 'appeared',
 'special',
 'episode',
 'giovanna',
 'happy',
 'mum',
 'happy',
 'baby']

In [44]:
# Build one bag-of-words dict (term -> number of occurrences) per text

text_counts = [dict(Counter(doc_tokens)) for doc_tokens in clean_no_stop_tokens]

In [45]:
# Show the ten most frequent terms of each "bag", one line per text

for doc_index, _ in enumerate(texts):
    top_terms = Counter(clean_no_stop_tokens[doc_index]).most_common(10)
    print(top_terms)
    print()


[('giovanna', 6), ('kate', 6), ('early', 6), ('said', 6), ('episode', 4), ('podcast', 4), ('years', 4), ('children', 4), ('time', 4), ('us', 4)]

[('survey', 10), ('early', 8), ('james', 7), ('years', 7), ('duchess', 7), ('kate', 5), ('instagram', 5), ('big', 5), ('questions', 5), ('happy', 5)]

[('podcast', 6), ('early', 6), ('kate', 5), ('years', 5), ('work', 5), ('said', 5), ('royal', 4), ('source', 4), ('duchess', 4), ('deeply', 3)]

[('people', 14), ('said', 11), ('cases', 11), ('two', 9), ('italy', 9), ('health', 9), ('virus', 9), ('lombardy', 8), ('towns', 8), ('tested', 8)]



In [46]:
# Build the corpus dictionary: every distinct term, in order of first appearance

dictionary = list()
# Track already-collected terms in a set so each membership test is constant-time
# instead of a scan over the growing list.
seen_terms = set()

for counts in text_counts:
    for term in counts:
        if term not in seen_terms:
            seen_terms.add(term)
            dictionary.append(term)

dict_len = len(dictionary)

In [47]:
# Turn raw counts into term frequencies: each term's count divided by the
# number of cleaned, stop-word-free tokens in its text

document_vector_lens = list()
document_vectors = list()

for doc_index, counts in enumerate(text_counts):
    total_tokens = len(clean_no_stop_tokens[doc_index])
    document_vector_lens.append(total_tokens)
    # a text that has counted terms must have a non-zero token total
    assert not counts or total_tokens != 0
    document_vectors.append(
        {term: (count / total_tokens) for term, count in counts.items()}
    )


In [48]:
# Create the whole-corpus table: one row per text, one column per term,
# with 0 where a term does not occur in a text

# Row labels are generated from the number of texts instead of being hard-coded
# for exactly four, so the cell keeps working if the corpus size changes.
row_labels = ['text_{}'.format(number) for number in range(1, len(document_vectors) + 1)]
vectors = pd.DataFrame(document_vectors, index=row_labels).fillna(0)
vectors.head()


Unnamed: 0,abandon,able,abroad,according,across,activities,activity,actually,added,adhanom,...,works,world,worrying,would,written,wrote,year,years,youngest,zaia
text_1,0.0,0.003425,0.0,0.0,0.0,0.0,0.0,0.003425,0.0,0.0,...,0.0,0.0,0.0,0.003425,0.0,0.0,0.0,0.013699,0.003425,0.0
text_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003067,0.0,...,0.0,0.0,0.0,0.0,0.003067,0.003067,0.0,0.021472,0.003067,0.0
text_3,0.004098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004098,0.0,0.004098,0.0,0.004098,0.020492,0.004098,0.0
text_4,0.0,0.0,0.001513,0.004539,0.001513,0.003026,0.001513,0.0,0.0,0.001513,...,0.001513,0.001513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001513


In [49]:
# Cosine similarity = (v1 . v2) / (|v1| |v2|); first compute the magnitude |v| of every text vector

magnitudes = [
    math.sqrt((vectors.iloc[row] ** 2).sum())
    for row in range(len(vectors))
]

# Comparing the whole list with 0 (`magnitudes != 0`) is always true, so it
# checked nothing. Test every magnitude individually: a zero vector would
# otherwise cause a division by zero when the similarities are computed.
assert all(magnitude != 0 for magnitude in magnitudes)


In [50]:
# Cosine similarity for every unordered pair of texts

dot_products = list()
similarities = list()

# Derive the index pairs from the number of rows rather than hard-coding four texts
comb = list(itertools.combinations(range(len(vectors)), 2))

term_matrix = vectors.values  # fetch the underlying array once, not on every pair

for i, j in comb:
    # np.dot computes the vector dot product directly, replacing the builtin sum
    # over an element-wise product
    dot_i_j = np.dot(term_matrix[i], term_matrix[j])
    dot_products.append(dot_i_j)
    similarities.append(dot_i_j / (magnitudes[i] * magnitudes[j]))

# Report the pairs with 1-based text numbers
for (i, j), similarity in zip(comb, similarities):
    print("Similarity of texts {} is {}".format((i + 1, j + 1), similarity))
    

Similarity of texts (1, 2) is 0.4896257193424013
Similarity of texts (1, 3) is 0.5677055366699043
Similarity of texts (1, 4) is 0.11484741046963563
Similarity of texts (2, 3) is 0.6053098409985186
Similarity of texts (2, 4) is 0.0969307223278006
Similarity of texts (3, 4) is 0.13043618289729517
