In [52]:
from collections import Counter

import numpy as np

from nltk import word_tokenize, sent_tokenize
import nltk
stop_words = nltk.corpus.stopwords.words('english')

from urllib import request
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.stem import WordNetLemmatizer

In [46]:
# original content to handle with import numpy as np
sentence1  = "Oslo is the economic and governmental centre of Norway. The city is also a hub of Norwegian trade, banking, industry and shipping. It is an important centre for maritime industries and maritime trade in Europe. The city is home to many companies within the maritime sector, some of which are among the world's largest shipping companies, shipbrokers and maritime insurance brokers. Oslo is a pilot city of the Council of Europe and the European Commission intercultural cities programme."
sentence2 = "Natural language processing (NLP) is a subfield of " \
            "linguistics, computer science, and artificial intelligence " \
            "concerned with the interactions between computers and " \
            "human language, in particular how to program computers " \
            "to process and analyze large amounts of natural language data." \
            " The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation."
sentence3 = "Trondheim has a very mild climate for its northerly latitude, resulting in moderate summers and winters that often remain above the freezing point in seaside areas. On higher elevation, though, the microclimate is colder and snowier. The city functions as the seat of the County Mayor of Trøndelag county, but not as the administrative centre, which is Steinkjer. This is to make the county more efficient and not too centralized, as Trøndelag is the third largest county in Norway."

corpusList = [sentence1,sentence2,sentence3]

In [75]:
def genCorpus(corpusList: list):
    corpus = ''.join(corpusList)
    return corpus


def tokenText(text):
    token = word_tokenize(text)
    token = [token.lower() for token in token]
    token = [token.strip() for token in token]
    token = [word for word in token if word not in stop_words]
    token = [word for word in token if word.isalnum()]
    lemmatizer = WordNetLemmatizer()
    token = [lemmatizer.lemmatize(word, pos ="a") for word in token]
    return token

def genBow(token):
    bow = Counter(token)
    return bow


def genTfVec(bow,query_str):
    query_frequency = Counter(query_str)
    # num_unique_words = len(bow)
    # tf = times_sentence1_appears # / num_unique_words
    # print(tf)
    #%%
    tf_vec = []
    for word in bow:
        if word in query_str:
            tf = query_frequency[word] #/ len(bow)
        else:
            tf=0
        tf_vec.append(tf)

    return tf_vec


In [76]:
corpus = genCorpus(corpusList)
tokens_full = tokenText(''.join(corpus))

print(tokens_full)
print(len(tokens_full))


['oslo', 'economic', 'governmental', 'centre', 'norway', 'city', 'also', 'hub', 'norwegian', 'trade', 'banking', 'industry', 'shipping', 'important', 'centre', 'maritime', 'industries', 'maritime', 'trade', 'europe', 'city', 'home', 'many', 'companies', 'within', 'maritime', 'sector', 'among', 'world', 'large', 'shipping', 'companies', 'shipbrokers', 'maritime', 'insurance', 'brokers', 'oslo', 'pilot', 'city', 'council', 'europe', 'european', 'commission', 'intercultural', 'cities', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'language', 'particular', 'program', 'computers', 'process', 'analyze', 'large', 'amounts', 'natural', 'language', 'data', 'goal', 'computer', 'capable', 'understanding', 'contents', 'documents', 'including', 'contextual', 'nuances', 'language', 'within', 'technology', 'accurately', 'extract', 'information', 'insights', 'contained', 'documents', 

In [77]:
bow = genBow(tokens_full)
bow.most_common(10)

[('language', 7),
 ('city', 4),
 ('maritime', 4),
 ('natural', 4),
 ('county', 4),
 ('centre', 3),
 ('large', 3),
 ('documents', 3),
 ('oslo', 2),
 ('norway', 2)]

In [80]:
tokens_sent1 = tokenText(sentence1)
print(tokens_sent1)

query = "Oslo is the centre of Norway."
query_token = tokenText(query)
print(query_token)

['oslo', 'economic', 'governmental', 'centre', 'norway', 'city', 'also', 'hub', 'norwegian', 'trade', 'banking', 'industry', 'shipping', 'important', 'centre', 'maritime', 'industries', 'maritime', 'trade', 'europe', 'city', 'home', 'many', 'companies', 'within', 'maritime', 'sector', 'among', 'world', 'large', 'shipping', 'companies', 'shipbrokers', 'maritime', 'insurance', 'brokers', 'oslo', 'pilot', 'city', 'council', 'europe', 'european', 'commission', 'intercultural', 'cities', 'programme']
['oslo', 'centre', 'norway']


In [81]:
tf_vec = genTfVec(bow,query_token)
print(tf_vec)
print(bow.most_common(10))


[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[('language', 7), ('city', 4), ('maritime', 4), ('natural', 4), ('county', 4), ('centre', 3), ('large', 3), ('documents', 3), ('oslo', 2), ('norway', 2)]


In [82]:
query_str = 'Oslo language. It was established in 1842 following a parliamentary decision from 1836. Originally located in the Royal Palace, Oslo, it got its own museum building in 1882, designed by Heinrich Ernst and Adolf Schirmer. Former names of the museum include Den norske stats sentralmuseum for billedkunst and from 1903 to 1920 Statens Kunstmuseum. Directors include Jens Thiis (1908–1941), Sigurd Willoch (1946–1973), Knut Berg (1975–1995), Tone Skedsmo (1995–2000) and Anniken Thue (2001–2003).'
tokens_query = tokenText(query_str)
tf_vec_query = genTfVec(bow,tokens_query)
print(tf_vec_query)
print(bow)

[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Counter({'language': 7, 'city': 4, 'maritime': 4, 'natural': 4, 'county': 4, 'centre': 3, 'large': 3, 'documents': 3, 'oslo': 2, 'norway': 2, 'trade': 2, 'shipping': 2, 'europe': 2, 'companies': 2, 'within': 2, 'processing': 2, 'computer': 2, 'computers': 2, 'understanding': 2, 'trøndelag': 2, 'economic': 1, 'governmental': 1, 'also': 1, 'hub': 1, 'norwegian': 1, 'banking': 1, 'industry': 1, 'important': 1, 'industries': 1, 'home': 1, 'many': 1, 'sector': 1, 'among': 1, 'world': 1, 'shipbrokers': 1, 'insurance': 1, 'brokers': 1, 'pilot': 1, 'council': 1, 'european': 1, 'commission': 1, 'intercultural': 1, 'cities': 1, 'nlp': 1, 'subfield': 1, 'linguistics': 1, 'science'

In [83]:
print(bow.most_common(10))
print(bow.keys())
print(tf_vec_query)

[('language', 7), ('city', 4), ('maritime', 4), ('natural', 4), ('county', 4), ('centre', 3), ('large', 3), ('documents', 3), ('oslo', 2), ('norway', 2)]
dict_keys(['oslo', 'economic', 'governmental', 'centre', 'norway', 'city', 'also', 'hub', 'norwegian', 'trade', 'banking', 'industry', 'shipping', 'important', 'maritime', 'industries', 'europe', 'home', 'many', 'companies', 'within', 'sector', 'among', 'world', 'large', 'shipbrokers', 'insurance', 'brokers', 'pilot', 'council', 'european', 'commission', 'intercultural', 'cities', 'language', 'processing', 'nlp', 'subfield', 'linguistics', 'computer', 'science', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'particular', 'program', 'process', 'analyze', 'amounts', 'natural', 'data', 'goal', 'capable', 'understanding', 'contents', 'documents', 'including', 'contextual', 'nuances', 'technology', 'accurately', 'extract', 'information', 'insights', 'contained', 'well', 'categorize', 'organize', 'challeng

In [84]:
# calculate similarity
# compare tf1, tf2 and tf3
tokens1 = tokenText(sentence1)
tokens2 = tokenText(sentence2)
tokens3 = tokenText(sentence3)
tf1 = genTfVec(bow,tokens1)
tf2 = genTfVec(bow,tokens2)
tf3 = genTfVec(bow,tokens3)
print(tf1)
print(tf2)
print(tf3)

[2, 1, 1, 2, 1, 3, 1, 1, 1, 2, 1, 1, 2, 1, 4, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 2, 1, 1, 1, 1, 1, 1]


In [85]:
d12 = sum(np.asarray(tf1)- np.asarray(tf2))
print(d12)
d13 = sum(np.asarray(tf1)- np.asarray(tf3))
print(d13)

-15
6
