In [2]:
import numpy as np
import pandas as pd

from nltk import word_tokenize, sent_tokenize
import nltk
from collections import Counter
stop_words = nltk.corpus.stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer

from urllib import request
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
tromso_description = """Tromsø is a beautiful city between FJORDS, ISLANDS AND MOUNTAINS, with a visible past, a fascinating history, a lively, colourful city centre, an inclusive nightlife and numerous attractions. Use the city as a base to foray into Arctic wilderness chasing Midnight Sun and Northern Lights. 01. 02."""
oslo_description = """Oslo is considered as a global city and is the major Norwegian hub for trading, shipping and banking. Location of Oslo: OSLO IS POSITIONED AT THE NORTHERNMOST END OF THE OSLOFJORD and occupies around 40 big and small islands within its limits. The climate of the region is temperate, humid."""
mining_course_description = """The aim of the course is to introduce the students to the concepts and techniques of natural languages processing and analysis, unstructured information analysis and management for better decision- making by deriving valuable insights from enterprise content regardless of source or format. The course provides deep and rich knowledge of text analysis techniques and applications including sentiment analysis and opinion mining, information access and text mining, document classification, topic extraction and other techniques and applications using real-world data and cases."""

In [4]:
def token_pro(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens] # everything lowercase
    tokens = [word.strip() for word in tokens] # strips whitespaces
    tokens = [word for word in tokens if word.isalnum()] # only alpha numerical values
    tokens = [word for word in tokens if not word in stop_words] # removes stopwords
    return tokens 

def genBow(tokens):
    bow = Counter(tokens)
    return bow

def printBow(bow, queryString):
    query_frequency = Counter(queryString)
    tf_vec = []
    for word in bow:
        if word in queryString:
            tf = query_frequency[word] #/len(bow)
        else:
            tf = 0
        tf_vec.append(tf)
    return tf_vec

In [5]:
token_tromso = token_pro(tromso_description)
token_oslo = token_pro(oslo_description)
token_nlp =  token_pro(mining_course_description)


print(token_tromso[:10])
print(token_oslo[:10])
print(token_nlp[:10])


['tromsø', 'beautiful', 'city', 'fjords', 'islands', 'mountains', 'visible', 'past', 'fascinating', 'history']
['oslo', 'considered', 'global', 'city', 'major', 'norwegian', 'hub', 'trading', 'shipping', 'banking']
['aim', 'course', 'introduce', 'students', 'concepts', 'techniques', 'natural', 'languages', 'processing', 'analysis']


In [6]:
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(raw[:200])

token_CP = token_pro(raw)
print(token_CP[:10])

The Project Gutenberg eBook of Crime and Punishment, by Fyodor Dostoevsky

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with a
['project', 'gutenberg', 'ebook', 'crime', 'punishment', 'fyodor', 'dostoevsky', 'ebook', 'use', 'anyone']


In [7]:
corpus = [tromso_description, oslo_description, mining_course_description, raw]
tokens_corpus = token_pro((' ').join(corpus))
tokens_corpus[:10]

['tromsø',
 'beautiful',
 'city',
 'fjords',
 'islands',
 'mountains',
 'visible',
 'past',
 'fascinating',
 'history']

In [8]:
bow_corpus = genBow(tokens_corpus)
bow_corpus.most_common(10)

[('raskolnikov', 782),
 ('one', 638),
 ('would', 572),
 ('know', 524),
 ('said', 518),
 ('could', 496),
 ('come', 476),
 ('man', 474),
 ('like', 452),
 ('though', 443)]

In [9]:
bow_tromso = genBow(token_tromso)
bow_oslo = genBow(token_oslo)
bow_nlp = genBow(token_nlp)
bow_CP = genBow(token_CP)

#running our sentences through the tf function:
tf_tromso = printBow(bow_corpus, bow_tromso)
tf_oslo = printBow(bow_corpus, bow_oslo)
tf_nlp = printBow(bow_corpus, bow_nlp)
tf_CP = printBow(bow_corpus, bow_CP)

#Converting to dataframe for visualization
tf_df= pd.DataFrame([tf_tromso, tf_oslo, tf_nlp, tf_CP])
tf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9334,9335,9336,9337,9338,9339,9340,9341,9342,9343
0,1,1,3,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,7,3,0,4,0,1,34,5,6,...,1,1,1,1,1,1,1,1,1,1


In [37]:
vectorizer = CountVectorizer(stop_words='english', lowercase=True, token_pattern=r'(?u)\b[A-Za-z]+\b')
X = vectorizer.fit_transform(corpus)
sklearnBow = pd.DataFrame(X.toarray())
sklearnBow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9302,9303,9304,9305,9306,9307,9308,9309,9310,9311
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,3,10,1,1,2,1,2,2,1,...,3,10,90,2,1,2,1,1,1,87


In [38]:
from sklearn.metrics.pairwise import cosine_similarity
vec_sklearn = pd.DataFrame(cosine_similarity(X, dense_output=True), columns=["tromso", "oslo", "nlp", "CP"], index=["tromso", "oslo", "nlp", "CP"])
vec_sklearn.head()

Unnamed: 0,tromso,oslo,nlp,CP
tromso,1.0,0.121268,0.0,0.00829
oslo,0.121268,1.0,0.0,0.008098
nlp,0.0,0.0,1.0,0.026442
CP,0.00829,0.008098,0.026442,1.0


In [39]:
vec_bow = pd.DataFrame(cosine_similarity(tf_df, dense_output=True), columns=["tromso", "oslo", "nlp", "CP"], index=["tromso", "oslo", "nlp", "CP"])
vec_bow.head()

Unnamed: 0,tromso,oslo,nlp,CP
tromso,1.0,0.111154,0.0,0.008419
oslo,0.111154,1.0,0.0,0.010073
nlp,0.0,0.0,1.0,0.0264
CP,0.008419,0.010073,0.0264,1.0


In [40]:
queryString1 = "Tromsø is a beautiful city between FJORDS, ISLANDS AND MOUNTAINS, with a visible past, a fascinating history"
queryString2 = "Oslo is considered as a global city and is the major Norwegian hub for trading, shipping and banking"
queryString3 = "The aim of the course is to introduce the students to the concepts and techniques of natural languages valuable insights from enterprise content regardless of source or format"

query1 = token_pro(queryString1)
query2 = token_pro(queryString2)
query3 = token_pro(queryString3)

tf_query1 = printBow(bow_corpus, query1)
tf_query2 = printBow(bow_corpus, query2)
tf_query3 = printBow(bow_corpus, query3)

tf_query_df= pd.DataFrame([tf_query1, tf_query2, tf_query3])
tf_query_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9334,9335,9336,9337,9338,9339,9340,9341,9342,9343
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
vec_query = pd.DataFrame(cosine_similarity(tf_query_df, dense_output=True))
vec_query.head()

Unnamed: 0,0,1,2
0,1.0,0.1,0.0
1,0.1,1.0,0.0
2,0.0,0.0,1.0


In [41]:
d12 = sum(np.asarray(tf_query1)- np.asarray(tf_query2))
print(d12)
d13 = sum(np.asarray(tf_query1)- np.asarray(tf_query3))
print(d13)

0
-5
