# KUC, PGR210
# P6.1
# 12:30 - 14:30
# Suppose you have three texts, describing the city of Oslo, NLP, and the city of Trondheim, i.e.,

txt1  = "Oslo is the economic and governmental centre of Norway. The city is also a hub of Norwegian trade, banking, industry and shipping. It is an important centre for maritime industries and maritime trade in Europe. The city is home to many companies within the maritime sector, some of which are among the world's largest shipping companies, shipbrokers and maritime insurance brokers. Oslo is a pilot city of the Council of Europe and the European Commission intercultural cities programme."
txt2 = "Natural language processing (NLP) is a subfield of " \
            "linguistics, computer science, and artificial intelligence " \
            "concerned with the interactions between computers and " \
            "human language, in particular how to program computers " \
            "to process and analyze large amounts of natural language data." \
            " The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation."
tx3 = "Trondheim has a very mild climate for its northerly latitude, resulting in moderate summers and winters that often remain above the freezing point in seaside areas. On higher elevation, though, the microclimate is colder and snowier. The city functions as the seat of the County Mayor of Trøndelag county, but not as the administrative centre, which is Steinkjer. This is to make the county more efficient and not too centralized, as Trøndelag is the third largest county in Norway."

Implement the code to apply the following:
1. call the method token_pro() to generate tokens from three texts and then form a corpus.
2. Have a group discussion on corpus: compare your corpus with the corpus generated by your group member, and analyze the difference.
3. generate BoW (bag of words) from scratch, represent each text and understand the representation.
4. generate TF-IDF from scratch, represent each text and understand the representation.
5. implement cosine function by only import math
6. given a query string, apply the cosine function on the generated BoW and TF-IDF to find the most similar texts through BoW and TF-IDF, respectively.



In [137]:
txt1  = "Oslo is the economic and governmental centre of Norway. The city is also a hub of Norwegian trade, banking, industry and shipping. It is an important centre for maritime industries and maritime trade in Europe. The city is home to many companies within the maritime sector, some of which are among the world's largest shipping companies, shipbrokers and maritime insurance brokers. Oslo is a pilot city of the Council of Europe and the European Commission intercultural cities programme."
txt2 = "Natural language processing (NLP) is a subfield of " \
            "linguistics, computer science, and artificial intelligence " \
            "concerned with the interactions between computers and " \
            "human language, in particular how to program computers " \
            "to process and analyze large amounts of natural language data." \
            " The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation."
txt3 = "Trondheim has a very mild climate for its northerly latitude, resulting in moderate summers and winters that often remain above the freezing point in seaside areas. On higher elevation, though, the microclimate is colder and snowier. The city functions as the seat of the County Mayor of Trøndelag county, but not as the administrative centre, which is Steinkjer. This is to make the county more efficient and not too centralized, as Trøndelag is the third largest county in Norway."


In [138]:
import nltk
import numpy as np
import pandas as pd

from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords

import copy
import math

stops = stopwords.words('english')

In [179]:
def token_pro(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens] # everything lowercase
    tokens = [word.strip() for word in tokens] # strips whitespaces
    tokens = [word for word in tokens if word.isalnum()] # only alpha numerical values
    tokens = [word for word in tokens if not word in stops] # removes stopwords
    return tokens 

def genBow(tokens):
    bow = Counter(tokens)
    return bow

def printBow(bow, queryString):
    query_frequency = Counter(queryString)
    tf_vec = []
    for word in bow:
        if word in queryString:
            tf = query_frequency[word] #/len(bow)
        else:
            tf = 0
        tf_vec.append(tf)
    return tf_vec

def compute_tfidf_vectors(corpus, vector_template):
    tokenizer = TreebankWordTokenizer()
    tfidfDoc = []
    for doc in corpus:
        vec = copy.copy(vector_template)
        tokens = tokenizer.tokenize(doc.lower())
        token_counts = Counter(tokens)

        for key, value in token_counts.items():
            docs_containing_key = 0
            for _doc in corpus:
                if key in _doc.lower():
                    docs_containing_key += 1

            tf = value / len(tokens)

            if docs_containing_key:
                idf = len(corpus) / docs_containing_key
            else:
                idf = 0
            vec[key] = tf * idf

        tfidfDoc.append(vec)
    
    return tfidfDoc


In [171]:
corpus = [txt1, txt2, txt3]
tokens_corpus = token_pro((' ').join(corpus))
tokens_corpus

['oslo',
 'economic',
 'governmental',
 'centre',
 'norway',
 'city',
 'also',
 'hub',
 'norwegian',
 'trade',
 'banking',
 'industry',
 'shipping',
 'important',
 'centre',
 'maritime',
 'industries',
 'maritime',
 'trade',
 'europe',
 'city',
 'home',
 'many',
 'companies',
 'within',
 'maritime',
 'sector',
 'among',
 'world',
 'largest',
 'shipping',
 'companies',
 'shipbrokers',
 'maritime',
 'insurance',
 'brokers',
 'oslo',
 'pilot',
 'city',
 'council',
 'europe',
 'european',
 'commission',
 'intercultural',
 'cities',
 'programme',
 'natural',
 'language',
 'processing',
 'nlp',
 'subfield',
 'linguistics',
 'computer',
 'science',
 'artificial',
 'intelligence',
 'concerned',
 'interactions',
 'computers',
 'human',
 'language',
 'particular',
 'program',
 'computers',
 'process',
 'analyze',
 'large',
 'amounts',
 'natural',
 'language',
 'data',
 'goal',
 'computer',
 'capable',
 'understanding',
 'contents',
 'documents',
 'including',
 'contextual',
 'nuances',
 'languag

In [172]:
print(len(tokens_corpus))

147


In [173]:
bow_corpus = genBow(tokens_corpus)
print(bow_corpus.most_common(10))

[('language', 7), ('natural', 5), ('city', 4), ('maritime', 4), ('county', 4), ('centre', 3), ('documents', 3), ('oslo', 2), ('norway', 2), ('trade', 2)]


In [174]:
bow_txt1 = genBow(token_pro(txt1))
bow_txt2 = genBow(token_pro(txt2))
bow_txt3 = genBow(token_pro(txt3))

#running our sentences through the tf function:
tf_txt1 = printBow(bow_corpus, bow_txt1)
tf_txt2 = printBow(bow_corpus, bow_txt2)
tf_txt3 = printBow(bow_corpus, bow_txt3)

#Converting to dataframe for visualization
tf_df= pd.DataFrame([tf_txt1, tf_txt2, tf_txt3])
tf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,2,1,1,2,1,3,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,0,0,0,...,1,4,1,2,1,1,1,1,1,1


In [182]:
vector_template = OrderedDict((token, 0) for token in tokens_corpus)

tfidf = compute_tfidf_vectors(corpus, vector_template)

In [168]:
type(tfidf)

list

In [169]:
tfidf

[OrderedDict([('oslo', 0.07142857142857142),
              ('economic', 0.03571428571428571),
              ('governmental', 0.03571428571428571),
              ('centre', 0.03571428571428571),
              ('norway', 0),
              ('city', 0.05357142857142857),
              ('also', 0.03571428571428571),
              ('hub', 0.03571428571428571),
              ('norwegian', 0.03571428571428571),
              ('trade', 0.07142857142857142),
              ('banking', 0.03571428571428571),
              ('industry', 0.03571428571428571),
              ('shipping', 0.03571428571428571),
              ('important', 0.03571428571428571),
              ('maritime', 0.14285714285714285),
              ('industries', 0.03571428571428571),
              ('europe', 0.03571428571428571),
              ('home', 0.03571428571428571),
              ('many', 0.03571428571428571),
              ('companies', 0.07142857142857142),
              ('within', 0.017857142857142856),
              ('

In [185]:
def cosineSimilarity(vec1, vec2):
    vec1 = [val for val in vec1.values()]
    vec2 = [val for val in vec2.values()]

    dot_prod = 0
    for i, v in enumerate(vec1):
        dot_prod += v * vec2[i]

    mag_1 = math.sqrt(sum([x**2 for x in vec1]))
    mag_2 = math.sqrt(sum([x**2 for x in vec2]))

    return dot_prod / (mag_1 * mag_2)

In [190]:
print(cosineSimilarity(tfidf[0], tfidf[1]))
print(cosineSimilarity(tfidf[0], tfidf[2]))
print(cosineSimilarity(tfidf[1], tfidf[2]))



0.16494371011065506
0.2259734240110995
0.22019875950437795
