# KUC, PGR210
# P6.2
# 14:30 - 16:00
# Suppose you have three texts, describing the city of Oslo, NLP, and the city of Trondheim, i.e.,

txt1  = "Oslo is the economic and governmental centre of Norway. The city is also a hub of Norwegian trade, banking, industry and shipping. It is an important centre for maritime industries and maritime trade in Europe. The city is home to many companies within the maritime sector, some of which are among the world's largest shipping companies, shipbrokers and maritime insurance brokers. Oslo is a pilot city of the Council of Europe and the European Commission intercultural cities programme."
txt2 = "Natural language processing (NLP) is a subfield of " \
            "linguistics, computer science, and artificial intelligence " \
            "concerned with the interactions between computers and " \
            "human language, in particular how to program computers " \
            "to process and analyze large amounts of natural language data." \
            " The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation."
txt3 = "Trondheim has a very mild climate for its northerly latitude, resulting in moderate summers and winters that often remain above the freezing point in seaside areas. On higher elevation, though, the microclimate is colder and snowier. The city functions as the seat of the County Mayor of Trøndelag county, but not as the administrative centre, which is Steinkjer. This is to make the county more efficient and not too centralized, as Trøndelag is the third largest county in Norway."

Implement code with the same functionality as in P6.1; this time, however, the code should import sklearn.feature_extraction.text for the implementation:
1. Have a group discussion on the corpus: compare the generated corpus with the corpus in P6.1.
2. Generate the BoW (bag of words) from scratch, represent each text, and understand the representation.
3. Generate the TF-IDF from scratch, represent each text, and understand the representation.
4. Find the two most similar texts among the three given texts, using BoW and TF-IDF respectively, each combined with cosine_similarity.



In [1]:
import nltk
import numpy as np
import pandas as pd

from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

import copy
import math

# English stopwords from NLTK; token_pro drops every token found in this collection.
stops = stopwords.words('english')

In [2]:
# The three input documents: a text about Oslo, a text about NLP, and a text about Trondheim.
txt1  = "Oslo is the economic and governmental centre of Norway. The city is also a hub of Norwegian trade, banking, industry and shipping. It is an important centre for maritime industries and maritime trade in Europe. The city is home to many companies within the maritime sector, some of which are among the world's largest shipping companies, shipbrokers and maritime insurance brokers. Oslo is a pilot city of the Council of Europe and the European Commission intercultural cities programme."
txt2 = "Natural language processing (NLP) is a subfield of " \
            "linguistics, computer science, and artificial intelligence " \
            "concerned with the interactions between computers and " \
            "human language, in particular how to program computers " \
            "to process and analyze large amounts of natural language data." \
            " The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation."
txt3 = "Trondheim has a very mild climate for its northerly latitude, resulting in moderate summers and winters that often remain above the freezing point in seaside areas. On higher elevation, though, the microclimate is colder and snowier. The city functions as the seat of the County Mayor of Trøndelag county, but not as the administrative centre, which is Steinkjer. This is to make the county more efficient and not too centralized, as Trøndelag is the third largest county in Norway."


In [3]:
def token_pro(text):
    """Tokenize ``text`` and return its normalized tokens as a list.

    Each token produced by ``word_tokenize`` is lowercased and stripped of
    surrounding whitespace. Tokens that are not purely alphanumeric, or
    that appear in the module-level ``stops`` collection, are discarded.
    """
    kept = []
    for raw in word_tokenize(text):
        candidate = raw.lower().strip()
        # Skip punctuation and anything else that is not purely alphanumeric.
        if not candidate.isalnum():
            continue
        # Skip stopwords.
        if candidate in stops:
            continue
        kept.append(candidate)
    return kept

def genBow(tokens):
    """Return a ``Counter`` mapping each token in ``tokens`` to its count."""
    counts = Counter()
    counts.update(tokens)
    return counts

def printBow(bow, queryString):
    """Return the term-frequency vector of a document over a vocabulary.

    Despite its name, this function prints nothing; it builds and returns
    the vector.

    Args:
        bow: Iterable of vocabulary terms, e.g. the corpus-level ``Counter``.
            Its iteration order fixes the order of the returned vector.
        queryString: The document, given either as an iterable of tokens or
            as a mapping from token to count (such as a ``Counter``).
            One-shot iterators are handled correctly.

    Returns:
        A list with one entry per term in ``bow``: the number of times the
        term occurs in the document, or 0 when it does not occur.
    """
    query_frequency = Counter(queryString)
    # A Counter yields 0 for missing keys, so no separate membership test on
    # queryString is needed. Such a test would be a linear scan for a token
    # list and would always fail for an iterator that Counter has already
    # consumed.
    return [query_frequency[word] for word in bow]

In [4]:
# The raw documents, kept together as the corpus.
corpus = [txt1, txt2, txt3]
# Tokenize all three documents as one space-joined string to obtain the corpus-wide token list.
joined_corpus = " ".join(corpus)
tokens_corpus = token_pro(joined_corpus)
# Preview the first ten tokens.
tokens_corpus[:10]

['oslo',
 'economic',
 'governmental',
 'centre',
 'norway',
 'city',
 'also',
 'hub',
 'norwegian',
 'trade']

In [5]:
# Count how often each token occurs across the whole corpus.
bow_corpus = genBow(tokens_corpus)
# Show the ten most frequent tokens together with their counts.
print(bow_corpus.most_common(10))

[('language', 7), ('natural', 5), ('city', 4), ('maritime', 4), ('county', 4), ('centre', 3), ('documents', 3), ('oslo', 2), ('norway', 2), ('trade', 2)]


In [6]:
# Per-document bag of words, built with the same tokenization as the corpus.
bow_txt1, bow_txt2, bow_txt3 = [
    genBow(token_pro(document)) for document in (txt1, txt2, txt3)
]

# Term-frequency vector of each document over the corpus vocabulary.
tf_txt1, tf_txt2, tf_txt3 = [
    printBow(bow_corpus, document_bow)
    for document_bow in (bow_txt1, bow_txt2, bow_txt3)
]

# One row per document, one column per vocabulary term, for visual inspection.
tf_rows = [tf_txt1, tf_txt2, tf_txt3]
tf_df = pd.DataFrame(tf_rows)
tf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,101,102,103,104,105,106,107,108,109,110
0,2,1,1,2,1,3,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,1,0,0,0,0,...,1,4,1,2,1,1,1,1,1,1


In [11]:
# Build TF-IDF vectors for the three raw texts using the vectorizer's default settings.
# NOTE(review): the vectorizer tokenizes the raw strings itself; token_pro and
# the NLTK stopword list are not applied here, so its vocabulary presumably
# differs from bow_corpus — confirm when comparing with P6.1.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Display the document-term matrix as a dense array (one row per text).
X.toarray()

array([[0.        , 0.        , 0.        , 0.09191602, 0.09191602,
        0.        , 0.09191602, 0.        , 0.27143541, 0.09191602,
        0.        , 0.        , 0.        , 0.09191602, 0.        ,
        0.09191602, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.139809  , 0.        , 0.09191602, 0.20971351,
        0.        , 0.        , 0.09191602, 0.18383204, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.09191602, 0.        , 0.        , 0.        , 0.09191602,
        0.        , 0.        , 0.18383204, 0.09191602, 0.        ,
        0.0699045 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.09191602, 0.        , 0.        , 0.09191602,
        0.        , 0.09191602, 0.        , 0.09191602, 0.05428708,
        0.        , 0.09191602, 0.09191602, 0.        , 0.        ,
        0.09191602, 0.        , 0.        , 0.09191602, 0.        ,
        0.27143541, 0.09191602, 0.        , 0.  

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of X, i.e. between the TF-IDF vectors of the texts.
df2 = pd.DataFrame(cosine_similarity(X, dense_output=True))
# The frame has one row per text (three here), so head() displays all of it.
df2.head()

Unnamed: 0,0,1,2
0,1.0,0.292792,0.36896
1,0.292792,1.0,0.289598
2,0.36896,0.289598,1.0
