In [17]:
import nltk
import numpy as np
import pandas as pd

from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

import copy
import math

# English stopword list from NLTK's stopwords corpus; token_pro filters tokens against it.
# NOTE(review): presumably needs the corpus available locally (e.g. nltk.download('stopwords')) — confirm.
stops = stopwords.words('english')

In [28]:
# Three short sample documents: txt1 and txt3 both describe New York City, txt2 describes the Great Barrier Reef.
txt1 = """The bustling city of New York is a melting pot of cultures and a hub for business and innovation. With its iconic skyline, Central Park, and the Statue of Liberty, New York City has a unique charm that draws millions of tourists every year. The city that never sleeps is known for its diverse neighborhoods, such as Chinatown, Little Italy, and Harlem, each offering a distinct cultural experience. The subway system, yellow taxis, and the famous Times Square are symbols of the city's energy and pace of life. New York is home to Wall Street, the epicenter of global finance, and Silicon Alley, a burgeoning tech hub. Whether you're exploring art at the Metropolitan Museum, enjoying a Broadway show, or savoring a classic New York-style pizza, the Big Apple has something for everyone."""
txt2 = """The Great Barrier Reef is a natural wonder of the world, located off the coast of Queensland, Australia. It is the largest coral reef system on the planet, spanning over 2,300 kilometers and comprising thousands of individual reefs and islands. The reef is a haven for marine biodiversity, housing a stunning array of coral species, fish, turtles, sharks, and other marine life. Snorkeling and diving in the crystal-clear waters of the Great Barrier Reef offer a chance to witness this underwater paradise up close. Tourists from around the globe flock to explore its vibrant coral formations, including the renowned Heart Reef. This ecological wonder, however, faces challenges from climate change and pollution, making conservation efforts crucial to protect its delicate ecosystem."""
txt3 = """New York City, often referred to as the "Big Apple," is a bustling metropolis on the East Coast of the United States. It's a city known for its iconic skyline, which includes the Empire State Building and One World Trade Center, and its diverse neighborhoods, such as Brooklyn, Queens, and the Bronx. Central Park, a massive urban green space, offers a peaceful escape in the heart of the city. The city is also famous for its culinary scene, featuring a wide range of international cuisines. From the historic charm of the Statue of Liberty to the vibrant arts scene in SoHo, New York City has something for everyone. Whether you're a tourist or a resident, there's always something new and exciting happening in the city that never sleeps."""

In [19]:
def token_pro(text):
    """Tokenize ``text`` and return its cleaned tokens as a list.

    Each token produced by ``word_tokenize`` is lowercased and stripped of
    surrounding whitespace. A token is kept only if it is entirely
    alphanumeric (so punctuation and tokens containing characters such as
    apostrophes, commas or hyphens are dropped) and is not in the
    module-level ``stops`` list.
    """
    cleaned = []
    for raw_token in word_tokenize(text):
        candidate = raw_token.lower().strip()
        # Same filter order as before: alphanumeric check first, then stopword check.
        if candidate.isalnum() and candidate not in stops:
            cleaned.append(candidate)
    return cleaned

def genBow(tokens):
    """Build a bag-of-words: a ``Counter`` mapping each token to its number of occurrences."""
    return Counter(tokens)

def printBow(bow, queryString):
    """Project a document onto a vocabulary as a vector of raw term counts.

    Despite its name, this function prints nothing; it returns the vector.

    Args:
        bow: Vocabulary to iterate over (e.g. the corpus-wide ``Counter``).
            The result has one entry per key, in ``bow``'s iteration order.
        queryString: The document to count, given either as a ``Counter`` /
            mapping of token -> count or as any iterable of tokens
            (one-shot iterators included).

    Returns:
        list: ``tf_vec[i]`` is the count of the i-th ``bow`` word in
        ``queryString`` and 0 when the word is absent. Counts are raw, not
        normalized by document length.
    """
    # Count once, then look up only in the Counter. Missing keys yield 0, so no
    # separate membership test against ``queryString`` is needed. Testing the
    # raw argument again was a linear scan per word for lists and always False
    # for an iterator that Counter() had already exhausted.
    query_frequency = Counter(queryString)
    return [query_frequency[word] for word in bow]

In [29]:
# Collect the documents, then tokenize them together as one space-joined string
# so corpus_tokens covers the vocabulary of all three texts.
corpus = [txt1, txt2, txt3]
corpus_tokens = token_pro(' '.join(corpus))
corpus_tokens[:10]  # preview the first ten cleaned tokens

['bustling',
 'city',
 'new',
 'york',
 'melting',
 'pot',
 'cultures',
 'hub',
 'business',
 'innovation']

In [30]:
# Number of tokens left in the combined corpus after token_pro's filtering.
print(len(corpus_tokens))

230


In [38]:
# Corpus-wide bag-of-words; its keys define the vocabulary (and column order) below.
bow_corpus = genBow(corpus_tokens)

# Per-document bags-of-words.
bow_txt1, bow_txt2, bow_txt3 = (
    genBow(token_pro(document)) for document in (txt1, txt2, txt3)
)

# Term-count vector of each document over the corpus vocabulary.
tf_txt1, tf_txt2, tf_txt3 = (
    printBow(bow_corpus, document_bow)
    for document_bow in (bow_txt1, bow_txt2, bow_txt3)
)

# One row per document, one column per vocabulary word, for display.
tf_df = pd.DataFrame(
    [tf_txt1, tf_txt2, tf_txt3],
    columns=list(bow_corpus),
)
tf_df

Unnamed: 0,bustling,city,new,york,melting,pot,cultures,hub,business,innovation,...,international,cuisines,historic,arts,soho,tourist,resident,always,exciting,happening
0,1,4,4,3,1,1,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,3,2,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the three term-count rows of tf_df.
tf_cosine = cosine_similarity(tf_df, dense_output=True)
df2 = pd.DataFrame(tf_cosine)
df2.head()

Unnamed: 0,0,1,2
0,1.0,0.027156,0.539212
1,0.027156,1.0,0.035584
2,0.539212,0.035584,1.0


In [33]:
# remove stopwords, white spaces, and punctuation, and convert to lowercase, and only keep alpha-numeric values
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, token_pattern=r'(?u)\b[A-Za-z]+\b', norm=None)
X = vectorizer.fit_transform(corpus)
X.toarray()

array([[1.69314718, 1.28768207, 0.        , 1.69314718, 0.        ,
        0.        , 0.        , 1.28768207, 0.        , 1.69314718,
        0.        , 0.        , 0.        , 1.69314718, 1.69314718,
        1.28768207, 0.        , 1.28768207, 0.        , 0.        ,
        0.        , 1.28768207, 1.69314718, 5.15072829, 1.69314718,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.69314718, 1.69314718, 0.        , 1.69314718,
        1.28768207, 0.        , 1.69314718, 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.69314718, 1.69314718,
        1.69314718, 0.        , 0.        , 1.69314718, 0.        ,
        1.69314718, 0.        , 1.28768207, 0.        , 1.69314718,
        0.        , 0.        , 0.        , 1.69314718, 0.        ,
        0.        , 0.        , 0.        , 1.69314718, 0.        ,
        0.        , 0.        , 1.69314718, 0.  

In [37]:
# Pairwise cosine similarity between the TF-IDF document vectors, with rows and
# columns labelled by document name.
doc_labels = ['txt1', 'txt2', 'txt3']
df_tfidf = pd.DataFrame(
    cosine_similarity(X, dense_output=True),
    index=doc_labels,
    columns=doc_labels,
)
df_tfidf.head()

Unnamed: 0,txt1,txt2,txt3
txt1,1.0,0.012064,0.423433
txt2,0.012064,1.0,0.025243
txt3,0.423433,0.025243,1.0
