In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk

from scipy.spatial.distance import minkowski, cosine
from IPython.display import display
from typing import Sequence, Callable

In [None]:
# Load only the first 500 rows of the CSV; the similarity-matrix calls in
# later cells are also made with N=500.
df = pd.read_csv("arxiv_data.csv", nrows=500)

In [None]:
df.head()

# Bag of Words

In [None]:
def get(index):
    """Look up the abstract text for one row of the module-level ``df``.

    Parameters
    ----------
    index
        Row label, resolved with label-based (``.loc``) indexing.

    Returns
    -------
    The value stored in the ``summaries`` column at that label.
    """
    summaries = df["summaries"]
    return summaries.loc[index]

In [None]:
def get_vocabulary(all_docs)->set:
    """Collect the distinct lowercased tokens found across a collection of documents.

    Parameters
    ----------
    all_docs
        Iterable of document strings; each is tokenized with
        ``nltk.word_tokenize``.

    Returns
    -------
    set
        Every token that occurs in any document, lowercased.
    """
    return {
        token.lower()
        for text in all_docs
        for token in nltk.word_tokenize(text)
    }

In [None]:
# Sort the vocabulary so every term gets the same column index on every run.
# Iteration order of a set of strings changes between interpreter sessions
# (string hashes are randomized), which would make the bag-of-words column
# layout non-reproducible.
vocab = sorted(get_vocabulary(df["summaries"]))
len(vocab)

In [None]:
def get_stats(i: int):
    """Print the nearest neighbour of document ``i`` under the current distance matrix.

    Reads the module-level ``mat`` (square pairwise-distance matrix) and uses
    ``get`` to fetch the document texts.

    Parameters
    ----------
    i : int
        Row/column index of the query document in ``mat``.

    Prints
    ------
    The index of the closest *other* document, the distance to it, the query
    document text and the closest document's text.
    """
    # Work on a copy so that masking the self-distance does not modify ``mat``.
    distances = np.array(mat[i], dtype=float)
    # A document must never be reported as its own nearest neighbour, so the
    # entry for ``i`` itself is excluded from the search.
    distances[i] = np.inf
    # argmin over the full row yields a true column index (no slice offset).
    most_sim = int(distances.argmin())

    print(f"Min. Dist. Index = {most_sim}")
    print(f"Min. Dist. = {mat[i, most_sim]}\n")
    print(get(i))
    print()
    print(get(most_sim))

## Term Existence

In [None]:
def bag_of_words(document: str, vocab: list):
    """Encode a document as a binary term-existence vector over ``vocab``.

    Parameters
    ----------
    document : str
        Text to encode; tokenized with ``nltk.word_tokenize`` and lowercased.
    vocab : list
        Vocabulary terms; the position of a term is its column in the result.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)`` holding 1 where the term occurs
        in the document and 0 elsewhere.

    Raises
    ------
    KeyError
        If the document contains a token that is not in ``vocab``.
    """
    column_of = {term: col for col, term in enumerate(vocab)}
    vector = np.zeros(len(vocab))

    lowered_tokens = (token.lower() for token in nltk.word_tokenize(document))
    for token in lowered_tokens:
        # Existence only: repeated occurrences leave the entry at 1.
        vector[column_of[token]] = 1

    return vector

In [None]:
def get_similarity_mat(N, dist):
    """Build the symmetric pairwise distance matrix for the first ``N`` documents.

    Documents are fetched with ``get``, vectorized with the currently defined
    ``bag_of_words`` over the module-level ``vocab``, and compared with ``dist``.

    Parameters
    ----------
    N : int
        Number of documents (row labels ``0 .. N-1``) to compare.
    dist : callable
        ``dist(u, v) -> float`` applied to two bag-of-words vectors.

    Returns
    -------
    numpy.ndarray
        ``(N, N)`` array where entry ``[i, j]`` is ``dist`` between documents
        ``i`` and ``j``. A preview of the top-left corner is displayed as a
        side effect.
    """
    bows = [bag_of_words(get(i), vocab) for i in range(N)]

    # Size everything from the N argument; the previous body referred to an
    # undefined lowercase ``n``.
    sim_mat = np.zeros((N, N))
    for i in range(N):
        for j in range(i, N):
            # Each unordered pair is evaluated once and mirrored across the diagonal.
            sim_mat[i, j] = sim_mat[j, i] = dist(bows[i], bows[j])

    # Show only the top-left corner; label slicing with .loc is end-inclusive.
    display(pd.DataFrame(data=sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

### Euclidean Distance

In [None]:
# minkowski is passed without an explicit order; SciPy documents its default
# as p=2, i.e. the Euclidean distance.
mat = get_similarity_mat(500, minkowski)

In [None]:
get_stats(0)

> COMMENTS

### Cosine Distance

In [None]:
mat = get_similarity_mat(500, cosine)

In [None]:
get_stats(0)

> COMMENT

### Dot Product Distance

In [None]:
# The reciprocal turns the dot product (larger = more overlap) into a
# distance-like score (smaller = more overlap).
mat = get_similarity_mat(500, lambda a,b: 1/np.dot(a,b))

In [None]:
get_stats(0)

> Comments

> ## Final comments

## Term Weighting

In [None]:
def bag_of_words(document: str, vocab: list):
    """Encode a document as a raw term-count vector over ``vocab``.

    Parameters
    ----------
    document : str
        Text to encode; tokenized with ``nltk.word_tokenize`` and lowercased.
    vocab : list
        Vocabulary terms; the position of a term is its column in the result.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)`` whose entries are the number of
        times each term occurs in the document.

    Raises
    ------
    KeyError
        If the document contains a token that is not in ``vocab``.
    """
    column_of = {term: col for col, term in enumerate(vocab)}
    counts = np.zeros(len(vocab))

    for token in nltk.word_tokenize(document):
        col = column_of[token.lower()]
        # Weighting by frequency: every occurrence adds one to the term's entry.
        counts[col] = counts[col] + 1

    return counts

In [None]:
def get_similarity_mat(N, dist):
    """Build the symmetric pairwise distance matrix for the first ``N`` documents.

    Documents are fetched with ``get``, vectorized with the currently defined
    ``bag_of_words`` over the module-level ``vocab``, and compared with ``dist``.

    Parameters
    ----------
    N : int
        Number of documents (row labels ``0 .. N-1``) to compare.
    dist : callable
        ``dist(u, v) -> float`` applied to two bag-of-words vectors.

    Returns
    -------
    numpy.ndarray
        ``(N, N)`` array where entry ``[i, j]`` is ``dist`` between documents
        ``i`` and ``j``. A preview of the top-left corner is displayed as a
        side effect.
    """
    bows = [bag_of_words(get(i), vocab) for i in range(N)]

    # Size everything from the N argument; the previous body referred to an
    # undefined lowercase ``n``.
    sim_mat = np.zeros((N, N))
    for i in range(N):
        for j in range(i, N):
            # Each unordered pair is evaluated once and mirrored across the diagonal.
            sim_mat[i, j] = sim_mat[j, i] = dist(bows[i], bows[j])

    # Show only the top-left corner; label slicing with .loc is end-inclusive.
    display(pd.DataFrame(data=sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

### Euclidean Distance

In [None]:
# minkowski is passed without an explicit order; SciPy documents its default
# as p=2, i.e. the Euclidean distance.
mat = get_similarity_mat(500, minkowski)  

In [None]:
get_stats(0)

> Comments

### Cosine Distance

In [None]:
mat = get_similarity_mat(500, cosine)  

In [None]:
get_stats(0)

> comment

### Dot Product Distance

In [None]:
# The reciprocal turns the dot product (larger = more overlap) into a
# distance-like score (smaller = more overlap).
mat = get_similarity_mat(500, lambda a,b: 1/np.dot(a,b))

In [None]:
get_stats(0)

## Term Frequency Transformation

In [None]:
def bag_of_words(document: str, vocab: list, t: Callable) -> Sequence:
    """Encode a document as a transformed term-count vector over ``vocab``.

    Parameters
    ----------
    document : str
        Text to encode; tokenized with ``nltk.word_tokenize`` and lowercased.
    vocab : list
        Vocabulary terms; the position of a term is its column in the result.
    t : Callable
        Transformation applied to the raw count vector before it is returned.

    Returns
    -------
    Sequence
        Whatever ``t`` returns when given the float count vector of length
        ``len(vocab)``.

    Raises
    ------
    KeyError
        If the document contains a token that is not in ``vocab``.
    """
    column_of = {term: col for col, term in enumerate(vocab)}
    counts = np.zeros(len(vocab))

    for token in nltk.word_tokenize(document):
        col = column_of[token.lower()]
        counts[col] = counts[col] + 1

    transformed = t(counts)
    return transformed

In [None]:
def get_similarity_mat(N: int, dist: Callable, t: Callable):
    """Build the symmetric pairwise distance matrix for the first ``N`` documents.

    Documents are fetched with ``get`` and vectorized with ``bag_of_words``
    over the module-level ``vocab``, passing ``t`` through as the
    term-frequency transformation; the vectors are then compared with ``dist``.

    Parameters
    ----------
    N : int
        Number of documents (row labels ``0 .. N-1``) to compare.
    dist : Callable
        ``dist(u, v) -> float`` applied to two transformed vectors.
    t : Callable
        Term-frequency transformation forwarded to ``bag_of_words``.

    Returns
    -------
    numpy.ndarray
        ``(N, N)`` array where entry ``[i, j]`` is ``dist`` between documents
        ``i`` and ``j``. A preview of the top-left corner is displayed as a
        side effect.
    """
    bows = [bag_of_words(get(i), vocab, t) for i in range(N)]

    # Size everything from the N argument; the previous body referred to an
    # undefined lowercase ``n``.
    sim_mat = np.zeros((N, N))
    for i in range(N):
        for j in range(i, N):
            # Each unordered pair is evaluated once and mirrored across the diagonal.
            sim_mat[i, j] = sim_mat[j, i] = dist(bows[i], bows[j])

    # Show only the top-left corner; label slicing with .loc is end-inclusive.
    display(pd.DataFrame(data=sim_mat, columns=range(N), index=range(N)).loc[0:5, 0:5])
    return sim_mat

In [None]:
def t1(arr):
    """Logarithmic term-frequency damping: ``log2(1 + tf)``, element-wise."""
    return np.log2(1 + arr)


def t2(arr):
    """Double-logarithmic damping: ``log2(1 + log2(1 + tf))``, element-wise."""
    return np.log2(1 + t1(arr))


def t3(arr, k):
    """BM25-style saturating transformation: ``(k + 1) * tf / (tf + k)``.

    Parameters
    ----------
    arr
        Term-count vector (or scalar).
    k
        Saturation parameter; the result approaches ``k + 1`` as the count grows.
    """
    return (k + 1) * arr / (arr + k)

#### Comparing the Effect of Different Frequency Transformations

Because the book uses the vector dot product as its similarity measure, that measure is the one used for this step.

In [None]:
def dot_dist(x, y):
    """Distance-like score derived from the dot product: ``1 / (x . y)``.

    A larger dot product (more shared, heavily weighted terms) gives a smaller
    value. Vectors whose dot product is zero are treated as infinitely far
    apart.

    Parameters
    ----------
    x, y
        Vectors of equal length.

    Returns
    -------
    numpy.float64
        The reciprocal of ``np.dot(x, y)``; ``inf`` when the dot product is 0.
    """
    similarity = np.dot(x, y)
    # A zero dot product is an expected case (no overlapping terms), so the
    # division-by-zero warning is silenced and the resulting inf is returned.
    with np.errstate(divide="ignore"):
        return np.float64(1.0) / similarity

### First Transform

In [None]:
mat = get_similarity_mat(500, dot_dist, t1)

In [None]:
get_stats(0)

### Second Transform

In [None]:
mat = get_similarity_mat(500, dot_dist, t2)

In [None]:
get_stats(0)

### Third Transform

In [None]:
# Fix the saturation parameter and wrap t3 so it matches the single-argument
# transform signature that get_similarity_mat forwards to bag_of_words.
# The lambda reads the global k when it is called, so reassigning k later
# also changes what t3_k computes.
k=1.2
t3_k = lambda arr: t3(arr, k)

In [None]:
mat = get_similarity_mat(500, dot_dist, t3_k)

In [None]:
get_stats(0)