In [None]:
#!/usr/bin/env python3#!/usr/ 
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 17 18:10:43 2018

@author: jacobjohn

A) Consider the following two sentences
1. Term frequency matrix is important for ranking docs.
2. TFIDF is more important than Term frequency matrix for the same.

    i) Find TF MATRIX, IDF values of each term and finally TF*IDF MATRIX.
    ii) Find cosine similarity also.


B) Implement PAGE RANK ALGORITHM. Take input for adjacency matrix (no need to visualise the directed graph), 
   find stochastic matrix, find transpose of it. Consider dumping factor 0.7. Consider initial P values as all 1s.  
   You can consider 5 nodes. Calculate page rank until 2 iterations and display the ranks.
   
C) Implement Ellias Gamma, Ellias Delta and Golomb coding
"""

In [2]:
from __future__ import division
import string
import math
 
tokenize = lambda doc: doc.lower().split(" ")
 
document_0 = "Term frequency matrix is important for ranking docs."
document_1 = "TFIDF is more important than Term frequency matrix for the same."

all_documents = [document_0,document_1]
 
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)
 
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)
 
def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)
 
def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))
 
def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values
 
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
 
tfidf_representation = tfidf(all_documents)
print("Sentence is: ",document_0)
print("TF-IDF Matrix: ")
print(tfidf_representation[0])
print("\n")
print("Sentence is: ", document_1)
print("TF-IDF Matrix: ")
print(tfidf_representation[1])

Sentence is:  Term frequency matrix is important for ranking docs.
TF-IDF Matrix: 
[1.0, 1.0, 1.6931471805599454, 0.0, 1.0, 1.0, 1.6931471805599454, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0]


Sentence is:  TFIDF is more important than Term frequency matrix for the same.
TF-IDF Matrix: 
[1.0, 1.0, 0.0, 1.6931471805599454, 1.0, 1.0, 0.0, 1.6931471805599454, 1.0, 1.6931471805599454, 1.6931471805599454, 1.0, 1.6931471805599454]


In [3]:
#in Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer
 
sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
print("Sentence is: ",document_0)
print(sklearn_representation.toarray()[0].tolist())
print("\n")
print("Sentence is: ", document_1)
print(sklearn_representation.toarray()[1].tolist())

Sentence is:  Term frequency matrix is important for ranking docs.
[0.4942890846583352, 0.29193509597604356, 0.29193509597604356, 0.29193509597604356, 0.29193509597604356, 0.29193509597604356, 0.0, 0.4942890846583352, 0.0, 0.29193509597604356, 0.0, 0.0, 0.0]


Sentence is:  TFIDF is more important than Term frequency matrix for the same.
[0.0, 0.22176418069574952, 0.22176418069574952, 0.22176418069574952, 0.22176418069574952, 0.22176418069574952, 0.37547939729419455, 0.0, 0.37547939729419455, 0.22176418069574952, 0.37547939729419455, 0.37547939729419455, 0.37547939729419455]


In [1]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

document_0 = "Term frequency matrix is important for ranking docs."
document_1 = "TFIDF is more important than Term frequency matrix for the same."

vector1 = text_to_vector(document_0)
vector2 = text_to_vector(document_1)

cosine = get_cosine(vector1, vector2)

print('Cosine:', cosine)

Cosine: 0.6396021490668313


In [5]:
"""
Page Rank
"""

#calculating page rank of a given graph
import igraph
from numpy import *

gd = igraph.Graph(directed=True)
gd.add_vertices(5) 
gd.add_edges([(0,1),(0,2),(2,0),(2,1),(2,4),(3,4),(4,3)]) 
result = gd.get_adjacency()

print(gd.get_adjacency())

[[0, 1, 1, 0, 0]
 [0, 0, 0, 0, 0]
 [1, 1, 0, 0, 1]
 [0, 0, 0, 0, 1]
 [0, 0, 0, 1, 0]]


In [6]:
#Stochastic matrix calculation
stoc = result
sum = [0,0,0,0,0]
for i in range(5):
    for j in range(5):
        sum[i] += result[i,j]
        
for i in range(5):
    for j in range(5):
        if sum[i] == 0:
            stoc[i,j] = 1/5
        else:
            if stoc[i,j] > 0:
                stoc[i,j] = stoc[i,j]/sum[i]
print("Stochastic matrix is: ")
print(stoc)

Stochastic matrix is: 
[[0, 0.5, 0.5, 0, 0]
 [0.2, 0.2, 0.2, 0.2, 0.2]
 [0.3333333333333333, 0.3333333333333333, 0, 0, 0.3333333333333333]
 [0, 0, 0, 0, 1.0]
 [0, 0, 0, 1.0, 0]]


In [7]:
#Calculating transpose
trans = [[stoc[j][i] for j in range(5)] for i in range(5)]

print("Transpose is: ")
print(trans)

Transpose is: 
[[0, 0.2, 0.3333333333333333, 0, 0], [0.5, 0.2, 0.3333333333333333, 0, 0], [0.5, 0.2, 0, 0, 0], [0, 0.2, 0, 0, 1.0], [0, 0.2, 0.3333333333333333, 1.0, 0]]


In [8]:
#Page Rank Calculation
d = 0.7
n = 5
m = 5
E = [1] * n
rank = [1] * n
for i in range(n):
    E[i] = [1] * m

for it in range(2):
    for i in range(n):
        for j in range(n):
            rank[j] = ((E[i][j])/n)+((1-d)*trans[i][j])*rank[j]

for index in range(len(rank)):
    print("Page rank of",index+1,"is: ",rank[index])

sort_rank = [i[0] for i in sorted(enumerate(rank), key=lambda x:x[1], reverse = True)]

print("\nRanks are as follows: ")
for index in sort_rank:
    print("P",index+1," >> ",end = "",sep = "")

Page rank of 1 is:  0.2
Page rank of 2 is:  0.21276595744728455
Page rank of 3 is:  0.22000000000000003
Page rank of 4 is:  0.26
Page rank of 5 is:  0.2

Ranks are as follows: 
P4 >> P3 >> P2 >> P1 >> P5 >> 

In [9]:
'''
Implement Ellias Gamma, Ellias Delta and Golomb coding
'''

from math import log,ceil

log2 = lambda x: log(x,2)

def binary(x, l = 1):
	fmt = '{0:0%db}'%1
	return fmt.format(x)

def unary(x):
	return x*'1'+'0'

def elias_generic(lencoding, x):
	if x == 0: return '0'
	l = 1+int(log2(x))
	a = x - 2**(int(log2(x)))
	k = int(log2(x))
	return lencoding(l) + binary(a,k)

def golomb(b, x):
	q = int((x) / b)
	r = int((x) % b)
	l = int(ceil(log2(b)))
	#print(q,r,l)
	return unary(q) + binary(r, l)

def elias_gamma(x):
    return elias_generic(unary, x)

def elias_delta(x):
    return elias_generic(elias_gamma,x)

print("%5s: %-10s : %-10s : %-10s" %
      ("Num", "Gamma", "Delta", "Goloumb"))
for i in range(11):
	print("%5d: %-10s : %-10s : %-10s" %
	(i,elias_gamma(i),elias_delta(i), golomb(3,i)))

  Num: Gamma      : Delta      : Goloumb   
    0: 0          : 0          : 00        
    1: 100        : 1000       : 01        
    2: 1100       : 11000      : 010       
    3: 1101       : 11001      : 100       
    4: 11100      : 11010      : 101       
    5: 11101      : 11011      : 1010      
    6: 111010     : 110110     : 1100      
    7: 111011     : 110111     : 1101      
    8: 111100     : 111000     : 11010     
    9: 111101     : 111001     : 11100     
   10: 1111010    : 1110010    : 11101     
