In [1]:
##Implement Elias Delta and Elias Gamma and Golomb coding in python
from math import log,ceil

log2 = lambda x: log(x,2)

def binary(x, l = 1):
	fmt = '{0:0%db}'%1
	return fmt.format(x)

def unary(x):
	return x*'1'+'0'

def elias_generic(lencoding, x):
	if x == 0: return '0'
	l = 1+int(log2(x))
	a = x - 2**(int(log2(x)))
	k = int(log2(x))
	return lencoding(l) + binary(a,k)

def golomb(b, x):
	q = int((x) / b)
	r = int((x) % b)
	l = int(ceil(log2(b)))
	#print(q,r,l)
	return unary(q) + binary(r, l)

def elias_gamma(x):
    return elias_generic(unary, x)

def elias_delta(x):
    return elias_generic(elias_gamma,x)

print("%-46s %-20s" %
      (" ","Elias"))
print("%5s: %-11s : %-10s : %-10s : %-10s : %-10s" %
      ("Num", "Unary", "Binary", "Gamma", "Delta", "Goloumb"))
for i in range(11):
	print("%5d: %-11s : %-10s : %-10s : %-10s : %-10s" %
	(i,unary(i),binary(i),elias_gamma(i),elias_delta(i), golomb(3,i)))


                                               Elias               
  Num: Unary       : Binary     : Gamma      : Delta      : Goloumb   
    0: 0           : 0          : 0          : 0          : 00        
    1: 10          : 1          : 100        : 1000       : 01        
    2: 110         : 10         : 1100       : 11000      : 010       
    3: 1110        : 11         : 1101       : 11001      : 100       
    4: 11110       : 100        : 11100      : 11010      : 101       
    5: 111110      : 101        : 11101      : 11011      : 1010      
    6: 1111110     : 110        : 111010     : 110110     : 1100      
    7: 11111110    : 111        : 111011     : 110111     : 1101      
    8: 111111110   : 1000       : 111100     : 111000     : 11010     
    9: 1111111110  : 1001       : 111101     : 111001     : 11100     
   10: 11111111110 : 1010       : 1111010    : 1110010    : 11101     


In [2]:
#computing tf, tf-idf
import math
from textblob import TextBlob as tb

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [3]:
document1 = tb("""Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
 California and Malibu, California. Python was followed by two sequels: Python
 II (2002) and Boa vs. Python (2004), both also made-for-TV films.""")

document2 = tb("""Python, from the Greek word (πύθων/πύθωνας), is a genus of
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are
recognised.[2] A member of this genus, P. reticulatus, is among the longest
snakes known.""")

document3 = tb("""The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum".[1] It was first introduced
in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made.""")

bloblist = [document1, document2, document3]
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
	Word: python, TF-IDF: 0.01662
	Word: films, TF-IDF: 0.00997
	Word: made-for-TV, TF-IDF: 0.00665
Top words in document 2
	Word: genus, TF-IDF: 0.02192
	Word: 2, TF-IDF: 0.02192
	Word: from, TF-IDF: 0.01096
Top words in document 3
	Word: Colt, TF-IDF: 0.01367
	Word: Magnum, TF-IDF: 0.01367
	Word: revolver, TF-IDF: 0.01367


In [4]:
#Alternative method to compute TF-IDF

#Defining the TF function
def computeTF(wordDict,bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
                tfDict[word] = count / float(bowCount)
        return tfDict

In [5]:
#Calculating TF for two docs

docA = "the cat sat on my face"
docB = "the dog sat on my bed"

bowA = docA.split(" ")
bowB = docB.split(" ")

wordSet= set(bowA).union(set(bowB)) #finding vocabulary list

wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)

for word in bowA:
        wordDictA[word]+=1

for word in bowB:
        wordDictB[word]+=1

import pandas as pd

bag = pd.DataFrame([wordDictA, wordDictB])

#printing wordbag
print(bag)

   bed  cat  dog  face  my  on  sat  the
0    0    1    0     1   1   1    1    1
1    1    0    1     0   1   1    1    1


In [6]:
#computing term frequency

tfBowA = computeTF(wordDictA, bowA)
print("Term Frequency Matrix for doc A: ")
print(tfBowA)

tfBowB = computeTF(wordDictB, bowB)
print("Term Frequency Matrix for doc B: ")
print(tfBowB)

Term Frequency Matrix for doc A: 
{'my': 0.16666666666666666, 'bed': 0.0, 'face': 0.16666666666666666, 'dog': 0.0, 'the': 0.16666666666666666, 'cat': 0.16666666666666666, 'on': 0.16666666666666666, 'sat': 0.16666666666666666}
Term Frequency Matrix for doc B: 
{'my': 0.16666666666666666, 'bed': 0.16666666666666666, 'face': 0.0, 'dog': 0.16666666666666666, 'the': 0.16666666666666666, 'cat': 0.0, 'on': 0.16666666666666666, 'sat': 0.16666666666666666}


In [7]:
#defining IDF
def computeIDF(docList):
        idfDict = {}
        N = len(docList)
        #Count N of docs that contain word w
        idfDict = dict.fromkeys(docList[0].keys(),0)
        for doc in docList:
                for word, val in doc.items():
                        if val > 0:
                                idfDict[word] +=1
        for word, val in idfDict.items():
                idfDict[word] = math.log(N/ float(val))
        return idfDict

In [8]:
#inverse document frequency for A and B
idfs = computeIDF([wordDictA, wordDictB])
idfs

{'my': 0.0,
 'bed': 0.6931471805599453,
 'face': 0.6931471805599453,
 'dog': 0.6931471805599453,
 'the': 0.0,
 'cat': 0.6931471805599453,
 'on': 0.0,
 'sat': 0.0}

In [9]:
def computeTFIDF(tfBow,idfs):
        tfidf = {}
        for word, val in tfBow.items():
                tfidf[word] = val * idfs[word]
        return tfidf

In [10]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

TF = pd.DataFrame([tfidfBowA, tfidfBowB])

print(TF)

        bed       cat       dog      face   my   on  sat  the
0  0.000000  0.115525  0.000000  0.115525  0.0  0.0  0.0  0.0
1  0.115525  0.000000  0.115525  0.000000  0.0  0.0  0.0  0.0


In [14]:
#Using sklearn in python
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform([docA,docB])

print(response)

  (0, 7)	0.35464863330313684
  (0, 1)	0.49844627974580596
  (0, 6)	0.35464863330313684
  (0, 5)	0.35464863330313684
  (0, 4)	0.35464863330313684
  (0, 3)	0.49844627974580596
  (1, 7)	0.35464863330313684
  (1, 6)	0.35464863330313684
  (1, 5)	0.35464863330313684
  (1, 4)	0.35464863330313684
  (1, 2)	0.49844627974580596
  (1, 0)	0.49844627974580596
