In [2]:
import pandas as pd
import nltk
from string import punctuation
import math

In [3]:
# Toy corpus: four lower-cased movie-review snippets used throughout the notebook.
docs = [
    "plot: two teen couples go to a church party, drink and then drive.",
    "films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before .",
    "every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody's surprise ( perhaps even the studio ) the film becomes a critical darling . ",
    "damn that y2k bug .",
]

# Tokens to discard: NLTK's English stop words, every single punctuation
# character, and a handful of extra hand-picked words.
StopWords = set(nltk.corpus.stopwords.words('english'))
StopWords |= set(punctuation)
StopWords |= {
    '.', ',', 'a', 'they', 'the', 'his', 'so', 'and',
    'were', 'from', 'that', 'of', 'in', 'only', 'with', 'to',
}

In [4]:
newdocs = []
def bagOfWords(docs):
    """Tokenize ``docs`` and build a bag-of-words count table.

    Each document is split with ``nltk.tokenize.word_tokenize`` and tokens
    found in the module-level ``StopWords`` set are dropped.

    Side effect: the module-level ``newdocs`` list is *replaced in place*
    with the filtered token lists (one list per document). Replacing rather
    than appending keeps the function idempotent, so re-running the calling
    cell does not duplicate the corpus.

    Parameters
    ----------
    docs : sequence of str
        Raw document texts.

    Returns
    -------
    dict
        ``{term: [count in doc 0, count in doc 1, ...]}`` for every remaining
        term, in order of first appearance, plus a ``'total_terms'`` entry
        holding the number of remaining tokens in each document. For an empty
        ``docs`` the result is ``{'total_terms': []}``.
    """
    tokenized = []
    for text in docs:
        tokens = nltk.tokenize.word_tokenize(text)
        tokenized.append([word for word in tokens if word not in StopWords])

    # Slice assignment mutates the existing list object, so any other name
    # already bound to `newdocs` sees the refreshed corpus.
    newdocs[:] = tokenized

    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the column order reproducible (a plain set would not be).
    vocabulary = dict.fromkeys(word for doc in tokenized for word in doc)

    # Single pass over all tokens instead of recounting per term per document.
    BOWR = {term: [0] * len(tokenized) for term in vocabulary}  # bag of words representation
    for doc_index, doc in enumerate(tokenized):
        for word in doc:
            BOWR[word][doc_index] += 1

    BOWR['total_terms'] = [len(doc) for doc in tokenized]
    return BOWR

In [5]:
# Raw term counts: one row per document, one column per term.
bag = bagOfWords(docs)
df = pd.DataFrame(
    bag,
    index=['Document ' + str(position) for position, _ in enumerate(docs)],
)
df

Unnamed: 0,drink,two,perhaps,suspect,everybody,arthouse,teen,comes,kids,spawn,...,drive,indication,book,batman,plenty,crowd,really,party,critical,total_terms
Document 0,1,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,9
Document 1,0,0,0,0,0,1,0,0,1,1,...,0,0,1,1,1,1,1,0,0,27
Document 2,0,0,1,1,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,19
Document 3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [6]:
#TF matrix
# Token count of each document, copied from the 'total_terms' row of the
# module-level `bag` (one entry per element of `docs`).
totals = [bag['total_terms'][i] for i in range(len(docs))]
def getTFMatrix(bag):
    """Convert a bag-of-words count table into term frequencies.

    Each count is divided by the token total of its document, taken from the
    ``'total_terms'`` row of the ``bag`` that was passed in (not from any
    module-level state), so the function works for any bag.

    Parameters
    ----------
    bag : dict
        ``{term: [count per document]}`` plus a ``'total_terms'`` row.

    Returns
    -------
    dict
        ``{term: [term frequency per document]}`` without the
        ``'total_terms'`` row. A document with zero tokens gets a frequency
        of ``0.0`` for every term instead of raising ``ZeroDivisionError``.
        ``bag`` itself is not modified.
    """
    doc_totals = bag['total_terms']
    return {
        term: [
            count / total if total else 0.0
            for count, total in zip(counts, doc_totals)
        ]
        for term, counts in bag.items()
        if term != 'total_terms'
    }

In [7]:
# Term frequencies: each count divided by its document's token total.
TF = getTFMatrix(bag)
df = pd.DataFrame(
    TF,
    index=['Document ' + str(position) for position, _ in enumerate(docs)],
)
df

Unnamed: 0,drink,two,perhaps,suspect,everybody,arthouse,teen,comes,kids,spawn,...,world,drive,indication,book,batman,plenty,crowd,really,party,critical
Document 0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0
Document 1,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.037037,0.037037,...,0.037037,0.0,0.0,0.037037,0.037037,0.037037,0.037037,0.037037,0.0,0.0
Document 2,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.052632,0.0,0.0,...,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
Document 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def getIDFVector(bag):
    """Compute an inverse-document-frequency value for every term in ``bag``.

    The value is ``log((1 + N) / df)`` where ``N`` is the number of documents
    and ``df`` is the number of documents whose count for the term is
    positive. Everything is derived from the ``bag`` argument, so the result
    does not depend on module-level state.

    Parameters
    ----------
    bag : dict
        ``{term: [count per document]}``, optionally with a ``'total_terms'``
        row (which is skipped and, when present, defines ``N``).

    Returns
    -------
    dict
        ``{term: idf}``. A term that occurs in no document maps to ``0``.
    """
    term_rows = {term: counts for term, counts in bag.items() if term != 'total_terms'}

    if 'total_terms' in bag:
        doc_count = len(bag['total_terms'])
    else:
        # No totals row: fall back to the longest count row.
        doc_count = max((len(counts) for counts in term_rows.values()), default=0)

    IDF = {}
    for term, counts in term_rows.items():
        doc_freq = sum(1 for count in counts if count > 0)
        IDF[term] = 0 if doc_freq == 0 else math.log((1 + doc_count) / doc_freq)
    return IDF

In [9]:
# IDF value per term, shown as a single-row table labelled 'IDF values'.
IDV = getIDFVector(bag)
df = pd.DataFrame(IDV, index = ['IDF values'])
df

Unnamed: 0,drink,two,perhaps,suspect,everybody,arthouse,teen,comes,kids,spawn,...,world,drive,indication,book,batman,plenty,crowd,really,party,critical
IDF values,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,...,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438,1.609438


In [10]:
def getTF_IDFMatrix(TF, IDV):
    """Multiply each term-frequency row by that term's IDF value.

    The number of documents is taken from the rows of ``TF`` themselves
    rather than from module-level state.

    Parameters
    ----------
    TF : dict
        ``{term: [term frequency per document]}``.
    IDV : dict
        ``{term: idf}``; must contain every key of ``TF`` (extra keys are
        ignored).

    Returns
    -------
    dict
        ``{term: [tf * idf per document]}`` with the same keys and row
        lengths as ``TF``.

    Raises
    ------
    KeyError
        If a term in ``TF`` has no entry in ``IDV``.
    """
    TF_IDF = {}
    for term, frequencies in TF.items():
        idf = IDV[term]
        TF_IDF[term] = [frequency * idf for frequency in frequencies]
    return TF_IDF

In [11]:
# TF-IDF weights: term frequency scaled by the term's IDF value.
TF_IDF = getTF_IDFMatrix(TF, IDV)
df = pd.DataFrame(
    TF_IDF,
    index=['Document ' + str(position) for position, _ in enumerate(docs)],
)
df

Unnamed: 0,drink,two,perhaps,suspect,everybody,arthouse,teen,comes,kids,spawn,...,world,drive,indication,book,batman,plenty,crowd,really,party,critical
Document 0,0.178826,0.178826,0.0,0.0,0.0,0.0,0.178826,0.0,0.0,0.0,...,0.0,0.178826,0.0,0.0,0.0,0.0,0.0,0.0,0.178826,0.0
Document 1,0.0,0.0,0.0,0.0,0.0,0.059609,0.0,0.0,0.059609,0.059609,...,0.059609,0.0,0.0,0.059609,0.059609,0.059609,0.059609,0.059609,0.0,0.0
Document 2,0.0,0.0,0.084707,0.084707,0.084707,0.0,0.0,0.084707,0.0,0.0,...,0.0,0.0,0.084707,0.0,0.0,0.0,0.0,0.0,0.0,0.084707
Document 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
def normalize(TF_IDF):
    """Scale every document's TF-IDF vector to unit Euclidean length.

    The number of documents is read from the rows of ``TF_IDF`` rather than
    from module-level state.

    Parameters
    ----------
    TF_IDF : dict
        ``{term: [weight per document]}``; all rows are expected to have the
        same length.

    Returns
    -------
    dict
        ``{term: [normalized weight per document]}``. A document whose
        weights are all zero keeps all-zero weights instead of raising
        ``ZeroDivisionError``. An empty input yields an empty dict.
    """
    doc_count = len(next(iter(TF_IDF.values()), ()))

    # Euclidean length of each document's column across all terms.
    denos = [
        sum([weights[i] ** 2 for weights in TF_IDF.values()]) ** 0.5
        for i in range(doc_count)
    ]

    norm = {}
    for term, weights in TF_IDF.items():
        norm[term] = [
            weight / length if length else 0.0
            for weight, length in zip(weights, denos)
        ]
    return norm
    

In [13]:
# Normalized TF-IDF weights, one row per document.
norm = normalize(TF_IDF)
df = pd.DataFrame(
    norm,
    index=['Document ' + str(position) for position, _ in enumerate(docs)],
)
df

Unnamed: 0,drink,two,perhaps,suspect,everybody,arthouse,teen,comes,kids,spawn,...,world,drive,indication,book,batman,plenty,crowd,really,party,critical
Document 0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0
Document 1,0.0,0.0,0.0,0.0,0.0,0.187898,0.0,0.0,0.187898,0.187898,...,0.187898,0.0,0.0,0.187898,0.187898,0.187898,0.187898,0.187898,0.0,0.0
Document 2,0.0,0.0,0.211647,0.211647,0.211647,0.0,0.0,0.211647,0.0,0.0,...,0.0,0.0,0.211647,0.0,0.0,0.0,0.0,0.0,0.0,0.211647
Document 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Echo the raw dict returned by normalize (term -> list of per-document weights).
norm

{'drink': [0.3333333333333333, 0.0, 0.0, 0.0],
 'two': [0.3333333333333333, 0.0, 0.0, 0.0],
 'perhaps': [0.0, 0.0, 0.21164730195116557, 0.0],
 'suspect': [0.0, 0.0, 0.21164730195116557, 0.0],
 'everybody': [0.0, 0.0, 0.21164730195116557, 0.0],
 'arthouse': [0.0, 0.18789780933325936, 0.0, 0.0],
 'teen': [0.3333333333333333, 0.0, 0.0, 0.0],
 'comes': [0.0, 0.0, 0.21164730195116557, 0.0],
 'kids': [0.0, 0.18789780933325936, 0.0, 0.0],
 'spawn': [0.0, 0.18789780933325936, 0.0, 0.0],
 'movie': [0.0, 0.0, 0.21164730195116557, 0.0],
 'superman': [0.0, 0.18789780933325936, 0.0, 0.0],
 'toward': [0.0, 0.18789780933325936, 0.0, 0.0],
 'superheroes': [0.0, 0.18789780933325936, 0.0, 0.0],
 'geared': [0.0, 0.18789780933325936, 0.0, 0.0],
 'never': [0.0, 0.18789780933325936, 0.0, 0.0],
 'every': [0.0, 0.0, 0.42329460390233115, 0.0],
 'go': [0.3333333333333333, 0.0, 0.0, 0.0],
 'along': [0.0, 0.0, 0.21164730195116557, 0.0],
 'darling': [0.0, 0.0, 0.21164730195116557, 0.0],
 'hell': [0.0, 0.1878978093

In [15]:
#Document similarity:
def getSimilarityResult(norm):
    """Compute and print the dot product of every pair of document vectors.

    The number of documents is read from the rows of ``norm`` rather than
    from module-level state. Pairs are visited as (i, j) with i < j and are
    reported with 1-based document numbers.

    Parameters
    ----------
    norm : dict
        ``{term: [weight per document]}``; all rows are expected to have the
        same length.

    Returns
    -------
    dict
        Maps each printed message
        (``'Similarity of document <i> with document <j> is: <value>'``) to
        its similarity value. Empty when ``norm`` has fewer than two
        documents.
    """
    doc_count = len(next(iter(norm.values()), ()))
    similarityRes = {}
    for i in range(doc_count):
        for j in range(i + 1, doc_count):
            # Plain left-to-right accumulation keeps the floating-point
            # result (and therefore the message text) stable.
            similarity = 0
            for weights in norm.values():
                similarity += weights[i] * weights[j]
            res = 'Similarity of document ' + str(i + 1) + ' with document ' + str(j + 1) + ' is: ' + str(similarity)
            similarityRes[res] = similarity
            print(res)
    return similarityRes

In [16]:
# Prints one line per document pair and keeps the message -> similarity mapping.
result = getSimilarityResult(norm)

Similarity of document 1 with document 2 is: 0.0
Similarity of document 1 with document 3 is: 0.0
Similarity of document 1 with document 4 is: 0.0
Similarity of document 2 with document 3 is: 0.012889990160975492
Similarity of document 2 with document 4 is: 0.0
Similarity of document 3 with document 4 is: 0.0


In [17]:
# Echo the mapping returned by getSimilarityResult.
result

{'Similarity of document 1 with document 2 is: 0.0': 0.0,
 'Similarity of document 1 with document 3 is: 0.0': 0.0,
 'Similarity of document 1 with document 4 is: 0.0': 0.0,
 'Similarity of document 2 with document 3 is: 0.012889990160975492': 0.012889990160975492,
 'Similarity of document 2 with document 4 is: 0.0': 0.0,
 'Similarity of document 3 with document 4 is: 0.0': 0.0}