In [61]:
from collections import Counter
import pandas
import string
import math

text = [
    "Programming language Python is simple programming language",
    "Machine Learning with Python is simple",
    "Machine Learning with Java is common",
    "Java is Object Oriented Programming language",
    "Machine Learning with R is old"
]

In [62]:
# Find all the unique words in the text.
unique_words = list(set(" ".join(text).lower().split()))

# remove stopwords
with open("stopwords_en.txt", 'r') as f:
    stopwords = f.read().split("\n")

unique_words = [w for w in unique_words if w not in stopwords]

def make_matrix(text, vocab):
    matrix = []
    for sentence in text:
        s = sentence.lower().split()
        # Count each word in the text, and make a dictionary.
        counter = Counter(s)
        # Turn the dictionary into a matrix row using the vocab.
        row = [counter.get(w, 0) for w in vocab]
        matrix.append(row)
    df = pandas.DataFrame(matrix)
    df.columns = unique_words
    return df

print(make_matrix(text, unique_words))

   oriented  java  language  python  object  programming  machine  simple  \
0         0     0         2       1       0            2        0       1   
1         0     0         0       1       0            0        1       1   
2         0     1         0       0       0            0        1       0   
3         1     1         1       0       1            1        0       0   
4         0     0         0       0       0            0        1       0   

   common  learning  
0       0         0  
1       0         1  
2       1         1  
3       0         0  
4       0         1  


In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

cvec = CountVectorizer(lowercase=True, stop_words='english')

matrix = cvec.fit_transform(text)
vocab = cvec.get_feature_names()
cvec_counts = cvec.transform(text)

### Support Functions

In [64]:
def text2list(document):
    words = document.lower().split()
    words = [w for w in words if w not in stopwords]
    return words

In [65]:
text2list(text[0])

['programming', 'language', 'python', 'simple', 'programming', 'language']

### Functions for Scoring

In [200]:
def similarity_jaccard(query, document):
    intersection = set(query).intersection(set(document)) 
    union = set(query).union(set(document))
    return float(len(intersection))/float(len(union))

def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

def score_tf(query, tokenized_document):
    result = 0.0
    for q in query:
        count = term_frequency(q, tokenized_document)
        if count == 0:
            count = 0
        tf = 1 + math.log(count)
        print "count:",count, "\tterm:",q,"\ttf:",tf
        result = result + tf
    return result

def score_tf2(query, tokenized_document):
    result = 0.0
    for q in query:
        count = term_frequency(q, tokenized_document)
        tf = 1 + math.log(count)
        print "count:",count, "\tterm:",q,"\ttf:",tf
        result = result + tf
    return result

def inverse_document_frequencies(term, documents):
    count = 0
    for d in documents:
        tokenized_d = text2list(d)
        if term in tokenized_d:
            count = count + 1
    return math.log(len(documents)/count)



In [201]:
jaccard_similarity(text2list("python programming"), text2list(text[0]))


0.5

In [202]:
score_tf(text2list("programming"), text2list(text[0]))

count: 2 	term: programming 	tf: 1.69314718056


1.6931471805599454

In [204]:
inverse_document_frequencies("programming", text)

0.6931471805599453

#### Exercise: Create Tf-Idf scoring function 

In [207]:
def score_tfidf(query):
    score = 0.0
    #
    # implement the score tf-idf here
    #
    return score

In [209]:
score_tfidf(text2list("programming python"))


0.0