In [31]:
# generate N-Grams
import unidecode

def generate_n_grams(text, n):
    word_lst = [str(unidecode.unidecode(w.decode('utf-8'))) for w in text.split()]
    ngram_lst = []
    for i in range(len(word_lst)-n+1):
        ngram_lst.append(word_lst[i:i+n])
        
    return ngram_lst


test_sent = """
            Justin Trudeau was the very picture of Mr. Congeniality, positive and alert. 
            Donald Trump looked as if he was struggling to stay awake. 
            You couldn’t blame him. Their news conference was a snoozer. 
            Which means that it was a resounding triumph.
            """

print(generate_n_grams(test_sent, 3))

[['Justin', 'Trudeau', 'was'], ['Trudeau', 'was', 'the'], ['was', 'the', 'very'], ['the', 'very', 'picture'], ['very', 'picture', 'of'], ['picture', 'of', 'Mr.'], ['of', 'Mr.', 'Congeniality,'], ['Mr.', 'Congeniality,', 'positive'], ['Congeniality,', 'positive', 'and'], ['positive', 'and', 'alert.'], ['and', 'alert.', 'Donald'], ['alert.', 'Donald', 'Trump'], ['Donald', 'Trump', 'looked'], ['Trump', 'looked', 'as'], ['looked', 'as', 'if'], ['as', 'if', 'he'], ['if', 'he', 'was'], ['he', 'was', 'struggling'], ['was', 'struggling', 'to'], ['struggling', 'to', 'stay'], ['to', 'stay', 'awake.'], ['stay', 'awake.', 'You'], ['awake.', 'You', "couldn't"], ['You', "couldn't", 'blame'], ["couldn't", 'blame', 'him.'], ['blame', 'him.', 'Their'], ['him.', 'Their', 'news'], ['Their', 'news', 'conference'], ['news', 'conference', 'was'], ['conference', 'was', 'a'], ['was', 'a', 'snoozer.'], ['a', 'snoozer.', 'Which'], ['snoozer.', 'Which', 'means'], ['Which', 'means', 'that'], ['means', 'that', 'it

In [36]:
# tf-idf vs tf-isf
## TF-IDF formula gives the relative importance of a term in a corpus
## ISF denotes uniqueness of that word in all sentences
## TF for a term “t” is defined as the count of a term “t” in a document “D”
## IDF for a term is defined as logarithm of ratio of total documents available 
#### in the corpus and number of documents containing the term T.
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from sklearn.preprocessing import normalize

def tf(word, doc):     
    count = doc.count(word)     
    total = len(doc)
    tf_score = count / float(total)     
    return tf_score

def n_containing(word, docs):
    count = 0     
    for doc in docs:         
        if doc.count(word) > 0:
            count += 1     
    return count

def isf(word, docs):
    doc_count = n_containing(word, docs)     
    ratio = len(docs) / float(1 + doc_count )     
    return math.log(ratio)

def tfisf(word, doc, docs):     
    tf_score = tf(word, doc)     
    isf_score = isf(word, docs)     
    return tf_score * isf_score

def compute_tfisf_scores(sentences):
    tfisf_scores = []     
    for sent in sentences:         
        sentence_score = 0         
        for word in sent:
            sentence_score += tfisf(word,sent,sentences)         
        sentence_score /= float(len(sent))         
        tfisf_scores.append(sentence_score)     
    return normalize([tfisf_scores])  # when there is .reshape(-1, 1) warning, just add a [] outside of the matrix


# tf-isf
print(compute_tfisf_scores(test_sent[1:30]))

print("*****************************************")

# tf-idf
tf_idf_vector = TfidfVectorizer()
print(tf_idf_vector.fit_transform(test_sent.split('\n')))

[[ 0.06603622  0.06603622  0.06603622  0.06603622  0.06603622  0.06603622
   0.06603622  0.06603622  0.06603622  0.06603622  0.06603622  0.06603622
   0.26786777  0.19843566  0.26786777  0.26786777  0.26786777  0.26786777
   0.06603622  0.26786777  0.26786777  0.19843566  0.26786777  0.26786777
   0.22725259  0.19843566  0.06603622  0.26786777  0.22725259]]
*****************************************
  (1, 0)	0.296800509742
  (1, 1)	0.296800509742
  (1, 20)	0.296800509742
  (1, 6)	0.296800509742
  (1, 16)	0.296800509742
  (1, 18)	0.296800509742
  (1, 19)	0.296800509742
  (1, 32)	0.296800509742
  (1, 26)	0.296800509742
  (1, 33)	0.176079617178
  (1, 30)	0.296800509742
  (1, 13)	0.296800509742
  (2, 3)	0.31080556236
  (2, 23)	0.31080556236
  (2, 28)	0.31080556236
  (2, 24)	0.31080556236
  (2, 9)	0.31080556236
  (2, 11)	0.31080556236
  (2, 2)	0.31080556236
  (2, 14)	0.31080556236
  (2, 31)	0.31080556236
  (2, 8)	0.31080556236
  (2, 33)	0.184388242745
  (3, 22)	0.346023735584
  (3, 5)	0.3460

In [24]:
# TEXT CLASSIFICATION - Naive Bayesian
## when using Naive Bayesian, it's better for each class to contain similar amount of input
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob

training_corpus = [
                     ("Does Emmanuel like ice-cream mochi?", "food"),
                     ("Is Emmanuel feeling sad, or mad or happy?", "emotion"),
                     ("Does Emmanuel miss me?", "of course"),
                     ("Strawberry ice-cream tastes better than lavanda flavor.", "of course"),
                     ("Mango ice-cream mochi tastes better than strawberry mochi.", "food"),
                     ("Avacador is the best fruit!", "food"),
                     ("Would Emmanuel like the dessert from my hometown?", "food"),
                     ("My homwtown has the best desert and vegetables and many other things!", "food"),
                     ("Emmanuel is super lovely and sweet!", "emotion"),
                     ("The weather recently is very weird", "of course"),
                     ("Nothing can be as weird as so much work load....", "emotion"),
                     ("Human beings are so complex, you can never imagine what other people will do", "emotion"),
                     ("It is so good to work with people with mild personality, although they are rare...", "emotion"),
                     ("cats like play games", "of course"),
                     ("It is easy to get cold in Spring", "of course")
                  ]

testing_corpus = [
                   ("Crepe with fruits or cream or chocolate is too sweet, with avacado and bacon is more acceptable", "food"),
                   ("Eating too much weet food makes people get depressed", "of course"),
                   ("Living in a city like Vancouver, it is difficult not to feel depressed", "emotion"),
                   ("Montreal is pretty but super boring", "of course"),
                   ("The best food is home made food", "food"),
                   ("I miss Emmanuel....", "emotion")
                 ]

# Doesen't work very well here even when the number of classes are the same
model = NBC(training_corpus)
new_input1 = "apple ice-cream sounds weird...."
new_input2 = "pineapple pie is good, but blueberry pie may be better"
new_input3 = "American food is not food"
new_input4 = "Without Emmanuel, not that happy any more...."
print(model.classify(new_input1))
print(model.classify(new_input2))
print(model.classify(new_input3))
print(model.classify(new_input4))

print(model.accuracy(testing_corpus))

of course
of course
of course
of course
0.333333333333


In [30]:
# TEXT CLASSIFICATION - SVM

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import classification_report
from sklearn import svm 

training_features = []
training_labels = []

for r in training_corpus:
    training_features.append(r[0])
    training_labels.append(r[1])
    
testing_features = []
testing_labels = []

for r in testing_corpus:
    testing_features.append(r[0])
    testing_labels.append(r[1])
    
# create feature vector
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
# vectorize training data
train_vectors = vectorizer.fit_transform(training_features)
# vectorize testing data
test_vectors = vectorizer.transform(testing_features)

# classification with SVM
svm_model = svm.SVC(kernel='linear')
svm_model.fit(train_vectors, training_labels)
prediction = svm_model.predict(test_vectors)
print(prediction)
print (classification_report(testing_labels, prediction))
print "Mean Accuracy: ", svm_model.score(test_vectors, testing_labels)

['emotion' 'of course' 'emotion' 'emotion' 'food' 'emotion']
             precision    recall  f1-score   support

    emotion       0.50      1.00      0.67         2
       food       1.00      0.50      0.67         2
  of course       1.00      0.50      0.67         2

avg / total       0.83      0.67      0.67         6

Mean Accuracy:  0.666666666667


In [33]:
# TEXT MATCHING - Levenshtein Distance
from Levenshtein import distance

s1 = "Cherry ice-cream is amazing!"
s2 = "cherry ice-cream is amazing"
s3 = "cherry ice-cream"
s4 = "Cherry ice-cream"
print distance(s1, s2)
print distance(s2, s3)
print distance(s3, s4)

2
11
1


In [11]:
# TEXT MATCHING - Phonetic Matching, match words that are phonetically similar
import fuzzy

soundex = fuzzy.Soundex(4)
print soundex('Emmanuel')
print soundex('Immanuel')
print soundex('Jupyter')
print soundex('Jupiter')

dmeta = fuzzy.DMetaphone()
print dmeta('Emmanuel')
print dmeta('Immanuel')
print dmeta('Jupyter')
print dmeta('Jupiter')

print fuzzy.nysiis('Emmanuel')
print fuzzy.nysiis('Immanuel')
print fuzzy.nysiis('Jupyter')
print fuzzy.nysiis('Jupiter')

E540
I540
J136
J136
['AMNL', None]
['AMNL', None]
['JPTR', 'APTR']
['JPTR', 'APTR']
ENANAL
INANAL
JAPATAR
JAPATAR


In [None]:
"""
For Phonetic Matching, python Fuzzy contains Soundex, DMetaphone and nysiis. 
It seems that when it comes to similar pronuncation but different spelling, DMetaphone works better. 
But if you care about both spelling and pronouncation differences, Soundex and nysiis maybe better
"""

In [17]:
# TEXT MATCHING - Cosine Similarity
import math
from collections import Counter
from Levenshtein import distance

def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in common])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 
    sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
   
    if not denominator:
        return 0.0 
    else:
        return float(numerator) / denominator

def text_to_vector(text): 
    words = text.split() 
    return Counter(words)


text1 = "I miss all the food in my hometown...."
text2 = "food in my hometown, I miss them...."
text3 = "I miss all the food"

vector1 = text_to_vector(text1) 
vector2 = text_to_vector(text2) 
vector3 = text_to_vector(text3) 
cosine1 = get_cosine(vector1, vector2)
cosine2 = get_cosine(vector1, vector3)
print cosine1, cosine2
print distance(text1, text2)
print distance(text1, text3)

0.668153104781 0.790569415042
28
19
