In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from spellchecker import SpellChecker
import string

In [2]:
# Sample text used throughout the walkthrough. It contains misspellings
# ('hopep', 'intersting', 'knoww'), presumably on purpose, so that the
# typo-correction step has something to fix.
text = 'The class is over. I hopep it is intersting to you. Please let me knoww if not.'

In [3]:
# Show the lower-cased text. The result is only displayed, not assigned,
# so `text` itself keeps its original casing.
text.lower()

'the class is over. i hopep it is intersting to you. please let me knoww if not.'

In [4]:
# Word tokenization: split the raw text into word and punctuation tokens.
# The original casing is kept here; the cleaning step lower-cases the text itself.
word_tokens = word_tokenize(text)
print(word_tokens)

['The', 'class', 'is', 'over', '.', 'I', 'hopep', 'it', 'is', 'intersting', 'to', 'you', '.', 'Please', 'let', 'me', 'knoww', 'if', 'not', '.']


In [5]:
# Remove stop words and punctuation.
# `stopword_list` and `punctuation_list` stay plain lists because a later cell
# uses them again. Membership is tested against a set that is built once,
# instead of concatenating the two lists and scanning them for every token.
stopword_list = stopwords.words('english')
punctuation_list = list(string.punctuation)
excluded_tokens = set(stopword_list) | set(punctuation_list)
# Tokenize the lower-cased text so capitalised words ('The', 'I') are matched
# against the stop-word list as well.
cleaned_text = [tok for tok in word_tokenize(text.lower()) if tok not in excluded_tokens]
print(cleaned_text)

['class', 'hopep', 'intersting', 'please', 'let', 'knoww']


In [6]:
# Typo correction.
# In recent pyspellchecker releases SpellChecker.correction() returns None when
# it has no candidate for a word. Fall back to the original token so the
# downstream steps (POS tagging, stemming) never receive None.
spell = SpellChecker()
corrected_text = [spell.correction(wd) or wd for wd in cleaned_text]
print(corrected_text)

['class', 'hope', 'interesting', 'please', 'let', 'know']


In [7]:
# Part-of-speech tagging of the corrected tokens.
# NOTE(review): the tagger receives the tokens left after stop-word removal
# rather than full sentences, which may explain tags such as ('hope', 'NN') and
# ('let', 'NN') in the output. Consider tagging before removing stop words.
pos_tag(corrected_text)

[('class', 'NN'),
 ('hope', 'NN'),
 ('interesting', 'VBG'),
 ('please', 'JJ'),
 ('let', 'NN'),
 ('know', 'VB')]

In [8]:
# Stemming: reduce every corrected token to its Porter stem, then display
# each word next to its stem.
porter = PorterStemmer()
stem_words = []
for word in corrected_text:
    stem_words.append(porter.stem(word))
[(word, stem) for word, stem in zip(corrected_text, stem_words)]

[('class', 'class'),
 ('hope', 'hope'),
 ('interesting', 'interest'),
 ('please', 'pleas'),
 ('let', 'let'),
 ('know', 'know')]

In [9]:
# ngram representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [10]:
# Sentence tokenization: split the raw text into sentences. These serve as the
# "documents" for the n-gram examples.
sentence_list = sent_tokenize(text)
print(sentence_list)

['The class is over.', 'I hopep it is intersting to you.', 'Please let me knoww if not.']


In [11]:
# Apply stop-word/punctuation removal and typo correction sentence by sentence.
# The exclusion set is built once, not re-concatenated for every token.
excluded_tokens = set(stopword_list) | set(punctuation_list)
correct_sentence_list = []
for sent in sentence_list:
    kept_words = [wd for wd in word_tokenize(sent.lower()) if wd not in excluded_tokens]
    # correction() can return None when it has no candidate (recent
    # pyspellchecker releases); keep the original word so ' '.join() never
    # receives None and raises TypeError.
    correct_sentence_list.append(' '.join(spell.correction(wd) or wd for wd in kept_words))
   

In [12]:
# Inspect the cleaned sentences: one space-joined string per input sentence.
correct_sentence_list

['class', 'hope interesting', 'please let know']

In [13]:
# Unigram bag-of-words counts: one row per cleaned sentence, one column per word.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(correct_sentence_list)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,class,hope,interesting,know,let,please
0,1,0,0,0,0,0
1,0,1,1,0,0,0
2,0,0,0,1,1,1


In [14]:
# Tf-idf weighting of the unigram representation of the cleaned sentences.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(correct_sentence_list)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Round for display only; `df` keeps the full-precision weights.
df.round(2)

Unnamed: 0,class,hope,interesting,know,let,please
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.71,0.71,0.0,0.0,0.0
2,0.0,0.0,0.0,0.58,0.58,0.58


In [15]:
# Bigram counts: one row per sentence, one column per pair of adjacent words.
# NOTE(review): this cell vectorizes the raw `sentence_list` (typos and stop
# words included), whereas the unigram cells use `correct_sentence_list`.
# Confirm this is intended.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform(sentence_list)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,class is,hopep it,if not,intersting to,is intersting,is over,it is,knoww if,let me,me knoww,please let,the class,to you
0,1,0,0,0,0,1,0,0,0,0,0,1,0
1,0,1,0,1,1,0,1,0,0,0,0,0,1
2,0,0,1,0,0,0,0,1,1,1,1,0,0


In [16]:
# word vectors
import gensim.downloader as api
from scipy.spatial.distance import cosine

In [17]:
# Load the pretrained 100-dimensional word vectors trained on Twitter data
# (gensim downloader entry "glove-twitter-100").
model = api.load("glove-twitter-100")

In [18]:
# Look up the embedding vector of the word 'cat'.
model.get_vector('cat')

array([ 0.38446  , -0.45507  ,  0.45351  ,  0.4301   , -0.050908 ,
       -0.26414  ,  0.43253  , -0.3166   ,  0.32214  ,  0.0064333,
       -0.47066  ,  0.95335  , -3.2063   ,  0.010913 , -0.27565  ,
        1.1732   ,  0.52033  , -0.045973 ,  0.094254 , -0.53846  ,
        0.0035668,  0.11934  , -0.17815  , -0.58093  ,  0.65081  ,
       -0.48746  , -0.50961  ,  0.42771  , -0.30638  ,  0.32385  ,
        0.33687  , -0.1717   , -0.39104  , -0.19038  ,  0.37016  ,
       -0.50396  ,  0.041969 , -0.20517  ,  0.3223   ,  0.41217  ,
       -0.42191  , -0.26359  , -0.1773   , -0.35658  ,  0.52145  ,
        0.57282  ,  0.60204  ,  0.74369  ,  0.33377  , -0.45041  ,
        0.015978 , -0.12575  ,  0.29786  , -0.77635  ,  0.23759  ,
        0.63821  ,  0.63726  ,  1.0079   ,  0.13714  , -0.031928 ,
       -0.21299  ,  0.52348  ,  0.67934  , -0.1427   , -0.64236  ,
       -0.47996  , -0.87915  ,  0.17501  ,  0.64517  ,  0.3778   ,
        0.53493  , -0.29723  , -0.25206  , -0.757    ,  0.3364

In [19]:
# List the words most similar to 'cat', each paired with its similarity score.
model.most_similar('cat')

[('dog', 0.8752089142799377),
 ('kitty', 0.8015091419219971),
 ('pet', 0.7986467480659485),
 ('cats', 0.797942578792572),
 ('kitten', 0.7936834096908569),
 ('puppy', 0.7702749967575073),
 ('monkey', 0.758426308631897),
 ('bear', 0.7507944107055664),
 ('dogs', 0.7460063099861145),
 ('pig', 0.7117345333099365)]

In [20]:
# Cosine similarity between 'cat' and 'tiger'.
# scipy's cosine() returns a distance, so similarity = 1 - distance.
cat_vec = model.get_vector('cat')
tiger_vec = model.get_vector('tiger')
1 - cosine(cat_vec, tiger_vec)

0.6474888920783997

In [21]:
# Cosine similarity between 'cat' and 'kitten'.
# scipy's cosine() returns a distance, so similarity = 1 - distance.
cat_vec = model.get_vector('cat')
kitten_vec = model.get_vector('kitten')
1 - cosine(cat_vec, kitten_vec)

0.7936834692955017

In [22]:
# Cosine similarity between 'cat' and 'car'.
# scipy's cosine() returns a distance, so similarity = 1 - distance.
cat_vec = model.get_vector('cat')
car_vec = model.get_vector('car')
1 - cosine(cat_vec, car_vec)

0.5291033983230591