# Basics of NLP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Tokenization : Example

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/gshyam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
text = "This is Andrew's text, isn't it?"
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['This', 'is', "Andrew's", 'text,', "isn't", 'it?']

In [4]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'Andrew', "'s", 'text', ',', 'is', "n't", 'it', '?']

In [5]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'Andrew', "'", 's', 'text', ',', 'isn', "'", 't', 'it', '?']

# Stemming : Example

In [6]:
text = "feet wolves cats talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
print (tokens)

['feet', 'wolves', 'cats', 'talked']


In [7]:
stemmer = nltk.stem.PorterStemmer()
" ".join(stemmer.stem(token) for token in tokens)

'feet wolv cat talk'

In [8]:
stemmer = nltk.stem.WordNetLemmatizer()
" ".join(stemmer.lemmatize(token) for token in tokens)

'foot wolf cat talked'

# Tf-Idf : Example

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
texts = [ "good movie", "not a good movie", "did not like", "i like it", "good one" ]
# using default tokenizer in TfidfVectorizer
tfidf= TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))

features = tfidf.fit_transform(texts)

print ('features:\n',features)
pd.DataFrame( features.todense(), columns=tfidf.get_feature_names() )

features:
   (0, 0)	0.7071067811865476
  (0, 2)	0.7071067811865476
  (1, 3)	0.5773502691896257
  (1, 0)	0.5773502691896257
  (1, 2)	0.5773502691896257
  (2, 1)	0.7071067811865476
  (2, 3)	0.7071067811865476
  (3, 1)	1.0


Unnamed: 0,good movie,like,movie,not
0,0.707107,0.0,0.707107,0.0
1,0.57735,0.0,0.57735,0.57735
2,0.0,0.707107,0.0,0.707107
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0


## Abstract Syntax Trees (AST) : Examples

In [10]:
import ast
str_dict1 = "{'a':1, 'b':2}"; dict1 = ast.literal_eval(str_dict1)
str_lst1 = "[1,2,3,4]"; lst1=ast.literal_eval(str_lst1)
str_lst2 = "['php', 'mysql']"; lst2=ast.literal_eval(str_lst2)

#Now we can treat lst1 and lst2 as regular lists and dict1 as python dictionary
print (len(lst1), len(lst2) )
print (dict1.keys() )

4 2
dict_keys(['a', 'b'])


# Counter : Example

In [11]:
from collections import Counter

strr= 'abcdeabcdabcaba'
c = Counter(strr)  # count elements from a string

print ('original string:', strr)
print  ( 'most_common:', c.most_common(3) )
print  ( 'sorted:', sorted(c) )
print ( 'list elements with repetitions:', ''.join(sorted(c.elements())) )
print ()

lst=['aaa', 'abc', 'def', 'aaa', 'abc', 'aaa', 'def', 'aa']
c = Counter(lst)
print ('original list:', lst)
print  ( 'most_common:', c.most_common(3) )
print  ( 'sorted:', sorted(c) )
print ( 'list elements with repetitions:', ''.join(sorted(c.elements())) )


original string: abcdeabcdabcaba
most_common: [('a', 5), ('b', 4), ('c', 3)]
sorted: ['a', 'b', 'c', 'd', 'e']
list elements with repetitions: aaaaabbbbcccdde

original list: ['aaa', 'abc', 'def', 'aaa', 'abc', 'aaa', 'def', 'aa']
most_common: [('aaa', 3), ('abc', 2), ('def', 2)]
sorted: ['aa', 'aaa', 'abc', 'def']
list elements with repetitions: aaaaaaaaaaaabcabcdefdef


# Sort : Example

In [12]:
py_set = {'e', 'a', 'u', 'o', 'i'}
print(sorted(py_set, reverse=True))

py_dict = {'e': 1, 'a': 2, 'u': 3, 'o': 4, 'i': 5}
print(sorted(py_dict, reverse=True))

frozen_set = frozenset(('e', 'a', 'u', 'o', 'i'))
print(sorted(frozen_set, reverse=True))


['u', 'o', 'i', 'e', 'a']
['u', 'o', 'i', 'e', 'a']
['u', 'o', 'i', 'e', 'a']


In [13]:
def take_first(x): return x[0]
def take_second(x): return x[1]

# random list
random = [(2, 2), (3, 4), (4, 1), (1, 3)]
print('Sorted with first element :', sorted(random, key=take_first) )
print('Sorted with second element:', sorted(random, key=take_second) )

Sorted with first element : [(1, 3), (2, 2), (3, 4), (4, 1)]
Sorted with second element: [(4, 1), (2, 2), (1, 3), (3, 4)]


In [14]:
# List elements: (Student's Name, Marks out of 100 , Age)
participant_list = [ ('Alison', 50, 18), ('Terence', 75, 12), ('David', 75, 20), ('Jimmy', 90, 22),('Jak', 50, 17) ]
def sorter(item):
    # Since highest marks first, least error = most marks
    error = 100 - item[1]
    age = item[2]
    return (error, age)

sorted_list = sorted(participant_list, key=sorter)
print(sorted_list)

[('Jimmy', 90, 22), ('Terence', 75, 12), ('David', 75, 20), ('Jak', 50, 17), ('Alison', 50, 18)]


In [15]:
py_dict = {'a': 5, 'b': 1, 'c': 4, 'd': 6, 'e': 2}

sorted(py_dict.items(), key=lambda x: x[1], reverse=True)[:3]

[('d', 6), ('a', 5), ('c', 4)]

# Sparce matrix from SciPy : Example

     csr_matrix: Compressed Sparse Row format

In [16]:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
A = csr_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5]])
v = np.array([1, 0, -1])
A.dot(v)

AA = lil_matrix((5,6))
AA

<5x6 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in List of Lists format>

In [17]:
from scipy import sparse as sp_sparse
from numpy.random import rand

A = sp_sparse.lil_matrix((1000, 1000))
A[0, :100] = rand(100)
A[1, 100:200] = A[0, :100]
A.setdiag(rand(1000))

In [18]:
A = A.tocsr()
b = rand(1000)
x = sp_sparse.linalg.spsolve(A, b)

# TF-IDF

**short for Term Frequency–Inverse Document Frequency**

In the following example of a simple text

A vocabulary of 8 words is learned from the documents and each word is assigned a unique integer index in the output vector.

The inverse document frequencies are calculated for each word in the vocabulary, assigning the lowest score of 1.0 to the most frequently observed word: “the” at index 7.

Finally, the first document is encoded as an 8-element sparse array and we can review the final scorings of each word with different values for “the“, “fox“, and “dog” from the other words in the vocabulary.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print('\nvectorizer.vocabulary_:',vectorizer.vocabulary_)
print('\nvectorizer.idf_:',vectorizer.idf_)
# encode document
vector = vectorizer.transform([text[0]])
# summarize encoded vector
print('\nvector.shape:',vector.shape)
print('\nvector.toarray:',vector.toarray())


vectorizer.vocabulary_: {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}

vectorizer.idf_: [1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]

vector.shape: (1, 8)

vector.toarray: [[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
vectorizer = CountVectorizer() # create the transform
vectorizer.fit(text) # tokenize and build vocab

print('\nvectorizer.vocabulary_:',vectorizer.vocabulary_) # summarize
vector = vectorizer.transform(text) # encode document

print('\nvector.shape:',vector.shape)
print('\nvector.toarray:',vector.toarray())
print('\ntype(vector):', type(vector))
X = vectorizer.fit_transform(text)


vectorizer.vocabulary_: {'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}

vector.shape: (1, 8)

vector.toarray: [[1 1 1 1 1 1 1 2]]

type(vector): <class 'scipy.sparse.csr.csr_matrix'>


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print (vectorizer.get_feature_names())
print(X.shape)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)


# MultiLabelBinarizer

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
print ( mlb.fit_transform([(1, 2), (3,)]) )
#array([[1, 1, 0],
#       [0, 0, 1]])
print ( mlb.classes_ )
#array([1, 2, 3])

[[1 1 0]
 [0 0 1]]
[1 2 3]


In [23]:
mlb = MultiLabelBinarizer()
mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
#array([[0, 1, 1],
#       [1, 0, 0]])
print ( list(mlb.classes_) )
#['comedy', 'sci-fi', 'thriller']

['comedy', 'sci-fi', 'thriller']


## Preparing the data

In [24]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gshyam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
    text: a string
    return: modified initial string
    """

    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ',text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwors from text
    return text

### Transforming text to a vector

Machine Learning algorithms work with numeric data and we cannot use the provided text data "as is". There are many ways to transform text data to numeric vectors. In this task you will try to use two of them.

#### Bag of words

One of the well-known approaches is a *bag-of-words* representation. To create this transformation, follow the steps:
1. Find *N* most popular words in train corpus and numerate them. Now we have a dictionary of the most popular words.
2. For each title in the corpora create a zero vector with the dimension equals to *N*.
3. For each text in the corpora iterate over words which are in the dictionary and increase by 1 the corresponding coordinate.

Let's try to do it for a toy example. Imagine that we have *N* = 4 and the list of the most popular words is 

    ['hi', 'you', 'me', 'are']

Then we need to numerate them, for example, like this: 

    {'hi': 0, 'you': 1, 'me': 2, 'are': 3}

And we have the text, which we want to transform to the vector:

    'hi how are you'

For this text we create a corresponding zero vector 

    [0, 0, 0, 0]
    
And iterate over all words, and if the word is in the dictionary, we increase the value of the corresponding position in the vector:

    'hi':  [1, 0, 0, 0]
    'how': [1, 0, 0, 0] # word 'how' is not in our dictionary
    'are': [1, 0, 0, 1]
    'you': [1, 1, 0, 1]

The resulting vector will be 

    [1, 1, 0, 1]
   
Implement the described encoding in the function *my_bag_of_words* with the size of the dictionary equals to 5000. To find the most common words use train data. You can test your code using the function *test_my_bag_of_words*.

In [27]:
def my_bag_of_words(text, words_to_index, dict_size):
    result_vec = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vec[words_to_index[word]] +=1
    return result_vec
        

In [28]:
# test my bag of words
mytext = ['hi how are you']
words_to_index = {'hi': 0, 'you': 1, 'me': 2, 'are': 3} # these are the most common words already found
ans = [1, 1, 0, 1]

for i, text in enumerate(mytext):
    vec = my_bag_of_words(text, words_to_index, 4)
    print ('obtained vector:', vec)
    print ('correct ansswer:', ans)
    print ( 'The two are equal (T/F):',(vec==ans).any() )
    
    

obtained vector: [1. 1. 0. 1.]
correct ansswer: [1, 1, 0, 1]
The two are equal (T/F): True
