## BOOK NAME
#### DEEP LEARNING FOR NATURAL LANGUAGE PROCESSING - CREATING NEURAL NETWORKS WITH PYTHON

#### TEXT SEARCH USING REGULAR EXPRESSIONS

In [2]:
import re

In [10]:
words = ['very', 'nice', 'lecture', 'today']
sentence = '|'.join(words)

In [11]:
sentence

'very|nice|lecture|today'

In [12]:
re.findall(sentence, 'i attended a very nice lecture last year', re.M)

['very', 'nice', 'lecture']

#### PREPROCESSING THE TEXT

In [13]:
sentence = 'John has been selected for the trial phase this time. Congrats!!'

In [14]:
print(sentence.lower())

john has been selected for the trial phase this time. congrats!!


In [15]:
#we can define positive and negative words separately
positive_words = ['awesome','good', 'nice', 'super', 'fun', 'delightful','congrats']
negative_words = ['awful','lame','horrible','bad']

In [16]:
sentence = sentence.replace('!', " ")

In [17]:
print(sentence)

John has been selected for the trial phase this time. Congrats  


In [22]:
words = sentence.lower().split()

In [23]:
print(words)

['john', 'has', 'been', 'selected', 'for', 'the', 'trial', 'phase', 'this', 'time.', 'congrats']


In [24]:
print(set(words) - set(positive_words))

{'the', 'has', 'trial', 'for', 'john', 'been', 'this', 'time.', 'selected', 'phase'}


#### ACCESSING TEXT FORM WEB

In [25]:
import urllib3
from bs4 import BeautifulSoup

In [31]:
pool_object = urllib3.PoolManager()
response = pool_object.request('GET', 'http://www.gutenberg.org/files/2554/2554-h/2554-h.htm#link2HCH0008')
html_txt = BeautifulSoup(response, 'lxml')
print(html_txt)




#### COUNT VECTORIZATION

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
texts=["Ramiess sings classic songs and sings","he listens to old pop songs he ",
"and rock music", ' and also listens to classical songs']

In [41]:
cv = CountVectorizer()

In [42]:
cv_fit = cv.fit_transform(texts)

In [62]:
# gives all the different words in the vocabulary
print(cv.get_feature_names())

['also', 'and', 'classic', 'classical', 'he', 'listens', 'music', 'old', 'pop', 'ramiess', 'rock', 'sings', 'songs', 'to']


In [63]:
print(cv_fit.toarray())

[[0 1 1 0 0 0 0 0 0 1 0 2 1 0]
 [0 0 0 0 2 1 0 1 1 0 0 0 1 1]
 [0 1 0 0 0 0 1 0 0 0 1 0 0 0]
 [1 1 0 1 0 1 0 0 0 0 0 0 1 1]]


### TF-IDF

** The term frequency is the ratio of the count of a particular word to the total number of words in the document. For example, a word 'happy' appears 5 times in a document containing 100 words, hence the term frequency for the word 'happy' would be 5/100 = 0.05.**

** The term IDF is called as Inverse Document Frequency, and it is the log ratio of the total number of documents to the number of documents containing the word 'happy'. For example, suppose the total number of documents is 10 million and out of 10 million only 1000 documents contain the word 'happy', hence the IDF of the word 'happy' would be log(10,000,000/1000) = 4**

** Therefore the TF-IDF of the word 'happy' would be 0.05 * 4 = 0.20 **

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
texts=["Ramiess sings classic songs","he listens to old pop","and rock music", ' and also listens to classical songs']

In [66]:
vect = TfidfVectorizer()

In [67]:
x = vect.fit_transform(texts)

In [69]:
x.todense()

matrix([[0.        , 0.        , 0.52547275, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.52547275,
         0.        , 0.52547275, 0.41428875, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.48546061,
         0.38274272, 0.        , 0.48546061, 0.48546061, 0.        ,
         0.        , 0.        , 0.        , 0.38274272],
        [0.        , 0.48693426, 0.        , 0.        , 0.        ,
         0.        , 0.61761437, 0.        , 0.        , 0.        ,
         0.61761437, 0.        , 0.        , 0.        ],
        [0.47212003, 0.37222485, 0.        , 0.47212003, 0.        ,
         0.37222485, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.37222485, 0.37222485]])

In [70]:
vect.get_feature_names()

['also',
 'and',
 'classic',
 'classical',
 'he',
 'listens',
 'music',
 'old',
 'pop',
 'ramiess',
 'rock',
 'sings',
 'songs',
 'to']

### TEXT CLASSIFIER

** Classification of text into positive and negative **

In [71]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

In [72]:
data = [
('I love my country.', 'pos'),
('This is an amazing place!', 'pos'),
('I do not like the smell of this place.', 'neg'),
('I do not like this restaurant', 'neg'),
('I am tired of hearing your nonsense.', 'neg'),
("I always aspire to be like him", 'pos'),
("It's a horrible performance.", "neg")
]

In [74]:
nb = NaiveBayesClassifier(data)

In [76]:
nb.classify('I love this place')

'pos'

In [77]:
nb.classify('I hate this place')

'neg'

In [80]:
nb.classify('You are horrible')

'neg'

### WORD VECTOR REPRESENTATION