# Natural Language Processing (NLP) in Python - From Zero to Hero

## Imports

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re

# scikit-learn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

## Bag of Words

In [2]:
class Category:
    """String labels used as classification targets for the training phrases."""
    # Label assigned to phrases about books.
    BOOKS = 'BOOKS'
    # Label assigned to phrases about clothing.
    CLOTHING = 'CLOTHING'

# Each training phrase is kept beside its label so the two lists cannot drift apart.
_labelled_phrases = [
    ('i love the book', Category.BOOKS),
    ('this is a great book', Category.BOOKS),
    ('the fit is great', Category.CLOTHING),
    ('i love the shoes', Category.CLOTHING),
]

# Parallel lists: train_x[i] is the text, train_y[i] is its category label.
train_x = [phrase for phrase, _ in _labelled_phrases]
train_y = [label for _, label in _labelled_phrases]

In [7]:
vectorizer = CountVectorizer(binary = True)
train_x_vectors = vectorizer.fit_transform(train_x)

# Vectorizer Data
print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())

['book', 'fit', 'great', 'is', 'love', 'shoes', 'the', 'this']
[[1 0 0 0 1 0 1 0]
 [1 0 1 1 0 0 0 1]
 [0 1 1 1 0 0 1 0]
 [0 0 0 0 1 1 1 0]]


In [8]:
# Train a linear-kernel SVM on the bag-of-words vectors.
# fit() returns the estimator itself, so construction and training are chained.
clf_svc = SVC(kernel='linear').fit(train_x_vectors, train_y)

# Encode the unseen phrase with the already-fitted vocabulary
# (transform, not fit_transform, so the columns line up with training).
test_x_vectors = vectorizer.transform(['i like the book'])

# Predicted category for the test phrase.
clf_svc.predict(test_x_vectors)

array(['BOOKS'], dtype='<U8')

## Word Vector

In [10]:
# Load spaCy's 'en_core_web_md' pipeline. The model package must already be
# installed (e.g. `python -m spacy download en_core_web_md`).
# NOTE(review): presumably the md model is used because it ships word vectors — confirm.
nlp = spacy.load('en_core_web_md')

In [18]:
# Show the raw phrases, then run each one through the spaCy pipeline.
print(train_x)
# One processed spaCy object per training phrase, in the same order as train_x.
docs = list(map(nlp, train_x))

['i love the book', 'this is a great book', 'the fit is great', 'i love the shoes']


In [19]:
# Inspect the vector spaCy assigns to the first training phrase.
# NOTE(review): per spaCy docs, Doc.vector defaults to the average of the
# token vectors — confirm for this model.
docs[0].vector

array([ 0.08563001,  0.313255  , -0.2392405 , -0.17215225,  0.1418515 ,
        0.1970548 ,  0.04868999, -0.12744625,  0.05947001,  2.1347    ,
       -0.61964   ,  0.01162549,  0.29980502, -0.125354  ,  0.017935  ,
       -0.1355105 , -0.27094752,  1.1129825 , -0.16986902, -0.0266875 ,
        0.14768225, -0.16372526,  0.121907  , -0.06876825, -0.061945  ,
        0.08704174, -0.2005705 , -0.24039775, -0.0675595 ,  0.0926495 ,
       -0.13526568,  0.24121101, -0.20299   ,  0.30007   ,  0.11574501,
        0.055062  ,  0.013516  , -0.0664179 , -0.3380587 , -0.17823698,
       -0.01039225,  0.03333575, -0.10241525, -0.093445  ,  0.09327275,
        0.20661727, -0.15074751,  0.14018372,  0.23520125, -0.05192125,
       -0.0999365 , -0.1212635 , -0.05895525, -0.005062  ,  0.06003174,
        0.01213001, -0.11257375, -0.24570274,  0.00678   , -0.1888345 ,
       -0.09276348, -0.25614128, -0.20717824,  0.0858725 , -0.02215025,
       -0.303222  , -0.00274375,  0.11888   ,  0.02695867,  0.20

In [31]:
# Train a second linear SVM, this time on spaCy vectors instead of bag-of-words.

# One vector per training phrase, in the same order as train_y.
train_x_wv = [doc.vector for doc in docs]

# fit() returns the estimator itself, so construction and training are chained.
clf_svc_wv = SVC(kernel='linear').fit(train_x_wv, train_y)

# Unseen phrases, embedded the same way as the training data.
# NOTE(review): 'earings' looks like a misspelling of 'earrings'; left unchanged
# because editing the text would change its vector and possibly the prediction.
test_x = ['i love the books', 'i love the story', 'i love the hats', 'these earings hurt']
test_x_docs = [nlp(phrase) for phrase in test_x]
# NOTE(review): `test_x_vw` looks like a transposition of `_wv` (cf. `train_x_wv`);
# the name is kept as-is in case other cells reference it.
test_x_vw = [doc.vector for doc in test_x_docs]

# Predicted category for each test phrase.
clf_svc_wv.predict(test_x_vw)

array(['BOOKS', 'BOOKS', 'CLOTHING', 'CLOTHING'], dtype='<U8')

## Regex

In [54]:
# Pattern: 'ab', then any run of non-whitespace characters, then 'cd'.
# The match may sit anywhere in the phrase but cannot span a space.
regex = re.compile(r'ab[^\s]*cd')

phrases = ['abcd', 'xxx', 'aa abxxxcd ccc', 'ab x x xxcd']

# Keep the phrases containing a match. The pattern is already compiled, so call
# its own .search() rather than routing it back through re.search().
matches = [phrase for phrase in phrases if regex.search(phrase)]

matches

['abcd', 'aa abxxxcd ccc']