In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Representing Text as Vectors

_The starting point of NLP is to encode text into numerical features that can be consumed by statistical models._

In [2]:
# Free-text movie review kept as a single multi-line string; it is reused later
# in the notebook as the corpus for the n-gram vectorization examples.
raw_text = '''I consider John Woo to be one of the greatest action movie directors in the world,and "Face/Off" proves just that. 
However,the element that makes this film one of the most intense and spectacular action movies to ever come from 
Hollywood is the presence of two tremendously talented actors,both of whom are my favorites.
Yes, when you have John Travolta and Nicholas Cage together in a movie ,the results are bound to be over the top.
John Travolta plays Sean Archer,a dedicated FBI agent who survived a murder attempt by notorious criminal
Castor Troy six years ago. However,tragically his son died instead of him and since than Archer is relentlessly
chasing Troy.Nicholas Cage plays Troy,and the film begins with a spectacular chase where we see the sheer intensity
generated by both these great actors,and you realize that this is not going to be just another action flick.'''

#### _How do you transform text into standardized vectors?_


In [3]:
# Toy design matrix: four samples, each described by three numeric features.
X = [[0, 1, 1.3], [0, 0, 0.2], [1, 0, 3.2], [1, 1, 2.1]]

# One binary label per row of X.
y = [1, 1, 0, 1]

# Statistical models consume fixed-length numeric vectors like the rows above;
# the rest of the notebook is about turning text into that shape.
LogisticRegression().fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Sample Problem

#### How do you build a classifier to determine if the sentiment of a movie review is positive or negative?

### Bag of words (BoW) Model

Sentences can be transformed into vectors by:

1) building a dictionary of words/tokens based on your data

2) transforming sentences into numerical representations of word occurrences

In [4]:
# Minimal four-document corpus: every review shares the token "movie" and
# differs only in its sentiment-bearing adjective.
sentences = [
    "great movie",
    "good movie",
    "bad movie",
    "awful movie",
]

In [5]:
# CountVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.
cv = CountVectorizer()

# .fit() builds the token dictionary (vocabulary) from the training data.
cv.fit(sentences)

# .transform() converts sentences into count vectors based on that dictionary.
word_sentence_matrix = cv.transform(sentences)

# Label each column of the count matrix with its token for display.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions use cv.get_feature_names_out() instead.
sentences_df = pd.DataFrame(word_sentence_matrix.toarray(), columns=cv.get_feature_names())
sentences_df

Unnamed: 0,awful,bad,good,great,movie
0,0,0,0,1,1
1,0,0,1,0,1
2,0,1,0,0,1
3,1,0,0,0,1


In [6]:
# Vocabulary learned by the vectorizer; list order matches the matrix columns.
print(f"tokens: {cv.get_feature_names()}")

# Dense view of the sentence-by-token count matrix.
word_sentence_matrix.toarray()

tokens: ['awful', 'bad', 'good', 'great', 'movie']


array([[0, 0, 0, 1, 1],
       [0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1],
       [1, 0, 0, 0, 1]])

### Learning token weights

In [7]:
# Training corpus: one short review per entry.
sentences = ["great movie", "good movie", "bad movie", "awful movie"]

# Binary sentiment labels aligned with `sentences`:
#   1 -> positive
#   0 -> negative
sentiment = [1, 1, 0, 0]

# Learn the vocabulary and vectorize the corpus in a single step.
cv = CountVectorizer()
X = cv.fit_transform(sentences)
y = sentiment

# Show the feature matrix and the labels, separated by one blank line.
print('X:\n', X.toarray(), end='\n\n')
print('y:\n', y)


X:
 [[0 0 0 1 1]
 [0 0 1 0 1]
 [0 1 0 0 1]
 [1 0 0 0 1]]

y:
 [1, 1, 0, 0]


In [8]:
# Fit a logistic regression on the bag-of-words counts.
lr_bow_classifier = LogisticRegression()
lr_bow_classifier.fit(X, y)

# Pair every token with its learned weight and list them from most positive
# to most negative.
sorted(
    zip(cv.get_feature_names(), lr_bow_classifier.coef_[0]),
    key=lambda token_weight: -token_weight[1],
)

[('good', 0.4010569669628933),
 ('great', 0.4010569669628933),
 ('movie', 0.0),
 ('awful', -0.4010569669628933),
 ('bad', -0.4010569669628933)]

In [9]:
# "it" and "was" are not in the learned vocabulary, so only "great" contributes
# to the prediction.
lr_bow_classifier.predict(cv.transform(["it was great"]))

array([1])

In [10]:
# Same probe with a negative adjective: only "bad" is a known token here.
lr_bow_classifier.predict(cv.transform(["it was bad"]))

array([0])

### Additional Vectorization Examples

## Text Pre-processing Methods

Reducing noise in text

In [11]:
# Three negative reviews followed by three positive ones; the filler words
# ("it", "was", "the", ...) are deliberately left in.
sentences = [
    "it was a bad movie", "it was a okay movie", "it was an awful movie",
    "it was a good movie", "i like the movie", "i liked this movie okay",
]
sentiment = [0, 0, 0, 1, 1, 1]

# Build the vocabulary, then vectorize the same sentences.
cv = CountVectorizer().fit(sentences)
word_sentence_matrix = cv.transform(sentences)

# Label the count matrix with its tokens for display.
sentences_df = pd.DataFrame(
    data=word_sentence_matrix.toarray(),
    columns=cv.get_feature_names(),
)
sentences_df

Unnamed: 0,an,awful,bad,good,it,like,liked,movie,okay,the,this,was
0,0,0,1,0,1,0,0,1,0,0,0,1
1,0,0,0,0,1,0,0,1,1,0,0,1
2,1,1,0,0,1,0,0,1,0,0,0,1
3,0,0,0,1,1,0,0,1,0,0,0,1
4,0,0,0,0,0,1,0,1,0,1,0,0
5,0,0,0,0,0,0,1,1,1,0,1,0


In [12]:
# Re-vectorize the noisy corpus (stop words are still included at this point).
cv = CountVectorizer()
X = cv.fit_transform(sentences)
y = sentiment

# L1 penalty with a large C (i.e. weak regularisation) lets the model latch on
# to whatever tokens separate this tiny training set, including stop words.
# random_state pins the data shuffling done by the liblinear solver so the
# learned weights are reproducible on re-run, consistent with the other
# classifier cells in this notebook that pass random_state=0.
lr_bow_classifier = LogisticRegression(
    penalty='l1',
    C=1000,
    solver='liblinear',
    random_state=0,
)
lr_bow_classifier.fit(X, y)

# Token weights, from most positive to most negative.
sorted(list(zip(cv.get_feature_names(), lr_bow_classifier.coef_[0])), key=lambda x: -x[1])

[('good', 14.119732285923776),
 ('this', 5.41464426861084),
 ('the', 5.168892533138319),
 ('like', 1.737862245510235),
 ('liked', 1.4921105100377128),
 ('an', 0.0),
 ('awful', 0.0),
 ('bad', 0.0),
 ('movie', 0.0),
 ('okay', 0.0),
 ('it', -3.2596939285842645),
 ('was', -4.017981926568795)]

In [13]:
# Class probabilities ([negative, positive]) for a document containing only
# the stop word "the".
lr_bow_classifier.predict_proba(cv.transform(["the"]).toarray())

array([[0.00565867, 0.99434133]])

In [14]:
# Class probabilities ([negative, positive]) for a document containing only
# the stop word "it".
lr_bow_classifier.predict_proba(cv.transform(["it"]).toarray())

array([[0.96301989, 0.03698011]])

In [15]:
# Class probabilities ([negative, positive]) for the genuinely negative word
# "bad", which received a zero weight in the fit above.
lr_bow_classifier.predict_proba(cv.transform(["bad"]).toarray())

array([[0.5, 0.5]])

#### Keeping stopwords can add noise to models:

- this model has learned that "the" is a positive sentiment term and "it" is a negative sentiment term
- Due to some of these confusions, it has also learned weights suggesting that "bad" is neither negative nor positive

### Pre-processing methods: stop words

_Remove noise from your data_

In [16]:
# Drop a hand-picked list of uninformative tokens before building the vocabulary.
# Many libraries come with standard stop words baked in, e.g.:
#     cv = CountVectorizer(stop_words='english')
cv = CountVectorizer(stop_words=['it', 'an', 'was', 'this'])

X = cv.fit_transform(sentences)
y = sentiment

lr_bow_classifier = LogisticRegression(random_state=0).fit(X, y)

# Token weights rounded to three decimals, from most positive to most negative.
sorted(
    zip(cv.get_feature_names(), np.round(lr_bow_classifier.coef_[0], 3)),
    key=lambda token_weight: -token_weight[1],
)

[('liked', 0.432),
 ('good', 0.426),
 ('like', 0.357),
 ('the', 0.357),
 ('movie', 0.0),
 ('okay', -0.03),
 ('awful', -0.377),
 ('bad', -0.377)]

### Pre-processing: Token Frequency Filters

In [17]:
sentences = [
    "it was a good movie",
    "it was a very good movie",
    "it was an bad movie",
    "it was a bad movie",
    "it was a terrible movie",
]

# min_df is an absolute threshold (integer), max_df a ratio threshold (float);
# English stop words are removed as well.
cv = CountVectorizer(stop_words='english', min_df=2, max_df=0.9).fit(sentences)
word_sentence_matrix = cv.transform(sentences)

# Only the tokens that survive the frequency filters remain as columns.
sentences_df = pd.DataFrame(
    data=word_sentence_matrix.toarray(),
    columns=cv.get_feature_names(),
)
sentences_df


Unnamed: 0,bad,good
0,0,1
1,0,1
2,1,0
3,1,0
4,0,0


### Pre-processing: Word Stemming

In [18]:
sentences = ["great movie", "good movie", "bad movie", "awful movie"]

# Build the vocabulary from these four training sentences only.
cv = CountVectorizer()
cv.fit(sentences)

In [19]:
# A token that is in the dictionary lights up its own column.
cv.transform(['great']).toarray()

array([[0, 0, 0, 1, 0]])

In [20]:
# Unseen tokens are ignored after the dictionary is built.
cv.transform(['greatest']).toarray()

array([[0, 0, 0, 0, 0]])

In [21]:
# Two positive reviews (using "great" / "greatest") and two negative ones.
sentences = [
    "it was a great movie",
    "it was the greatest movie",
    "it was a bad movie",
    "it was a very bad movie",
]
sentiment = [1, 1, 0, 0]

cv = CountVectorizer().fit(sentences)
word_sentence_matrix = cv.transform(sentences)

# "great" and "greatest" end up as two unrelated columns.
sentences_df = pd.DataFrame(
    data=word_sentence_matrix.toarray(),
    columns=cv.get_feature_names(),
)
sentences_df

Unnamed: 0,bad,great,greatest,it,movie,the,very,was
0,0,1,0,1,1,0,0,1
1,0,0,1,1,1,1,0,1
2,1,0,0,1,1,0,0,1
3,1,0,0,1,1,0,1,1


In [22]:
# Re-vectorize with the built-in English stop-word list.
cv = CountVectorizer(stop_words='english')
X, y = cv.fit_transform(sentences), sentiment

lr_bow_classifier = LogisticRegression(random_state=0).fit(X, y)

# Token weights rounded to three decimals, from most positive to most negative.
sorted(
    zip(cv.get_feature_names(), np.round(lr_bow_classifier.coef_[0], 3)),
    key=lambda token_weight: -token_weight[1],
)

[('great', 0.366), ('greatest', 0.366), ('movie', -0.0), ('bad', -0.732)]

In [23]:
from nltk.stem.snowball import SnowballStemmer

In [24]:
# Snowball stemmer for English; show how inflected forms of "run" are reduced.
stemmer = SnowballStemmer('english')
for token in ('run', 'running', 'runs', 'ran'):
    stemmed = stemmer.stem(token)
    print(f"{token} : {stemmed}")

run : run
running : run
runs : run
ran : ran


### N grams

Tokens as compounds of words using N-gram ranges

In [25]:
sentence = "the united states of america "

# Stem word by word to show that single-token stemming destroys the compound
# "united states". Split on any whitespace with .split() rather than
# .split(' '): the latter turns the trailing space into an empty token, which
# was being stemmed and printed as a stray blank line.
for token in sentence.split():
    print(stemmer.stem(token))

the
unit
state
of
america



In [26]:
# Illustration only (nothing is computed): the tokens an n-gram vectorizer
# would emit for "the united states of america" under two n-gram ranges.
# The trailing `None` keeps the cell from echoing the last string literal.
### n-gram range (1,2)
'''
    ['the', 'the united', 'united', 'united states', 'states', 'states of', ...]
'''

### n-gram range (1,3)
'''
    ['the', 'the united', 'the united states', 'united', 'united states', 'united states of', ...]
'''
None

In [27]:
# Echo the raw review text that the following cells vectorize.
raw_text

'I consider John Woo to be one of the greatest action movie directors in the world,and "Face/Off" proves just that. \nHowever,the element that makes this film one of the most intense and spectacular action movies to ever come from \nHollywood is the presence of two tremendously talented actors,both of whom are my favorites.\nYes, when you have John Travolta and Nicholas Cage together in a movie ,the results are bound to be over the top.\nJohn Travolta plays Sean Archer,a dedicated FBI agent who survived a murder attempt by notorious criminal\nCastor Troy six years ago. However,tragically his son died instead of him and since than Archer is relentlessly\nchasing Troy.Nicholas Cage plays Troy,and the film begins with a spectacular chase where we see the sheer intensity\ngenerated by both these great actors,and you realize that this is not going to be just another action flick.'

In [28]:
# Treat each '.'-delimited chunk of the review as one document (a crude
# sentence split) and extract 1- to 3-word n-grams after stop-word removal.
cv = CountVectorizer(stop_words='english', ngram_range=(1,3)).fit(raw_text.split('.'))
# Lists every extracted n-gram; the vocabulary grows quickly with the n-gram range.
cv.get_feature_names()

['action',
 'action flick',
 'action movie',
 'action movie directors',
 'action movies',
 'action movies come',
 'actors',
 'actors favorites',
 'actors realize',
 'actors realize going',
 'agent',
 'agent survived',
 'agent survived murder',
 'ago',
 'archer',
 'archer dedicated',
 'archer dedicated fbi',
 'archer relentlessly',
 'archer relentlessly chasing',
 'attempt',
 'attempt notorious',
 'attempt notorious criminal',
 'begins',
 'begins spectacular',
 'begins spectacular chase',
 'bound',
 'cage',
 'cage movie',
 'cage movie results',
 'cage plays',
 'cage plays troy',
 'castor',
 'castor troy',
 'castor troy years',
 'chase',
 'chase sheer',
 'chase sheer intensity',
 'chasing',
 'chasing troy',
 'come',
 'come hollywood',
 'come hollywood presence',
 'consider',
 'consider john',
 'consider john woo',
 'criminal',
 'criminal castor',
 'criminal castor troy',
 'dedicated',
 'dedicated fbi',
 'dedicated fbi agent',
 'died',
 'died instead',
 'died instead archer',
 'directors'

In [29]:
# Same n-gram extraction as above, but min_df=2 keeps only n-grams that occur
# in at least two of the '.'-delimited chunks, pruning the one-off phrases.
cv = CountVectorizer(stop_words='english', ngram_range=(1,3), min_df=2).fit(raw_text.split('.'))
cv.get_feature_names()

['action',
 'actors',
 'archer',
 'cage',
 'film',
 'john',
 'john travolta',
 'just',
 'movie',
 'nicholas',
 'nicholas cage',
 'plays',
 'spectacular',
 'travolta',
 'troy']

### How much data do you need to train an NLP model?

#### Unfortunately, there is no clear rule of thumb. Cross-validation and other evaluation methods will help you understand your use case. As a rough guide:

- Learning token weights (treating features as independent) at LEAST
    * 100 words in dictionary.. x 10 records per feature = 1,000 records
    * 1,000 words in dictionary.. x 10 records per feature = 10,000 records


- Classifying documents (treating features as dependent)
    * 1,000 words in dictionary x 100 records per feature = 100,000 records
    * 10,000 words in dictionary x 100 records per feature = 1,000,000 records
    


## Word / Document Vectors (Embeddings)

_Moving from bag of words to word vectors_

##### Tensorflow Wikipedia Word2Vec Projector

https://projector.tensorflow.org/


####    Developed by training models to predict relationships of words:
    
- "the dog chased the: {?}"
- "the dog {?} the cat"
- "the {?} chased the cat"


In [30]:
import gensim.downloader as api

# Load pre-trained word vectors by name through gensim's downloader API.
# Presumably 50-dimensional GloVe vectors, judging by the identifier and the
# 50-value vector shown for model['cat'] below; the first call likely
# downloads the data — TODO confirm.
model = api.load("glove-wiki-gigaword-50")
model
# Optional exploration: nearest neighbours of a word.
# model.most_similar("glass")

unable to import 'smart_open.gcs', disabling that module


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x11c0a7e50>

In [31]:
# Indexing the model by a word returns that word's dense embedding vector.
model['cat']

array([ 0.45281 , -0.50108 , -0.53714 , -0.015697,  0.22191 ,  0.54602 ,
       -0.67301 , -0.6891  ,  0.63493 , -0.19726 ,  0.33685 ,  0.7735  ,
        0.90094 ,  0.38488 ,  0.38367 ,  0.2657  , -0.08057 ,  0.61089 ,
       -1.2894  , -0.22313 , -0.61578 ,  0.21697 ,  0.35614 ,  0.44499 ,
        0.60885 , -1.1633  , -1.1579  ,  0.36118 ,  0.10466 , -0.78325 ,
        1.4352  ,  0.18629 , -0.26112 ,  0.83275 , -0.23123 ,  0.32481 ,
        0.14485 , -0.44552 ,  0.33497 , -0.95946 , -0.097479,  0.48138 ,
       -0.43352 ,  0.69455 ,  0.91043 , -0.28173 ,  0.41637 , -1.2609  ,
        0.71278 ,  0.23782 ], dtype=float32)

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, hence each vector is wrapped in a list.
# Related words ("cat" / "dog") should score close to 1.
cosine_similarity([model['cat']], [model['dog']])


array([[0.92180055]], dtype=float32)

In [33]:
# Contrast case: an unrelated word pair for comparison with "cat" / "dog".
cosine_similarity([model['cat']], [model['laptop']])

array([[0.29200307]], dtype=float32)

### Linear operations with Word Embeddings

In [34]:
# Vector arithmetic on embeddings: start from "king", subtract "man", add "woman".
queen = model['king'] - model['man'] + model['woman']

# Compare the synthetic vector with the actual embedding stored for "queen".
cosine_similarity([queen], [model['queen']])

array([[0.8609581]], dtype=float32)

### Using Document Embeddings in Models

In [35]:
from nltk.corpus import stopwords 

# English stop words as a set for fast membership tests.
# NOTE(review): this presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand (nltk.download('stopwords')) — confirm on a fresh environment.
stop_words = set(stopwords.words('english')) 


# Tiny training corpus: each review is a sentiment adjective plus "movie".
sentences = [
    "great movie",
    "awful movie",
    "terrible movie",
    "best movie",
]

# Binary labels aligned with `sentences`; given the adjectives, 1 marks the
# positive reviews and 0 the negative ones.
sentiments = [
    1,
    0,
    0,
    1,
]

def average_document(sentence, vectors=None, ignored_tokens=None):
    """Embed a sentence as the element-wise mean of its word vectors.

    The sentence is lower-cased and split on any whitespace. Stop words and
    tokens missing from the embedding vocabulary are skipped (the same way
    CountVectorizer ignores unseen tokens), and the remaining word vectors
    are averaged.

    Parameters
    ----------
    sentence : str
        Text to embed.
    vectors : mapping of str -> 1-D array, optional
        Word-embedding lookup supporting ``token in vectors`` and
        ``vectors[token]``. Defaults to the notebook-level ``model``.
    ignored_tokens : collection of str, optional
        Tokens to drop before averaging. Defaults to the notebook-level
        ``stop_words``.

    Returns
    -------
    numpy.ndarray
        Mean of the retained word vectors (one value per embedding dimension).

    Raises
    ------
    ValueError
        If no token is left to average (empty text, only stop words, or only
        out-of-vocabulary tokens). Previously this case silently produced NaN.
    """
    # Resolve the defaults at call time so the notebook globals can be rebound.
    if vectors is None:
        vectors = model
    if ignored_tokens is None:
        ignored_tokens = stop_words

    # Lower-casing assumes the stop-word list and the embedding vocabulary are
    # lower-cased — presumably true for the GloVe vectors used here; confirm
    # before reusing with a cased model.
    # .split() (no argument) collapses repeated/trailing whitespace so no empty
    # tokens are produced.
    tokens = [
        token
        for token in sentence.lower().split()
        if token not in ignored_tokens and token in vectors
    ]
    if not tokens:
        raise ValueError(
            "average_document: no known, non-stop-word tokens in %r" % (sentence,)
        )
    return np.average([vectors[token] for token in tokens], axis=0)

# Stack one averaged embedding per sentence into a 2-D feature matrix.
X = np.array([average_document(s) for s in sentences])
# Show it as a table: one row per sentence, one column per embedding dimension.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.140836,0.753965,-0.630695,-0.174898,0.40267,0.051885,-0.382405,-0.31263,-0.25218,0.763297,...,-0.223905,0.753775,-0.37661,-0.306865,-0.52717,0.372588,-0.001055,-0.739175,-0.368825,0.371665
1,0.106434,-0.135745,-0.425995,-0.209078,0.31976,-0.0092,-0.01637,-0.351265,-0.345415,0.914115,...,-0.49592,0.54419,-0.079615,-0.178475,-0.376856,0.300144,0.160355,-0.26854,0.018505,0.824085
2,0.320165,0.071936,-0.40742,-0.212798,0.26962,0.170055,0.274435,-0.083535,-0.16474,0.755728,...,-0.51014,0.62877,0.044895,-0.335534,-0.217745,0.375964,0.028185,-0.41934,0.09694,0.664395
3,-0.30374,0.38784,-0.27208,0.153718,0.41566,0.113419,-0.67844,-0.6528,-0.068025,1.128275,...,-0.173658,0.756765,-0.153992,-0.48838,-0.57976,0.172008,0.23368,-0.518901,-0.123241,0.879575


In [36]:
from sklearn.svm import SVC

# Train a support-vector classifier (default settings) on the averaged
# document embeddings instead of bag-of-words counts.
clf = SVC()
clf.fit(X, sentiments)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [37]:
# 'amazing' is an unseen token: it never appeared in the training sentences,
# yet its embedding still lets the classifier place the document.
clf.predict([average_document('amazing movie')])

array([1])

In [38]:
# 'worst' is an unseen token as well; same probe on the negative side.
clf.predict([average_document('worst movie')])

array([0])

## Why use embeddings?

- Word / document embeddings are richer representations of text data.

In [39]:
# Bag-of-words view of the sentence: just one count per distinct token.
CountVectorizer().fit_transform(['the cat ran over the fence']).toarray()

array([[1, 1, 1, 1, 2]])

In [40]:
# Embedding view of the same sentence: a dense vector averaged over its
# non-stop-word tokens.
average_document('the cat ran over the fence')

array([-0.05408067, -0.15016332,  0.34357   , -0.00895233,  0.08262666,
        0.24126466, -0.86868   , -0.32828102,  0.26363868, -0.62116003,
       -0.41942668, -0.15177333, -0.15515666,  0.34706464,  0.018743  ,
        0.12498334,  0.18917668,  0.19567998, -0.7870967 , -0.19576333,
       -0.01438   ,  0.14882   , -0.17090333,  0.05952001,  0.5412767 ,
       -1.3095332 ,  0.10766002,  0.55329996,  0.48070002, -0.8011033 ,
        1.8541666 , -0.17321032, -0.17284234,  0.48640335, -0.41412333,
        0.25697634,  0.24372673, -0.01935335, -0.09209368, -0.10261667,
       -0.15008901,  0.09858632, -0.19396664,  0.49168667,  0.18769   ,
       -0.4771267 ,  0.14372334, -1.06508   ,  0.29074666, -0.3884767 ],
      dtype=float32)