## There are a few different NLP libraries with python, uncomment the code below to install them

In [255]:
# ! pip install spacy
# ! python -m spacy download en
# import nltk
# nltk.download('punkt') ##this downloads the default word tokenizer
# nltk.download('stopwords') ##this downloads all stopwords
# nltk.download('popular') ##this downloads many different popular libraries 

# Outline
### 1. Tokenization (stop words, stemming, lemmatizing)
### 2. Vectorization
### 3. NLP with Machine Learning
### 4. Cosine Similarity
### Bonus: Spacy

### 1. Tokenization
### Key terminology:

When we are performing NLP analysis, our entire collection of text is referred to as a **corpus**. Every item within a corpus is referred to as a **document**. Within that document, we break down the text into individual **tokens**, which can be either sentences or words. 

In the context of a spam classification problem, your entire inbox would be the corpus, each email would be a document, and each word/sentence would be a token.

<img src = "./resources/corpus_doc_tokens.png" width="500">

In [2]:
from nltk import sent_tokenize
test_text = """This is my first sentence. This is my second sentence. Oh wow now there is a third\
 sentence. This is getting out of control!"""
sent_tokenize(test_text)

['This is my first sentence.',
 'This is my second sentence.',
 'Oh wow now there is a third sentence.',
 'This is getting out of control!']

In [3]:
##now let's tokenize by word
from nltk import word_tokenize

In [4]:
word_tokenize(test_text)

['This',
 'is',
 'my',
 'first',
 'sentence',
 '.',
 'This',
 'is',
 'my',
 'second',
 'sentence',
 '.',
 'Oh',
 'wow',
 'now',
 'there',
 'is',
 'a',
 'third',
 'sentence',
 '.',
 'This',
 'is',
 'getting',
 'out',
 'of',
 'control',
 '!']

We have capitalization, punctuation and words that are all too frequent, such as "a", "the", "two". Let's create a tokenize function that will remove punctuation as well as commonly used words. NLTK has a stopwords method built-in. 

In [6]:
from nltk.corpus import stopwords
print(stopwords.words('english')[:20])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [7]:
import string
my_stopwords = set(stopwords.words('english'))

def tokenize(document):
    tocs = word_tokenize(document.lower())
    stop_words = set(stopwords.words('english'))
    tokenized_list = [toc for toc in tocs if toc not in stop_words]
    
    punctuation = set(string.punctuation)
    no_punctuation = [word for word in tokenized_list if word not in punctuation]
    
   
    
    return no_punctuation
    
    

In [9]:
tokenize(test_text)

['first',
 'sentence',
 'second',
 'sentence',
 'oh',
 'wow',
 'third',
 'sentence',
 'getting',
 'control']

NLTK has tokenizers that have different applications.

In [12]:
from nltk import TweetTokenizer, WhitespaceTokenizer
twt = TweetTokenizer()
toks = twt.tokenize('OMG coding is #fun and #bigly cool, do you agree @professor_purple_pants?')
toks

['OMG',
 'coding',
 'is',
 '#fun',
 'and',
 '#bigly',
 'cool',
 ',',
 'do',
 'you',
 'agree',
 '@professor_purple_pants',
 '?']

### Stemming/ Lemmatization

* Stemming: reduces words by removing suffixes (often reduces to strings that are not real words)
* Lemmatization: reduces words to some root form that is still in the the English dictionary

Longer Explanation: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

Your use of stemming/lemmatization will wholly depend on the context of the problem you're trying to solve. Let's take a look at how a sample sentence might be treated differently depending on our stemming technique:

In [331]:
sample_sentence = """when data scientists are performing natural language processing analysis, they must take\
 different verb tenses and singular versus plural words into account."""

In [335]:
from nltk.stem import LancasterStemmer, SnowballStemmer, WordNetLemmatizer 
def stem_words(document,stemmer):
    toks = nltk.word_tokenize(document)
    wrd_list = []
    for word in toks:
        wrd_list.append(stemmer.stem(word))
    return " ".join(wrd_list)
    
    

In [336]:
snowball = nltk.stem.SnowballStemmer('english')

In [337]:
stem_words(sample_sentence,snowball)

'when data scientist are perform natur languag process analysi , they must take differ verb tens and singular versus plural word into account .'

In [338]:
lancaster = LancasterStemmer()
stem_words(sample_sentence,lancaster)

'when dat sci ar perform nat langu process analys , they must tak diff verb tens and singul vers plur word into account .'

In [339]:
regex_stemmer = nltk.RegexpStemmer('ing$|s$|e$|able$', min=4)
stem_words(sample_sentence,regex_stemmer)

'when data scientist are perform natural languag process analysi , they must tak different verb tense and singular versu plural word into account .'

In [222]:
def lem_words(document,lemmer):
    toks = nltk.word_tokenize(document)
    wrd_list = []
    for word in toks:
        wrd_list.append(lemmer.lemmatize(word))
    return " ".join(wrd_list)

In [340]:
lemmer = nltk.stem.WordNetLemmatizer()
lem_words(sample_sentence,lemmer)

'when data scientist are performing natural language processing analysis , they must take different verb tense and singular versus plural word into account .'

### Machine Learning models aren't able to operate on text because text means nothing to mathematical functions! We need to convert our text to a numerical form. SciKit Learn has packages that enable you to vectorize your documents

<img src = "./resources/vector_space_model.png" width = "600">


### Bag of Words Vectorization (count vectorizer)
<img style="float: left" src="./resources/bag_of_word.jpg" width="200">

A Bag of Words model is simply a collection of the count of words in each document. The order of the words is not taken into account, and neither is the frequency of words in the overall corpus. Each document will have a vector of length = total # of unique features in the corpus.


In [56]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [346]:
basic_example = ['The Data Scientist wants to train a machine to train machine learning models.']
bow_sample = CountVectorizer()
bow_sample.fit(basic_example)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [347]:
print(bow_sample.vocabulary_)

{'the': 5, 'data': 0, 'scientist': 4, 'wants': 8, 'to': 6, 'train': 7, 'machine': 2, 'learning': 1, 'models': 3}


In [348]:
print(bow_sample.get_feature_names())

['data', 'learning', 'machine', 'models', 'scientist', 'the', 'to', 'train', 'wants']


In [375]:
vector = bow_sample.transform(basic_example)
print(type(vector))
print(vector)
text_data = pd.DataFrame(vector.toarray(),columns=bow_sample.get_feature_names())
text_data

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 0)	1
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	2
  (0, 7)	2
  (0, 8)	1


Unnamed: 0,data,learning,machine,models,scientist,the,to,train,wants
0,1,1,2,1,1,1,2,2,1


In [369]:
new_data.toarray()[0].shape

(9,)

In [374]:
# We can turn another document into a vector by using the transform method
new_text = ['the data scientist plotted the residual error of her model']
new_data = bow_sample.transform(new_text)
new_count = pd.DataFrame(new_data.toarray(),columns=bow_sample.get_feature_names())
new_count

Unnamed: 0,data,learning,machine,models,scientist,the,to,train,wants
0,1,0,0,0,1,2,0,0,0


### N-grams

Often times, it might not be individual words, but rather how certain phrases that might allow us to draw the most insight. By altering the ngram_range, we are creating new feature for our model. Our vocabulary, however, will remain the same.

In [411]:
sentences = ['The Data Scientist wants to train a machine to train machine learning models.',
'the data scientist plotted the residual error of her model in her analysis',
'Her analysis was so good, she won a Kaggle competition.',
'The machine gained sentiance']
## here we are instantiating a bag of words model with ngrams ranging from single word to two words.
bigrams = CountVectorizer(stop_words='english',ngram_range=(1,2))
bigram_vector = bigrams.fit_transform(sentences)
bigrams.get_feature_names()[:10]

['analysis',
 'analysis good',
 'competition',
 'data',
 'data scientist',
 'error',
 'error model',
 'gained',
 'gained sentiance',
 'good']

In [429]:
bigram_df = pd.DataFrame(bigram_vector.toarray(),columns=bigrams.get_feature_names())
bigram_df.head()

Unnamed: 0,analysis,analysis good,competition,data,data scientist,error,error model,gained,gained sentiance,good,...,scientist,scientist plotted,scientist wants,sentiance,train,train machine,wants,wants train,won,won kaggle
0,0,0,0,1,1,0,0,0,0,0,...,1,0,1,0,2,2,1,1,0,0
1,1,0,0,1,1,1,1,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0


## TF-IDF vectorizer

* The tf-idf vectorizer takes has the ability to detect words that might be more important for our specific corpus http://scikit-learn.org/stable/modules/feature_extraction.html


* Places a higher weight on words that appear in certain documents that are infrequent in the overall corpus
<img src= "./resources/tfidf.png">


#### The more frequent a word comes up in different documents, the less weight it gets. This places more emphasis on words that are rarer in our domain and more specifically, the corpus we are training on

In [407]:
sentences = ['The Data Scientist wants to train a machine to train machine learning models.',
'the data scientist plotted the residual error of her model in her analysis',
'Her analysis was so good, she won a Kaggle competition.',
'The machine gained sentiance']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_sentences = tfidf.fit_transform(sentences)

In [408]:
data = pd.DataFrame(tfidf_sentences.toarray(), columns = tfidf.get_feature_names())

In [409]:
data.head()

Unnamed: 0,analysis,competition,data,error,gained,good,kaggle,learning,machine,model,models,plotted,residual,scientist,sentiance,train,wants,won
0,0.0,0.0,0.240692,0.0,0.0,0.0,0.0,0.305288,0.481384,0.0,0.305288,0.0,0.0,0.240692,0.0,0.610575,0.305288,0.0
1,0.325557,0.0,0.325557,0.412928,0.0,0.0,0.0,0.0,0.0,0.412928,0.0,0.412928,0.412928,0.325557,0.0,0.0,0.0,0.0
2,0.366739,0.465162,0.0,0.0,0.0,0.465162,0.465162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465162
3,0.0,0.0,0.0,0.0,0.617614,0.0,0.0,0.0,0.486934,0.0,0.0,0.0,0.0,0.0,0.617614,0.0,0.0,0.0


#### Our data is a sparse matrix, which is a matrix with far more 0 values than not 0 values

#### More about it herehttps://docs.scipy.org/doc/scipy/reference/sparse.html

#### To make better sense of it, we can put it back into a dataframe. 

#### Caution: moving from sparse matrix to array format will take much more memory to perform operations

In [315]:
text_data = pd.DataFrame(training_data.toarray(),columns=bow.get_feature_names())

Now if we want to transform a new test document, we can use the transform method that we previously used


In [262]:
X_test = bow.transform(['this is a test document','look at me I am a test document'])

In [None]:
X_test

## Basic Machine Learning NLP Pipeline Example

Now that we've gone over the basics of NLP data, we can take a look at an example of how a pipeline might work.

In [432]:
review_data = pd.read_json('./reviews_Musical_Instruments_5.json',lines=True)

In [460]:
review_data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1384719342,"[0, 0]",5,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600
1,1384719342,"[13, 14]",5,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000
2,1384719342,"[1, 1]",5,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000
3,1384719342,"[0, 0]",5,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000
4,1384719342,"[0, 0]",5,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800


In [456]:
review_data['helpful']

(10261,)

In [571]:
from sklearn.datasets import fetch_20newsgroups
cats = ['rec.sport.baseball','rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train',categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test',categories=cats)

In [572]:
print(newsgroups_train.data[0])
print(newsgroups_train.target[0])

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Products Sector
Distribution: usa
Nntp-Posting-Host: 145.1.146.35
Lines: 17

In article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:

|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.
|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?

The tribe will be in town from April 16 to the 19th.
There are ALWAYS tickets available! (Though they are playing Toronto,
and many Toronto fans make the trip to Cleveland as it is easier to
get tickets in Cleveland than in Toronto.  Either way, I seriously
doubt they will sell out until the end of the season.)

-- 
Doug Bank                       Private Systems Division
dougb@ecs.comm.mot.com          Motorola Communications Sector
dougb@nwu.edu       

In [573]:
bow = CountVectorizer(stop_words='english')
X_train = bow.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target

In [574]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
mnb = MultinomialNB()
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

##### Now that we've fit our model, we can transform out X_test into vectorized form. In order to do this, we will be using a .transform( ) method on the previously trained vectorizer model. Why don't we use a fit_transform operation?????   It's because we can only make  a vector based off of the vocabulary and features of our trained dataset. If there is a new vocabulary word in the test set that is not present in the training set, we will not gain any new information from it.

In [575]:
X_test = bow.transform(newsgroups_test.data)
y_test = newsgroups_test.target
accuracy_score(mnb.predict(X_test),y_test)

0.9748743718592965

Wow! Even without accounting for different ngrams, or removing special characters, we can classify these two types of articles very accurately. Let's take a look at our features to get a better idea of which ones were the most important in determining our prediction. Of course, we should also look at a confusion matrix to gain a better understanding of how well our model is performing.


In [569]:
### grabbing our feature names (each one of our tokenized words)
feature_names = np.array(bow.get_feature_names())

In [555]:
# we can look at the coefficients for the fitted Multinomial Naive Bayes model in order to see the coefficient values

min(mnb.coef_[0] )

-11.778469284674996

In [546]:
feature_importances = np.argsort(mnb.coef_[0])[-10:]


In [570]:
for idx in feature_importances:
    print(feature_names[idx])

1009
players
fuseholder
ordering
list
style
highlanders
capitals
taubensee
edt


#### Clearly there are some features that are indicated as significant that don't exactly make sense. The fact that the number 10 is the most distinguishing feature between the two categories indicates that there might be some numerical identifiers in each category. To help narrow down the possibilities, we can make custom tokenizers/prepocessors that we feed into our vectorizers

Learn more about adding custom tokenizers, preprocessors, and analyzers here: https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af

Learn more about selecting features in Naive Bayes text classification problems here:
https://arxiv.org/pdf/1602.02850.pdf


Try making a custom tokenizer function and use it with sklearn's vectorizor classes: 

In [None]:
def tokenizer_func():
    """Input: raw text - document to be tokenized
       Output: list - tokenized text """
    
    
    
    
    pass


count = CountVectorizer(tokenizer = tokenizer_func)

## Measuring the Similarity Between Documents

We can tell how similar two documents are to one another, normalizing for size, by taking the cosine similarity of the two. 

This number will range from [0,1], with 0 being not similar whatsoever, and 1 being the exact same. A potential application of cosine similarity is a basic recommendation engine. If you wanted to recommend articles that are most similar to other articles, you could talk the cosine similarity of all articles and return the highest one.

<img src="./resources/better_cos_similarity.png">

In [270]:
sample = CountVectorizer()
sunday_afternoon = ['I ate a burger at burger queen and it was very good.',
           'I ate a hot dog at burger prince and it was bad',
          'I drove a racecar through your kitchen door',
          'I ate a hot dog at burger king and it was bad. I ate a burger at burger queen and it was very good']

trial.fit(sunday_afternoon)
text_data = trial.transform(sunday_afternoon)

In [271]:
from sklearn.metrics.pairwise import cosine_similarity
## the 0th and 2nd index lines are very different, a number close to 0
cosine_similarity(text_data[0],text_data[2])


array([[0.]])

In [272]:
## the 0th and 3rd index lines are very similar, despite different lengths
cosine_similarity(text_data[0],text_data[3])

array([[0.91413793]])

## Bonus

### Spacy 

Spacy is a powerful, efficient NLP library that employs many deep learning techniques to create semantic meaning for different words
Spacy has features related to syntactic meaning of words

In [412]:
import spacy
nlp = spacy.load('en')

In [413]:
import pandas as pd

In [424]:
sample_sentence = """when data scientists are performing natural language processing analysis, they must take\
different verb tenses and singular versus plural words into account."""

In [425]:
tokenized = nlp(sample_sentence)

In [426]:
for word in tokenized:
    print(word, word.pos_)

when ADV
data NOUN
scientists NOUN
are VERB
performing VERB
natural ADJ
language NOUN
processing NOUN
analysis NOUN
, PUNCT
they PRON
must VERB
takedifferent VERB
verb NOUN
tenses NOUN
and CCONJ
singular ADJ
versus ADP
plural ADJ
words NOUN
into ADP
account NOUN
. PUNCT


It can also detect things such as "noun chunks" and many other parts of speech

In [427]:
for chunk in tokenized.noun_chunks:
    print(chunk)

data scientists
natural language processing analysis
they
verb tenses
plural words
account


#### Spacy has built in models that have been trained that represent different words with vectors. They are part of a larger deep learning field called word2vec.

Read more about it here: https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

In [428]:
for token in tokenized:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

when True 22.530506 True
data True 23.138157 True
scientists True 27.306108 True
are True 28.186378 True
performing True 26.581202 True
natural True 25.798756 True
language True 26.271992 True
processing True 25.600733 True
analysis True 23.92877 True
, True 21.912632 True
they True 26.637281 True
must True 27.559183 True
takedifferent True 24.669182 True
verb True 23.433626 True
tenses True 25.326244 True
and True 25.221043 True
singular True 25.690449 True
versus True 23.43405 True
plural True 26.543924 True
words True 25.54208 True
into True 24.95463 True
account True 22.755398 True
. True 26.206877 True
