In [None]:
##############################################################################
## Fundamentals for practical Text Analytics - spacy for language modeling, NER,
##                                          roll our own intent classification
##
## Learning goals:
##                 - reinforce text as a robust dataset via language modeling
##                 - python packages for handling our corpus for these specific tasks
##                 - SPACY!
##                 - POS tagging (to help with extraction/classification)
##                 - NER extraction
##                 - generalized, pre-trained word vectors for S|UML tasks (intent classification)
##############################################################################

In [None]:
# installs
! pip install newspaper3k
! pip install spacy
! pip install wordcloud
! pip install emoji
! pip install nltk
! pip install scikit-plot
! pip install umap-learn
! pip install afinn
! pip install textblob
! pip install gensim
! pip install pysrt
! pip install wikipedia

In [None]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplot

# some "fun" packages
from wordcloud import WordCloud
import emoji

import re

# text imports
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer  
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import gensim

from afinn import Afinn

from newspaper import Article

In [None]:
#######################################  Quick warmup exercise

# three short documents: one containing an email address, one a URL, one plain sentence
corpus = [
    "My email address is btibert@bu.edu",
    "I search https://www.google.com 100 times a day!",
    "The cat in the hat",
]

# EXERCISE: parse the corpus and, for every token, collect its text and whether it
# is a stopword, looks like a URL, or looks like an email address
# HINT:  refer to the token attribute documentation or inspect a test token



In [None]:
# Load the medium English model (it ships with word vectors).
# Every later cell calls `nlp(...)`, so this cell has to run: with the load left
# commented out, a fresh "Restart & Run All" fails with a NameError on `nlp`.
import spacy.cli

try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    # spacy.load raises OSError when the model package is not installed yet;
    # download it once and retry
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

In [None]:
# https://spacy.io/api/token#attributes

# one (text, is_stop, like_url, like_email) tuple per token, across all documents
token_data = [
    (tok.text, tok.is_stop, tok.like_url, tok.like_email)
    for text in corpus
    for tok in nlp(text)
]

token_data

In [None]:
# what do we have available
# doc = nlp("hockey")
# token = doc[0]

In [None]:
#######################################  Named Entity Recognition
## 
## We have seen regex can be very powerful
## not only can we tokenize data, but we COULD use it to parse patterns
##
## HOWEVER:  the spacy parsing has already trained a GENERALIZED model for us
##           lets start there! But note, based on certain tasks, spacy is near/at SOTA
## 
## https://spacy.io/usage/linguistic-features#named-entities
## 
## Why does this matter?
## - we have large corpora and want to extract the entities being discussed
## - think legal documents -  which people/organizations are involved
## - news organizations tagging/categorizing articles to compare across all articles
## - content recommendations - other texts including this entity/entities
## - customer support - which products/services our customers reference in service requests
## - medical - illnesses or diseases per medical intake forms
## - hiring/scanning: skill detection, experience detection
## 

In [None]:
# a second corpus: short sentences that mention companies, people, places and money
ner_corpus = [
    "Apple makes the iphone",
    "Google created Colab",
    "Questrom is a B-school",
    "Salesforce acquired Slack for $27 Billion dollars",
    "Mark Benioff leads Salesforce which is located in San Francisco",
    "Admithub just raised $14 million and is located in Boston",
]

ner_corpus

In [None]:
# enumerate supplies each document's position in ner_corpus, so every entity
# can be traced back to the sentence it came from

# (document index, start char, end char, entity text, entity label)
ents = [
    (doc_idx, span.start_char, span.end_char, span.text, span.label_)
    for doc_idx, text in enumerate(ner_corpus)
    for span in nlp(text).ents
]

ents

In [None]:
# Look up the plain-English description of each entity label.
# A cell only echoes its LAST bare expression, so three separate spacy.explain(...)
# lines would show just one description; a dict displays all of them together.
{label: spacy.explain(label) for label in ('ORG', 'GPE', 'PERSON')}

In [None]:
# one row per extracted entity; column order matches the tuple layout built in `ents`
ent_columns = ['index', 'start', 'end', 'textspan', 'type']
ents_df = pd.DataFrame(data=ents, columns=ent_columns)
ents_df.head(3)

In [None]:
# of course, we can visualize this.  spacy is the bees knees

from spacy import displacy

# parse the last sentence of ner_corpus, then draw it with its entities highlighted inline
admithub_doc = nlp(ner_corpus[-1])
displacy.render(admithub_doc, style="ent", jupyter=True)

# or in vs code -- localhost:5000
# appears to be a bug with the admithub parse, so beware
# displacy.serve(nlp(ner_corpus[-1]), style="ent")

In [None]:
# so why does this matter?
# a quick corpus where "go" appears once as a verb and once as a language name

corpus = ['I want to go to the store', 'I like programming in the language go']

# (entity text, entity label) for whatever entities the model finds in either sentence
goents = [(span.text, span.label_) for text in corpus for span in nlp(text).ents]

goents



In [None]:
#######################################  YOUR TURN
##
## parse the article at the URL below
## trick: consider this a document, not a corpus
##        (i.e. pass the whole article text through nlp once)
## extract the entities
## visualize

# page that the following cells download, parse and run NER on
URL = "https://www.lyrics.com/lyric/180684/Billy+Joel/We+Didn%27t+Start+the+Fire"


In [None]:
# parse the article
# newspaper workflow: build an Article for the URL, download() the page, then parse() it;
# the cell ends by displaying the extracted text via article.text
article = Article(URL)
article.download()
article.parse()
article.text



In [None]:
# extract
# treat the whole article as ONE document; `doc` is reused by the render cell below
doc = nlp(article.text)

# one (text, start char, end char, label) tuple per entity found in the lyrics
ents_song = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]

# a bare `len(ents_song)` in the middle of a cell is never displayed, so print it
print(f"{len(ents_song)} entities found")

ents_song[:10]



In [None]:
# put this into a dataframe, one row per entity found in the song
song_entities = pd.DataFrame(ents_song, columns=["text", "start", "end", "label"])

# alias under the misspelled name, so any cell that refers to `song_enttiies` still works
song_enttiies = song_entities

In [None]:
# draw the current `doc` with its entities highlighted inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

In [None]:
######## where to go from here?
##
## spacy attempts to provide us a framework for many NLP tasks
## we chose the medium model to see that the starting point is pretty good
## but it's not perfect (it's a model-based approach, after all!)
## 
## the docs are great, and we can roll our own, because this is a framework
##


In [None]:
#######################################  Vectors/Embeddings
##
## You have heard me use this term quite a bit
## we have seen this via PCA ----> take a large feature space and re-represent this in a new space
##     the goal was to encode information and reduce noise, right?
##
## we saw this in Tsne (2 embeddings) and UMAP (can be 2 or more depending on our needs)
## 
## Well in text, we have the same idea
## we could always use the tools above, but this is a "hot" field right now -> embeddings
## 
## https://spacy.io/usage/linguistic-features#vectors-similarity
##
## we will build our own domain-specific embeddings next week, but for now lets use pre-trained embeddings
## let's loosely refer to this as "transfer learning"   --> we are taking one learned model and applying it to our own problem
## in truth, these are generalized, but we are starting to see patterns where domain-specific actions MIGHT help
##
## going back to the start - we used the medium model from spacy to get access to a larger
## trained vocabulary and these embeddings!
##
## By now I am sure you are wondering: what was this trained on?
## https://spacy.io/models/en#en_core_web_md
## view the source (conversations, news articles, texts, etc.)
## 

![](https://miro.medium.com/max/2224/0*K5a1Ws_nsbEjhbYk.png)

> Above we can see that words can be represented in these high-dimensional spaces.  The aim is to encapsulate context.  Remember, bag-of-words removes sequence/order!

---
![](https://jalammar.github.io/images/word2vec/king-analogy-viz.png)

In [None]:
# lets see this at the core
# print the shape explicitly: a cell only echoes its last bare expression,
# so an unprinted `.shape` line placed above another expression is never shown
print(nlp("golf").vector.shape)

# first five components of the vector for "analytics"
nlp("analytics").vector[:5]


# each token has a vector representation

In [None]:
# lets see this for a document
msg = "Questrom is a business school located in Boston"

# iterating over a parsed Doc yields its tokens; pair each token's text with its vector
vectors = [(token.text, token.vector) for token in nlp(msg)]

In [None]:
# get the token and the vectors
# each entry of `vectors` is a (token text, vector) pair; [-1] is the pair for the last token of msg
vectors[-1]

In [None]:
# lets look at the last entry - Boston


In [None]:
# how many entries in the word vector


In [None]:
# lets look at the entries
# norm = the square root of the sum of the values squared

doc = nlp(msg)

# per token: position, text, out-of-vocabulary flag, has-a-vector flag, vector norm
explore = [
    (pos, tok.text, tok.is_oov, tok.has_vector, tok.vector_norm)
    for pos, tok in enumerate(doc)
]

explore

In [None]:
# vector of a one-token doc; compare it with the is_oov / has_vector flags listed for "Questrom" above
nlp("Questrom").vector

In [None]:
# spacy has this really nice property, but differs from other approaches!
# Not all tokens have vectors (to save space), but also, when a vector is not available (or because OOV)
# spacy gives us a 300-length vector anyway
# if the token does not have a vector, it will initialize with all 0's.  
# I tend to like this approach, but its not the same for other toolkits where an OOV is just missing


In [None]:
## lets see another example - a little drawn out, but aim is to build intuition

msg = "Chess is a game, python is a programming language"
doc = nlp(msg)

# walk the doc once and build two parallel lists: tokens[i] is the text for vectors[i]
tokens, vectors = [], []
for tok in doc:
    tokens.append(tok.text)
    vectors.append(tok.vector)


In [None]:
# first five components of the vector for the first token in the doc
vectors[0][:5]

In [None]:
# vectors look awfully compatible with numpy, don't they.

# stack the per-token vectors into a 2-D array: one row per token
va = np.array(vectors)

from scipy.spatial.distance import pdist, squareform

# condensed (flattened upper-triangle) pairwise cosine DISTANCES between the rows of va
cd = pdist(va, metric="cosine")

# expand once into the square token-by-token matrix and reuse it below
cd_square = squareform(cd)

# printed explicitly: only the last bare expression in a cell is echoed
print(cd_square.shape)

cd_square[:5, :5]

In [None]:
# or two tokens -- long winded way to get the vectors, 
# we will see an easier way below

# rows of va follow the token order of msg; row 0 is the first token and row 5 is
# expected to be "python" (the comma after "game" counts as its own token)
chess = va[0, :]
python = va[5, :]

# stack the vectors row-wise (now 2 "rows" by 300 "features/columns")
cp = np.vstack((chess, python))

# printed explicitly: a bare `cp.shape` in the middle of a cell is never displayed
print(cp.shape)

# calculate sim, not the default distance metric!
1 - pdist(cp, metric="cosine")

In [None]:
# lets confirm the intuition with spacy

# two docs, each made from a single word
chess2, python2 = nlp("chess"), nlp("python")

# spacy compares similarity via cosine
chess2.similarity(python2)

In [None]:
# spacy has a cosine SIMILARITY (not distance) calculation built in for tokens/docs/spans
# 
# above chess2 and python2 are each a doc of a single token
# doc/span vectors are simply the average of the token vectors!
# yes, it's that simple
#

# three docs that differ from one another by a single token
doc1 = nlp("I like turtles")
doc2 = nlp("I like hockey")
doc3 = nlp("I hate hockey")

# report every pairing, in the order (1, 2), (1, 3), (2, 3)
labelled_pairs = (
    (1, doc1, 2, doc2),
    (1, doc1, 3, doc3),
    (2, doc2, 3, doc3),
)
for left_no, left, right_no, right in labelled_pairs:
    print(f"doc {left_no} and doc {right_no} is {left.similarity(right)}")

In [None]:
# spans ---> just like slicing a list
# doc1[:2] is the span covering the first two tokens of doc1; .vector is that span's vector

doc1[:2].vector

In [None]:
# the span, at the lowest level, is still comprised of tokens
# and has a vector (average of the span tokens)


In [None]:
#########################################
######################################### Lets see this in action
######################################### USE-CASE 1
#####
#### word vectors and document categories
####

## a pipeline can be trimmed to only the bits that we need (just vectors, for example)
## for a list of the built-in components:
# https://spacy.io/usage/processing-pipelines#built-in

# Keep only the tok2vec component; all other pipeline components are switched off.
# The component is named 'tok2vec' -- a misspelled name in `enable` matches nothing.
# NOTE: this rebinds `nlp`, so re-running the NER cells above after this cell
# will no longer produce entities (the ner component is not enabled here).
nlp = spacy.load("en_core_web_md", enable=['tok2vec'])


In [None]:
## pull the topics table from BigQuery
## questrom.datasets.topics
##
## 

PROJ = "questrom"
SQL = "SELECT * FROM `questrom.datasets.topics`"

# run the query under the PROJ project and load the result into a dataframe
intents = pd.read_gbq(query=SQL, project_id=PROJ)

In [None]:
# what do we have: (rows, columns) of the frame returned by the query
intents.shape

In [None]:
# preview the first three rows
intents.head(3)

In [None]:
# what is the distribution of the intents?
# normalize=True reports each topic's share of the rows rather than its raw count
intents.topic.value_counts(normalize=True)

In [None]:
# above we can see relatively evenly distributed customer intents
# for example, frame this as an email coming into support@....

# stream every text through the pipeline, keeping the parsed docs ...
docs = [parsed for parsed in nlp.pipe(intents.text)]

# ... then stack the per-document vectors into one 2-D array (one row per document)
vectors = np.array([parsed.vector for parsed in docs])

In [None]:
# remember, we setup the tok2vec which only grabs the vectors, not the other components
# https://spacy.io/usage/processing-pipelines#built-in
#

# takes a few minutes
# docs = list(nlp.pipe(intents.text))
# vectors = [doc.vector for doc in docs]
# vectors = np.array(vectors)

In [None]:
# what do we have


In [None]:
# start with PCA, keeping just two components because the aim is a 2-D plot
# GOAL? -> do the intents separate even under something as simple as PCA

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pcs = pca.fit_transform(vectors)

pcs.shape

In [None]:
# dataframe: the two principal components plus the topic label of each row
pcdf = pd.DataFrame(pcs, columns=['pc1', 'pc2']).assign(intent=intents.topic)

In [None]:
# preview the first three rows: pc1, pc2 and the intent label
pcdf.head(3)

In [None]:
# scatter the two components, coloured by intent
fig, ax = plt.subplots(figsize=(10, 6))
p = sns.scatterplot(data=pcdf, x="pc1", y="pc2", hue="intent", alpha=.25, ax=ax)

# park the legend outside the axes, to the right, so it does not cover the points
p.legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [None]:
# lets take a given statement
# (re-display the first rows so we can pick one to use as the query)
intents.head(3)

In [None]:
# now lets build a reco engine!
# pick the second row and compare it against every other row

# condensed pairwise cosine distances between all document vectors
vcs = pdist(vectors, metric="cosine")

# expand to the square matrix and take row index 1 (the second document):
# its distance to every document, itself included
dist_matrix = squareform(vcs)
example = dist_matrix[1, :]

In [None]:
# now, lets find the top 5 indices
# these are distances, not similarities, so the best matches are the SMALLEST values

# argsort orders the row positions by ascending distance ...
ranked = np.argsort(example)

# ... so the first five positions are the five closest documents
sims = ranked[:5]

In [None]:
# we can flag similar intents (of course, the query row itself is found)
# this is a function of how I am doing it, but the intuition holds: we can
# use this to look up a plan of action given similar intents
# this could be a news article, etc.

# sims holds row positions, hence positional indexing with iloc;
# .values returns the selected rows as a plain numpy array
intents.iloc[sims, :].values

In [None]:
############################### Challenge/Practice
## a dataset for intents - think re-routing/optimizing customer service requests!
## small utterances for airlines
## 
## 
## questrom.datasets.airline-intents
## just shy of 5k intents for airline travel/support
##
## lets frame a business problem
##

## each message takes 5 minutes on average to resolve.  
## it costs $300 on average, to resolve an hour's worth of support requests
## this is a manual process today
## can we predict the intent, which allows us to send a first-pass response message
## this might resolve the issue 10-20% of the time if we are lucky, but that is a significant time savings

## things to explore
## what are the top 5 locations mentioned
## can you predict the entity?
