# Simple bag of words

In [64]:
# load bag of words 
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
train_x = ["I love the book", "This is a great book", "the fit is great", "I love the shoses"]


In [66]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(train_x)

In [67]:
# so this will give you all the words(features) - removing some words it doesn't want
vectorizer.get_feature_names_out()

array(['book', 'fit', 'great', 'is', 'love', 'shoses', 'the', 'this'],
      dtype=object)

In [68]:
vectors.toarray()
# the first array item represents the first item in the training data
# one means the feature exists in the item, zero means non-exist
# so the first one is saying "book", "love", "the" are in the first array item

array([[1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 1, 0]])

In [122]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm

train_x = ["I love the book", "This is a great book", "the fit is great", "I love the shoses"]

class Category:
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"

# y train
train_y = [Category.BOOKS, Category.BOOKS, Category.CLOTHING, Category.CLOTHING]

# here use ngram and specify both 1 word and 2 words
vectorizer = CountVectorizer(binary=True, ngram_range=(1,2))
# now you have the word in numeric values
train_x_vectors = vectorizer.fit_transform(train_x)

clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(train_x_vectors, train_y)

In [123]:
# now to predict 

test_x_vectors = vectorizer.transform(["Good book"])
test_x_vectors.toarray()  # the first element is  the word "book"

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# Word Vectors model - sPacy

In [71]:
# !pip install spacy

In [72]:
# download the model
# !python -m spacy download en_core_web_md

In [73]:
import spacy

nlp = spacy.load("en_core_web_md")

In [33]:
nlp("I love books")

I love books

In [36]:
nlp("this is a good book")

this is a good book

In [45]:
print(train_x)

['I love the book', 'This is a great book', 'the fit is great', 'I love the shoses']


In [46]:
docs = [nlp(txt) for txt in train_x]

In [50]:
# word embeding representation
#docs[0].vector  

In [51]:
train_x_word_vectors = [x.vector for x in docs]

In [53]:
# train_x_word_vectors

In [54]:
# now use the word embeding for training 
clf_svm_wv = svm.SVC(kernel="linear")
clf_svm_wv.fit(train_x_word_vectors, train_y)

In [55]:
"""Steps
1. define your texts in an array
2. transform them into nlp docs
3. get the word embeding vectors
"""

test_x = ["I went to the bank and wrote a check", "let me check that out"]
test_docs = [nlp(t) for t in test_x]
test_x_word_vectors = [x.vector for x in test_docs]

clf_svm_wv.predict(test_x_word_vectors)


array(['CLOTHING', 'CLOTHING'], dtype='<U8')

## More SpaCy Basics

In [61]:
#!pip install spacy
#!python -m spacy download en_core_web_md

In [75]:
import spacy

#nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x1324e5820>

In [113]:
nlp_doc = nlp("This is an introduction of SpaCy. Yeah it's pretty cool. I am detailed-oriented")
type(nlp_doc)  # Doc object

spacy.tokens.doc.Doc

In [114]:
# already tokenized

for token in nlp_doc:
    print(token.text)

This
is
an
introduction
of
SpaCy
.
Yeah
it
's
pretty
cool
.
I
am
detailed
-
oriented


In [115]:
for sentence in nlp_doc.sents:
    print(sentence)

This is an introduction of SpaCy.
Yeah it's pretty cool.
I am detailed-oriented


In [116]:
# you can inspect the token 
for token in nlp_doc:
    print(token,"Alphabeti? ", token.is_alpha)
    print(token,"Punctuation?: ", token.is_punct)
    print(token,"Stop word?: ", token.is_stop)

This Alphabeti?  True
This Punctuation?:  False
This Stop word?:  True
is Alphabeti?  True
is Punctuation?:  False
is Stop word?:  True
an Alphabeti?  True
an Punctuation?:  False
an Stop word?:  True
introduction Alphabeti?  True
introduction Punctuation?:  False
introduction Stop word?:  False
of Alphabeti?  True
of Punctuation?:  False
of Stop word?:  True
SpaCy Alphabeti?  True
SpaCy Punctuation?:  False
SpaCy Stop word?:  False
. Alphabeti?  False
. Punctuation?:  True
. Stop word?:  False
Yeah Alphabeti?  True
Yeah Punctuation?:  False
Yeah Stop word?:  False
it Alphabeti?  True
it Punctuation?:  False
it Stop word?:  True
's Alphabeti?  False
's Punctuation?:  False
's Stop word?:  True
pretty Alphabeti?  True
pretty Punctuation?:  False
pretty Stop word?:  False
cool Alphabeti?  True
cool Punctuation?:  False
cool Stop word?:  False
. Alphabeti?  False
. Punctuation?:  True
. Stop word?:  False
I Alphabeti?  True
I Punctuation?:  False
I Stop word?:  True
am Alphabeti?  True
am

In [117]:
# stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [121]:
list(spacy_stopwords)[:10]

['every',
 'because',
 'mine',
 'across',
 'around',
 'also',
 'however',
 'hers',
 'thereby',
 'former']

### Name Entity Recognition (NER)

In [221]:
txt = """Hi I am Shin Hsu. Google LLC is an American multinational technology company focusing on search engine technology, online advertising, cloud computing, computer software, quantum computing, e-commerce, artificial intelligence, and consumer electronics."""
google_txt_doc = nlp(txt)

In [222]:
# Get the name entities using doc.ents
for entity in google_txt_doc.ents:
    print(entity.text,":", entity.label_, f" e.g.,{spacy.explain(entity.label_)}")
    

Shin Hsu : PERSON  e.g.,People, including fictional
Google LLC : ORG  e.g.,Companies, agencies, institutions, etc.
American : NORP  e.g.,Nationalities or religious or political groups
quantum computing : ORG  e.g.,Companies, agencies, institutions, etc.


In [223]:
#### Retokenize - we can merge some tokens to make them one token, E.g., first name + last name

In [224]:
doc = nlp("She lived in New Hampshire.")
print(doc.ents)

[(token.text, token.i) for token in doc]
[('She', 0), ('lived', 1), ('in', 2), ('New', 3), ('Hampshire', 4), ('.', 5)]
print(len(doc))


(New Hampshire,)
6


In [215]:
# use retokenize to merge certain tokens that you want to treat as one token
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5],attrs={"LEMMA": "new hampshire"})

for t in doc:
    print(t.text_with_ws) # now "New Hampshre is treated as one token rather than two"

She 
lived 
in 
New Hampshire.


In [225]:
# back to the google text
for token in google_txt_doc:
    if token.ent_type_ == "PERSON":
        print(token.text_with_ws)

Shin 
Hsu


In [226]:
# now back to the google text
with google_txt_doc.retokenize() as retokenizer:
    for entity in google_txt_doc.ents:
        retokenizer.merge(entity)  # by doing this, now we can see all entities are now treaded as one token including person name (first name + last name)
for t in google_txt_doc:
    print(t)

Hi
I
am
Shin Hsu
.
Google LLC
is
an
American
multinational
technology
company
focusing
on
search
engine
technology
,
online
advertising
,
cloud
computing
,
computer
software
,
quantum computing
,
e
-
commerce
,
artificial
intelligence
,
and
consumer
electronics
.


# NLTK

In [None]:
# Stemming and Lemmatization