In [None]:
# Tagging parts of speech using the spaCy package. spaCy is an open-source NLP library that offers part-of-speech (POS) and named-entity tagging.

In [27]:
import spacy
import pandas as pd

In [33]:
# Download spaCy's small English pipeline so that spacy.load("en_core_web_sm") can find it.
# NOTE(review): `!python` runs whichever interpreter is first on PATH - confirm it is the kernel's environment.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [35]:
# Load the small English pipeline; `nlp` is the callable used below to turn raw text into a spaCy document.
nlp = spacy.load("en_core_web_sm")

In [37]:
# The text below is lowercased and has no punctuation, but it still contains its stopwords.

In [21]:
# Sample passage for the POS-tagging section: already lowercased with punctuation stripped, stopwords retained.
emma_ja = "emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own"
# Echo the passage so the reader can see exactly what is being tagged.
print(emma_ja)

emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of auth

In [43]:
# Run the pipeline over the passage; the tokens of the resulting document expose .text and .pos_, which are read below.
spacy_doc = nlp(emma_ja)

In [45]:
# Start from an empty frame that already carries the two output columns.
pos_columns = ['token', 'pos_tag']
pos_df = pd.DataFrame(columns=pos_columns)

In [53]:
# Convert the structured data in spacy_doc into a DataFrame with one row per token.
# The records are collected first and the frame is built in a single call: growing a
# DataFrame with pd.concat inside a loop copies the whole frame on every iteration
# (quadratic), and because it appended to the existing pos_df, re-running the cell
# duplicated every row. Building from scratch makes the cell idempotent.
token_records = [
    {'token': token.text, 'pos_tag': token.pos_}
    for token in spacy_doc
]
# Passing `columns` keeps the token/pos_tag layout even when the document has no tokens.
pos_df = pd.DataFrame(token_records, columns=['token', 'pos_tag'])

In [57]:
# Preview the first five token / POS-tag rows.
pos_df.head(n=5)

Unnamed: 0,token,pos_tag
0,emma,PROPN
1,woodhouse,PROPN
2,handsome,ADJ
3,clever,ADJ
4,and,CCONJ


In [None]:
# create new dataframe, group by token and pos_tag and use .size() to get the number of tokens that fall into that group, reset the index
# and give it the name counts, sort the values by counts in descending order

In [59]:
# Number of rows for each (token, pos_tag) pair.
token_tag_sizes = pos_df.groupby(['token', 'pos_tag']).size()

# Flatten back to columns, name the size column 'counts', and put the most frequent pairs first.
pos_df_counts = (
    token_tag_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

In [None]:
# get the 10 most common tokens and their associated tags

In [63]:
# Ten most frequent (token, tag) pairs.
pos_df_counts.head(n=10)

Unnamed: 0,token,pos_tag,counts
88,of,ADP,14
49,had,AUX,9
54,her,PRON,9
111,the,DET,8
6,and,CCONJ,8
0,a,DET,6
114,to,PART,5
61,in,ADP,4
13,been,AUX,4
120,very,ADV,4


In [None]:
# Next, we want to know how many words (token occurrences) we have for each of the different tags. The groupby() argument is the column
# you are grouping by, and the column selected afterwards (['token']) is the one being aggregated. Note that .count() counts every
# occurrence, not distinct words.

In [77]:
# Tokens per POS tag, largest group first. .count() tallies every non-null token
# occurrence within a tag, so repeated words are counted each time they appear.
pos_df_poscounts = (
    pos_df
    .groupby(['pos_tag'])['token']
    .count()
    .sort_values(ascending=False)
)

In [79]:
# Five most common POS tags.
pos_df_poscounts.head(n=5)

pos_tag
NOUN    44
ADP     28
ADV     22
ADJ     19
VERB    19
Name: token, dtype: int64

In [85]:
# Suppose you want to look at individual tags, for example, the most common nouns coming up within our data - top 10

In [81]:
# Keep only the rows tagged NOUN; the descending 'counts' order of pos_df_counts is preserved.
is_noun = pos_df_counts['pos_tag'] == 'NOUN'
nouns = pos_df_counts[is_noun]

In [87]:
# Top ten nouns by frequency.
nouns.head(n=10)

Unnamed: 0,token,pos_tag,counts
48,governess,NOUN,3
46,friend,NOUN,3
130,years,NOUN,2
35,emma,NOUN,2
28,daughters,NOUN,2
103,sisters,NOUN,2
82,mother,NOUN,2
89,office,NOUN,1
78,mistress,NOUN,1
75,mildness,NOUN,1


In [111]:
# In spaCy, named entities are real-world objects that are identified and categorized in text, such as persons,
# organisations, locations, dates, monetary values, etc. spaCy's pretrained model comes with a built-in named entity recognition (NER) pipeline.
# ent.text is the entity text and ent.label_ is its entity type.

In [95]:
from spacy import displacy
from spacy import tokenizer
import re

In [None]:
# Initialize spaCy with the pretrained model that we want to use

In [99]:
# Rebinds `nlp` to a freshly loaded "en_core_web_sm" pipeline - the same model name loaded earlier in the notebook.
nlp = spacy.load("en_core_web_sm")

In [101]:
# Reference text for the NER section; unlike the earlier passage it keeps its original capitalisation and punctuation.
google_text = "Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."
# Echo the text so the reader can compare it with the entities extracted below.
print(google_text)

Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.


In [103]:
# Create a spaCy document from our reference text.
# NOTE(review): this rebinds `spacy_doc`, which earlier held the document built from emma_ja;
# re-running the POS-tagging cells after this one would therefore operate on the Google text.
spacy_doc = nlp(google_text) # Create spacy document with our reference text

In [105]:
# List every entity span found in the document together with its label.
for ent in spacy_doc.ents:
    print(ent.text, ent.label_)

Google ORG
September 4, 1998 DATE
Larry Page PERSON
Sergey Brin PERSON
PhD WORK_OF_ART
Stanford University ORG
California GPE
about 14% PERCENT
56% PERCENT
IPO ORG
2004 DATE
2015 DATE
Google ORG
Alphabet Inc. ORG
Alphabet ORG
Alphabet ORG
Sundar Pichai PERSON
Google ORG
October 24, 2015 DATE
Larry Page PERSON
Alphabet GPE
December 3, 2019 DATE
Pichai PERSON
Alphabet GPE


In [107]:
# Use the displacy render function to create a nice visual of the different entities in our text

In [115]:
# Highlight the recognised entities inline: style="ent" selects displaCy's entity visualiser,
# and jupyter=True requests notebook rendering (see the spaCy displacy.render documentation).
displacy.render(spacy_doc, style="ent", jupyter=True)

In [119]:
# Step 1: drop every character that is neither a word character nor whitespace (i.e. punctuation).
without_punctuation = re.sub(r'[^\w\s]', '', google_text)
# Step 2: lowercase what is left, then show the cleaned text.
google_text_clean = without_punctuation.lower()
print(google_text_clean)

google was founded on september 4 1998 by computer scientists larry page and sergey brin while they were phd students at stanford university in california together they own about 14 of its publicly listed shares and control 56 of its stockholder voting power through supervoting stock the company went public via an initial public offering ipo in 2004 in 2015 google was reorganized as a wholly owned subsidiary of alphabet inc google is alphabets largest subsidiary and is a holding company for alphabets internet properties and interests sundar pichai was appointed ceo of google on october 24 2015 replacing larry page who became the ceo of alphabet on december 3 2019 pichai also became the ceo of alphabet


In [121]:
# Run the same pipeline over the lowercased, punctuation-free text so its entities can be compared with the original's.
spacy_doc_clean = nlp(google_text_clean)

In [123]:
# List every entity span found in the cleaned document together with its label.
for ent in spacy_doc_clean.ents:
    print(ent.text, ent.label_)

google ORG
september 4 1998 DATE
stanford university ORG
california GPE
about 14 CARDINAL
56 CARDINAL
2004 DATE
2015 DATE
alphabet inc google ORG
google ORG
october 24 2015 DATE
larry PERSON
december 3 2019 DATE


In [None]:
# A lot less is picked up after we remove the punctuation and the capital letters. Consider at what point in your analysis and data cleaning
# you use named entity recognition. If we clean up the text too much for ML, it will not pick up the entities. Do basic preprocessing,
# check through your data and decide.

In [125]:
# Visualise the entities detected in the cleaned text, using the entity style.
displacy.render(spacy_doc_clean, style="ent", jupyter=True)