In [27]:
# import library

import spacy
from spacy import displacy
import pandas as pd
import os
import requests

<h4>download model</h4>

<body>
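<p>model name : en_core_web_sm<br>
The name follows the pattern [1]_[2]_[3]_[4] :<br>
[1]	language (en = English)<br>
[2]	type (core = general-purpose pipeline)<br>
[3]	genre (web = trained on web text)<br>
[4]	size (sm = small)</p>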
</body>

In [12]:
# load the model by its package name and keep the resulting pipeline in `nlp`
# NOTE(review): this call loads rather than downloads — presumably en_core_web_sm must already be installed; confirm
nlp = spacy.load("en_core_web_sm")

In [13]:
# example: tokenizer-only pass over a short sentence

# run just the tokenizer on the raw text
sentence = nlp.tokenizer("we live in paris.")

# count the tokens, then report the count
n_tokens = len(sentence)
print("the number of tokens: ", n_tokens)

# echo the tokenized sentence back
print(sentence)

the number of tokens:  5
we live in paris.


In [14]:
# get the data

# get current working directory path
cwd = os.getcwd()

# import jeopardy questions
# (read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame again;
#  the path is assembled with os.path.join instead of string concatenation)
data = pd.read_csv(os.path.join(cwd, 'data', 'jeopardy_questions', 'jeopardy_questions.csv'))

# display first 5-record data, then the raw (uncleaned) column names
print('-' * 20)
print('import data :')
print('first 5 records :')
print(data.head(5))
print('-' * 20)
print('column names :')
print('before cleaning :')
print(data.columns)

--------------------
import data :
first 5 records :
   Show Number    Air Date      Round                         Category  Value  \
0         4680  2004-12-31  Jeopardy!                          HISTORY   $200   
1         4680  2004-12-31  Jeopardy!  ESPN's TOP 10 ALL-TIME ATHLETES   $200   
2         4680  2004-12-31  Jeopardy!      EVERYBODY TALKS ABOUT IT...   $200   
3         4680  2004-12-31  Jeopardy!                 THE COMPANY LINE   $200   
4         4680  2004-12-31  Jeopardy!              EPITAPHS & TRIBUTES   $200   

                                            Question      Answer  
0  For the last 8 years of his life, Galileo was ...  Copernicus  
1  No. 2: 1912 Olympian; football star at Carlisl...  Jim Thorpe  
2  The city of Yuma in this state has a record av...     Arizona  
3  In 1963, live on "The Art Linkletter Show", th...  McDonald's  
4  Signer of the Dec. of Indep., framer of the Co...  John Adams  
--------------------
column names :
before cleaning :
Inde

In [15]:
# manage data

# reduce size of data to the first 1000 rows
# take an explicit copy: later cells add columns to `data`, and writing into a
# plain slice of the full frame triggers pandas' SettingWithCopyWarning
data = data.iloc[:1000].copy()

# lowercase, strip whitespace, and view columns name (clean all columns' name !!!)
# a concrete list is assigned (not a lazy map object) so the new labels are explicit
data.columns = [name.lower().strip() for name in data.columns]
print('-' * 20)
print('after cleaning :')
print(data.columns)

--------------------
after cleaning :
Index(['show number', 'air date', 'round', 'category', 'value', 'question',
       'answer'],
      dtype='object')


In [16]:
# tokenization

# run the spaCy pipeline over every jeopardy question and store the result per row
question_docs = data["question"].apply(nlp)
data["question_tokens"] = question_docs

# pull out the first question (row label 0) in raw and processed form
example_question = data.loc[0, "question"]
example_question_tokens = data.loc[0, "question_tokens"]

# show the raw text, then one token per line
print('full question :')
print(example_question)
print('-' * 20)
print('question tokens :')
for tok in example_question_tokens:
	print(tok)

full question :
For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory
--------------------
question tokens :
For
the
last
8
years
of
his
life
,
Galileo
was
under
house
arrest
for
espousing
this
man
's
theory


In [17]:
# part-of-speech tagging

# list the POS tag of every token in the first question, with spaCy's description of the tag
print('-' * 20)
print('the Part-of-speech tags for each token in the first question :')
for tok in example_question_tokens:
	pos_tag = tok.pos_
	print(tok.text, '|', pos_tag, '|', spacy.explain(pos_tag))

--------------------
the Part-of-speech tags for each token in the first question :
For | ADP | adposition
the | DET | determiner
last | ADJ | adjective
8 | NUM | numeral
years | NOUN | noun
of | ADP | adposition
his | PRON | pronoun
life | NOUN | noun
, | PUNCT | punctuation
Galileo | PROPN | proper noun
was | AUX | auxiliary
under | ADP | adposition
house | NOUN | noun
arrest | NOUN | noun
for | ADP | adposition
espousing | VERB | verb
this | DET | determiner
man | NOUN | noun
's | PART | particle
theory | NOUN | noun


In [18]:
# dependency parsing tagging

# list the dependency label of every token in the first question, with spaCy's description of the label
print('-' * 20)
print('the dependency parsing tags for each token in the first question :')
for tok in example_question_tokens:
	dep_label = tok.dep_
	print(tok.text, '|', dep_label, '|', spacy.explain(dep_label))

# visualize the dependency parse inline in the notebook
displacy.render(example_question_tokens, style='dep', options={'distance': 120}, jupyter=True)

--------------------
the dependency parsing tags for each token in the first question :
For | prep | prepositional modifier
the | det | determiner
last | amod | adjectival modifier
8 | nummod | numeric modifier
years | pobj | object of preposition
of | prep | prepositional modifier
his | poss | possession modifier
life | pobj | object of preposition
, | punct | punctuation
Galileo | nsubj | nominal subject
was | ROOT | root
under | prep | prepositional modifier
house | compound | compound
arrest | pobj | object of preposition
for | prep | prepositional modifier
espousing | pcomp | complement of preposition
this | det | determiner
man | poss | possession modifier
's | case | case marking
theory | dobj | direct object


In [19]:
# chunking

# collect the noun chunks of every question
# - noun_chunks is a one-shot iterator, so it is materialised into a list;
#   otherwise the stored value is empty after the first time it is read
# - the docs already stored in "question_tokens" are reused instead of
#   running nlp over every question a second time
data["question_tokens_chunking"] = data["question_tokens"].apply(lambda doc: list(doc.noun_chunks))

# view first question
example_question = data.question[0]
example_question_chunks = data.question_tokens_chunking[0]
print('full question :')
print(example_question)
print('-' * 20)
print('question chunks :')
for chunk in example_question_chunks:
	print(chunk)

full question :
For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory
--------------------
question chunks :
the last 8 years
his life
Galileo
house arrest
this man's theory


In [22]:
# lemmatization | stemming
# spaCy supports only lemmatization (no stemming)

# compare original and lemmatized text for each token of the first question
# the table is built in one step from (text, lemma) pairs instead of being
# grown row by row with .loc, which re-allocates the frame on every insert
lemmatization = pd.DataFrame(
	[(token.text, token.lemma_) for token in example_question_tokens],
	columns=['original', 'lemmatized'],
)

# display compared table
print(lemmatization)


     original lemmatized
0         For        for
1         the        the
2        last       last
3           8          8
4       years       year
5          of         of
6         his        his
7        life       life
8           ,          ,
9     Galileo    Galileo
10        was         be
11      under      under
12      house      house
13     arrest     arrest
14        for        for
15  espousing    espouse
16       this       this
17        man        man
18         's         's
19     theory     theory


In [26]:
# NER :: named-entity recognition

# list every recognised entity (a single token or a set of tokens) with its character offsets and label
print('text | start | end | label')
for entity in example_question_tokens.ents:
	start, end = entity.start_char, entity.end_char
	print(entity.text, '|', start, '|', end, '|', entity.label_)

# visualize NER result inline in the notebook
# NOTE(review): 'distance' looks like a dependency-visualizer setting; confirm it has any effect with style='ent'
displacy.render(example_question_tokens, style='ent', options={'distance': 120}, jupyter=True)

text | start | end | label
the last 8 years | 4 | 20 | DATE
Galileo | 34 | 41 | PRODUCT


In [32]:
# NEL :: named-entity linking

# define google knowledge graph api result function
def returnGraphResult(query, key, entityType):
	"""Look up an entity in the Google Knowledge Graph Search API.

	Only entities whose type is 'PERSON' are looked up; every other type is
	answered locally without any network access.

	Args:
		query: entity text to search for.
		key: Google API key sent with the request.
		entityType: entity label (e.g. a spaCy ``ent.label_``).

	Returns:
		A ``(url, desc)`` tuple taken from the ``detailedDescription`` of the
		first search hit, or ``('no_match', 'no_match')`` when the entity is
		not a PERSON, the search has no hits, or the first hit carries no
		detailed description.

	Raises:
		requests.HTTPError: if the API answers with an error status
			(for example a missing or invalid key).
	"""
	no_match = ('no_match', 'no_match')
	if entityType != 'PERSON':
		return no_match

	# query and key go through `params` so that requests URL-encodes them
	# (entity text may contain spaces, '&', ...); the timeout keeps the
	# notebook from hanging forever on a stalled connection
	resp = requests.get(
		'https://kgsearch.googleapis.com/v1/entities:search',
		params={'query': query, 'key': key},
		timeout=10,
	)
	resp.raise_for_status()

	# parse the body once and tolerate an empty or partial result
	items = resp.json().get('itemListElement') or []
	if not items:
		return no_match
	detail = items[0].get('result', {}).get('detailedDescription') or {}
	if 'url' not in detail or 'articleBody' not in detail:
		return no_match
	return detail['url'], detail['articleBody']

# API key for the Google Knowledge Graph Search API
# https://developers.google.com/knowledge-graph
# the key is read from the GOOGLE_KG_API_KEY environment variable so that the
# secret is never saved inside the notebook; when the variable is not set the
# key falls back to '' (the previous placeholder value)
key = os.environ.get('GOOGLE_KG_API_KEY', '')

# try to link every entity found in the first question and print the outcome
for ent in example_question_tokens.ents:
	url, desc = returnGraphResult(ent.text, key, ent.label_)
	print(ent.text, ent.label_, url, desc)


the last 8 years DATE no_match no_match
Galileo PRODUCT no_match no_match
