In [6]:
import pandas as pd
import spacy

# Ask the notebook front end to autosave this notebook every 20 seconds.
%autosave 20

Autosaving every 20 seconds


## Load data

In [15]:
# Restore the `df` DataFrame that was persisted earlier with IPython's %store,
# then show its (rows, columns) shape as a quick sanity check.
%store -r df
df.shape

(63807, 11)

In [12]:
# Peek at the first five film records.
df.head(n=5)

Unnamed: 0,item,title,published,date,year,month,day,love_ind,hate_ind,war_ind,peace_ind
0,http://www.wikidata.org/entity/Q4069062,the arrest of a pickpocket,1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0
1,http://www.wikidata.org/entity/Q7168279,"performing animals; or, skipping dogs",1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0
2,http://www.wikidata.org/entity/Q2819823,a morning alarm,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0
3,http://www.wikidata.org/entity/Q3326074,mounted police charge,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0
4,http://www.wikidata.org/entity/Q3520164,the burning stable,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0


## Create a spaCy language pipeline and process the film titles
- tokenization
- named entities

In [7]:
# Load the English pipeline once; every later cell reuses the `nlp` object.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [45]:
# Parse the first film title and print one row per token: text, lemma,
# coarse POS, fine-grained tag, dependency label, shape, followed by the
# is_alpha / is_stop / is_lower flags.
first_title = df["title"][0]
doc = nlp(first_title)
for token in doc:
    attributes = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop, token.is_lower,
    )
    print(*attributes)

the the DET DT det xxx True True True
arrest arrest NOUN NN ROOT xxxx True False True
of of ADP IN prep xx True True True
a a DET DT det x True True True
pickpocket pickpocket NOUN NN pobj xxxx True False True


In [46]:
# Glossary lookup for the "DET" label printed for "the" and "a" in the token table above.
spacy.explain("DET")

'determiner'

In [47]:
# Glossary lookup for the "ADP" label printed for "of" in the token table above.
spacy.explain("ADP")

'adposition'

In [48]:
# Glossary lookup: human-readable description of the "PROPN" label.
spacy.explain("PROPN")

'proper noun'

In [49]:
# Glossary lookup: human-readable description of the "VBZ" tag.
spacy.explain("VBZ")

'verb, 3rd person singular present'

In [50]:
# Glossary lookup: human-readable description of the "SYM" label.
spacy.explain("SYM")

'symbol'

In [51]:
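# Dependency-parse visualisation, left disabled: displacy.serve() starts a
# blocking web server (see the "Serving on ... 5000" log below). Inside a
# notebook, displacy.render(doc, style="dep") draws the parse inline instead.
# from spacy import displacy

# displacy.serve(doc, style="dep")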




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [81]:
# Smoke test: stream the sample titles through the pipeline in batches and
# print each parsed title as a list of token strings.
#
# `n_threads` is intentionally not passed to nlp.pipe(): it has been ignored
# since spaCy 2.1 and is rejected as an unknown argument by spaCy 3.
# NOTE(review): `test` is created in a cell further down
# (test = df.head().copy()); on a fresh kernel run that cell first.
titles = test['title'].astype(str).values
for doc in nlp.pipe(titles, batch_size=50):
    # Doc.is_parsed is deprecated in spaCy 3 in favour of
    # has_annotation("DEP"); fall back to the old flag on spaCy 2.
    if hasattr(doc, "has_annotation"):
        parsed = doc.has_annotation("DEP")
    else:
        parsed = doc.is_parsed
    if parsed:
        print([n.text for n in doc])

['the', 'arrest', 'of', 'a', 'pickpocket']
['performing', 'animals', ';', 'or', ',', 'skipping', 'dogs']
['a', 'morning', 'alarm']
['mounted', 'police', 'charge']
['the', 'burning', 'stable']


In [101]:
%%time

tokens = []
lemma = []
ent = []
pos = []

for doc in nlp.pipe(test['title'].astype('unicode').values, batch_size=50, n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        ent.append([n.label_ for n in doc.ents]) #ent.append([n.ent_type_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        ent.append(None)
        pos.append(None)

test['species_tokens'] = tokens
test['species_lemma'] = lemma
test['species_pos'] = pos
test['species_ent'] = ent

CPU times: user 21.6 ms, sys: 4.85 ms, total: 26.4 ms
Wall time: 26.6 ms


In [102]:
# Entity labels found in the first sample title (row label 0).
test.head().loc[0, "species_ent"]

[]

In [103]:
# Inspect the sample frame together with the newly attached NLP columns.
test.head(n=5)

Unnamed: 0,item,title,published,date,year,month,day,love_ind,hate_ind,war_ind,peace_ind,new,species_tokens,species_lemma,species_pos,species_ent
0,http://www.wikidata.org/entity/Q4069062,the arrest of a pickpocket,1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1,"[the, arrest, of, a, pickpocket]","[the, arrest, of, a, pickpocket]","[DET, NOUN, ADP, DET, NOUN]",[]
1,http://www.wikidata.org/entity/Q7168279,"performing animals; or, skipping dogs",1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1,"[performing, animals, ;, or, ,, skipping, dogs]","[perform, animal, ;, or, ,, skip, dog]","[VERB, NOUN, PUNCT, CCONJ, PUNCT, VERB, NOUN]",[]
2,http://www.wikidata.org/entity/Q2819823,a morning alarm,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[a, morning, alarm]","[a, morning, alarm]","[DET, NOUN, NOUN]",[TIME]
3,http://www.wikidata.org/entity/Q3326074,mounted police charge,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[mounted, police, charge]","[mount, police, charge]","[VERB, NOUN, NOUN]",[]
4,http://www.wikidata.org/entity/Q3520164,the burning stable,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[the, burning, stable]","[the, burn, stable]","[DET, VERB, NOUN]",[]


In [84]:
# Display the whole sample frame (built from df.head(), so only a handful of rows).
test

Unnamed: 0,item,title,published,date,year,month,day,love_ind,hate_ind,war_ind,peace_ind,new,species_tokens,species_lemma,species_pos
0,http://www.wikidata.org/entity/Q4069062,the arrest of a pickpocket,1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1,"[the, arrest, of, a, pickpocket]","[the, arrest, of, a, pickpocket]","[DET, NOUN, ADP, DET, NOUN]"
1,http://www.wikidata.org/entity/Q7168279,"performing animals; or, skipping dogs",1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1,"[performing, animals, ;, or, ,, skipping, dogs]","[perform, animal, ;, or, ,, skip, dog]","[VERB, NOUN, PUNCT, CCONJ, PUNCT, VERB, NOUN]"
2,http://www.wikidata.org/entity/Q2819823,a morning alarm,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[a, morning, alarm]","[a, morning, alarm]","[DET, NOUN, NOUN]"
3,http://www.wikidata.org/entity/Q3326074,mounted police charge,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[mounted, police, charge]","[mount, police, charge]","[VERB, NOUN, NOUN]"
4,http://www.wikidata.org/entity/Q3520164,the burning stable,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1,"[the, burning, stable]","[the, burn, stable]","[DET, VERB, NOUN]"


In [87]:
# POS tags recorded for the first sample title.
test["species_pos"][0]

['DET', 'NOUN', 'ADP', 'DET', 'NOUN']

In [53]:
# List each named-entity span of `doc`: its text, character offsets and label.
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

In [70]:
# Parse a sentence that mentions organisations, a place and a money amount
# so that doc.ents has something to show.
sample_sentence = 'Uber eats India was acquired by Zomato for $350 million'
doc = nlp(sample_sentence)

In [71]:
# Entity spans that the pipeline detected in the current `doc`.
doc.ents

(India, Zomato, $350 million)

In [73]:
# Scratch check: list membership compares whole elements, so 'man' does not
# match 'woman'.
any(word == 'man' for word in ['woman', 'people'])

False

In [74]:
# Work on an independent copy of the first five rows so experiments never
# modify `df` itself.
test = df.iloc[:5].copy()

In [75]:
# Add a constant scratch column `new` (value 1 in every row) to experiment with.
test['new'] = 1

In [76]:
# Scratch experiment: assign a one-element list to a single cell of `new`.
# NOTE(review): the frames rendered elsewhere in this notebook still show
# new == 1 for row 0, so confirm what this assignment actually does on the
# pandas version in use before relying on it.
test.loc[0, 'new'] = ['']

Unnamed: 0,item,title,published,date,year,month,day,love_ind,hate_ind,war_ind,peace_ind,new
0,http://www.wikidata.org/entity/Q4069062,the arrest of a pickpocket,1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1
1,http://www.wikidata.org/entity/Q7168279,"performing animals; or, skipping dogs",1895-01-01T00:00:00Z,1895-01-01,1895,1,1,0,0,0,0,1
2,http://www.wikidata.org/entity/Q2819823,a morning alarm,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1
3,http://www.wikidata.org/entity/Q3326074,mounted police charge,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1
4,http://www.wikidata.org/entity/Q3520164,the burning stable,1896-01-01T00:00:00Z,1896-01-01,1896,1,1,0,0,0,0,1


In [54]:
# Glossary lookup: human-readable description of the "GPE" label.
spacy.explain("GPE")

'Countries, cities, states'

In [55]:
# Per-token vector diagnostics: whether a vector is available, its norm,
# and whether the token is out of vocabulary.
for token in doc:
    vector_info = [token.text, token.has_vector, token.vector_norm, token.is_oov]
    print(*vector_info)

the True 19.79567 False
arrest True 20.331337 False
of True 22.341307 False
a True 21.505527 False
pickpocket True 18.446245 False


In [57]:
# Compare every token with every other token (including itself) and print
# the similarity score for each ordered pair.
# NOTE(review): `nlp` is loaded from en_core_web_sm; spaCy's small models are
# documented as shipping without static word vectors, so treat these scores
# as rough — confirm with a md/lg model.
tokens = nlp("dog cat banana")

token_pairs = [(left, right) for left in tokens for right in tokens]
for token1, token2 in token_pairs:
    print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.4192831
dog banana 0.41783607
cat dog 0.4192831
cat cat 1.0
cat banana 0.34277543
banana dog 0.41783607
banana cat 0.34277543
banana banana 1.0


  """


In [61]:
# Document-level similarity between the first two film titles.
doc1, doc2 = nlp(df.title[0]), nlp(df.title[1])
title_similarity = doc1.similarity(doc2)
print(title_similarity)

0.07194189301221651


  This is separate from the ipykernel package so we can avoid doing imports until


In [63]:
from spacy.attrs import ORTH, LIKE_URL

In [64]:
# Parse a sentence containing a URL and show, per token, its text, the
# integer ORTH id and the like_url flag.
doc = nlp("Check out https://spacy.io")
for token in doc:
    row = (token.text, token.orth, token.like_url)
    print(*row)

Check 8104846059040039827 False
out 1696981056005371314 False
https://spacy.io 17142293684782158888 True


In [65]:
# Export the ORTH and LIKE_URL attributes of every token as one array and
# check its shape against (number of tokens, number of attributes).
attr_ids = [ORTH, LIKE_URL]
doc_array = doc.to_array(attr_ids)
n_tokens, n_attrs = len(doc), len(attr_ids)
print(doc_array.shape)
print(n_tokens, n_attrs)

(3, 2)
3 2


In [68]:
# Show the raw attribute array: one row per token, one column per requested attribute.
doc_array

array([[ 8104846059040039827,                    0],
       [ 1696981056005371314,                    0],
       [17142293684782158888,                    1]], dtype=uint64)

In [66]:
# Spot-check individual cells of the exported array against the token
# attributes: column 0 was requested as ORTH, column 1 as LIKE_URL.
expected_cells = {
    (0, 0): doc[0].orth,
    (1, 0): doc[1].orth,
    (0, 1): doc[0].like_url,
}
for (row, col), expected in expected_cells.items():
    assert expected == doc_array[row, col]

In [67]:
# The whole LIKE_URL column must match the per-token like_url flags.
url_column = list(doc_array[:, 1])
token_flags = [t.like_url for t in doc]
assert url_column == token_flags
print(url_column)

[0, 0, 1]
