In [1]:
import spacy

In [34]:
#nlp = spacy.load('de_core_news_sm')
nlp = spacy.load('de_core_news_md')
doc = nlp(u'Ich bin ein Berliner.')

In [35]:
doc1 = nlp(u'Der Apfel und die Orange sind ähnlich')

In [36]:
der_apfel = doc[:2]
die_orange = doc[3:5]
der_apfel.similarity(die_orange)

0.27352026

In [37]:
doc = nlp(u'Den Berliner hat der Hund nicht gebissen.')
# heads array: [1, 6, 2, 4, 2, 6, 2, 2] (second token is attached with a non-projective arc)


In [38]:
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

Den/DET Berliner/NOUN hat/AUX der/DET Hund/NOUN nicht/PART gebissen/VERB ./PUNCT


In [39]:
# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)
# Ich      <--sb--- bin
# bin      <-ROOT-- bin
# ein      <--nk--- Berliner
# Berliner <--pd--- bin
# .        <-punct- bin

Den      <--nk--- Berliner
Berliner <--oa--- gebissen
hat      <-ROOT-- hat
der      <--nk--- Hund
Hund     <--sb--- hat
nicht    <--ng--- gebissen
gebissen <--oc--- hat
.        <-punct- hat


In [40]:
# show named entities
for ent in doc.ents:
    print(ent.text)
# output:
# Berline

Berliner


In [41]:
# show noun chunks
for chunk in doc.noun_chunks:
    print(chunk.text)
# output:
# ein Berliner

# noun chunks include so-called measure constructions ...
doc = nlp(u'Ich möchte gern zum Essen eine Tasse Kaffee bestellen.')
print( [ chunk for chunk in doc.noun_chunks ])
# output:
# [Essen, eine Tasse Kaffee]

# ... and close appositions
doc = nlp(u'Der Senator vermeidet das Thema Flughafen.')
print( [ chunk for chunk in doc.noun_chunks ])
# output:
# [Der Senator, das Thema Flughafen]

Den Berliner
der Hund
[Ich, Essen, eine Tasse, Kaffee]
[Der Senator, das Thema Flughafen]


In [42]:
# Use word vectors
#de = spacy.load('de')
doc = nlp(u'Der Apfel und die Orange sind ähnlich')
assert len(doc.vector) == len(doc[0].vector)
der_apfel = doc[:2]
die_orange = doc[3:5]
der_apfel.similarity(die_orange)
# output:
# 0.63665210991205579
der, apfel = der_apfel
der.similarity(apfel)
# output:
# 0.24995991403916812

0.36658847

In [43]:
# the root has no left dependents:
print(doc[2].n_lefts)
# output:
# 0

# but the root's left-most descendant is not the root itself but a token further left
print(doc[2].left_edge.i, doc[2].left_edge.orth_)
# output:
# (0, u'Den')

0
2 und


https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html#working-with-text