In [2]:
import spacy

In [18]:
# Word tokenization test.
# Load the model registered under the 'en' shortcut and run it over a mixed
# English/Chinese sentence.
nlp = spacy.load('en')
doc1 = nlp(u"This's a test for spacy, add a Capital word, 试几个中文")
print(doc1)  # printing the Doc shows the original text
for token in doc1:
    print(token)  # one token per line
# Observation: only tokenization happens here -- letter case is not changed
# and punctuation is not removed. The Chinese run comes out as one token.

This's a test for spacy, add a Capital word, 试几个中文
This
's
a
test
for
spacy
,
add
a
Capital
word
,
试几个中文


In [15]:
# Sentence segmentation test.
# NOTE(review): this is the same load call as in the word tokenization cell;
# it looks redundant when the notebook is run top to bottom -- confirm.
nlp = spacy.load('en')
doc2 = nlp(u"This is a test for space sentences. OK, try this: when will you back to your hometown? This have no space with the last sentence.再来一句中文. 你能认出来么, 这里会被分成几句")
for sent in doc2.sents:
    print(sent)  # one detected sentence per line
# Observation: sentences need a space after the terminator (. ? !) to be
# split. In the recorded output everything after "hometown?" stays together
# as a single sentence.

This is a test for space sentences.
OK, try this: when will you back to your hometown?
This have no space with the last sentence.再来一句中文. 你能认出来么, 这里会被分成几句


In [20]:
# Lemmatization test.
doc3 = nlp(u"Windows Authentication is not enabled so I cannot login. Or is there a better solution exists?")
for token in doc3:
    # token.lemma prints as an integer ID, token.lemma_ as the lemma string
    print(token, token.lemma, token.lemma_)
# Observations from the output:
#   - lemmas are lower-cased
#   - is     --> be
#   - better --> good
#   - exists --> exist
#   - punctuation is not removed

Windows 10646332009431770838 windows
Authentication 9067479397929045832 authentication
is 10382539506755952630 be
not 447765159362469301 not
enabled 1080083029942854337 enable
so 9781598966686434415 so
I 561228191312463089 -PRON-
can 6635067063807956629 can
not 447765159362469301 not
login 7612924703817238883 login
. 12646065887601541794 .
Or 3740602843040177340 or
is 10382539506755952630 be
there 2112642640949226496 there
a 11901859001352538922 a
better 5711639017775284443 good
solution 8054865091517299512 solution
exists 2808794269826886559 exist
? 8205403955989537350 ?


In [22]:
# Part-of-speech tagging test (reuses doc3 from the lemmatization cell).
for token in doc3:
    # token.pos prints as an integer ID, token.pos_ as the tag name
    print(token, token.pos, token.pos_)
# token.pos_ is the part of speech.
# It can be used to filter out punctuation (tag PUNCT).

Windows 95 PROPN
Authentication 95 PROPN
is 99 VERB
not 85 ADV
enabled 99 VERB
so 84 ADP
I 94 PRON
can 99 VERB
not 85 ADV
login 99 VERB
. 96 PUNCT
Or 88 CCONJ
is 99 VERB
there 85 ADV
a 89 DET
better 83 ADJ
solution 91 NOUN
exists 99 VERB
? 96 PUNCT


In [27]:
# Named entity recognition test.
doc4=nlp(u"Andrew Yan-Tak Ng (Chinese: 吳恩達; born 1976) is a Chinese American computer scientist. He is the former chief scientist at Baidu, where he led the company's Artificial Intelligence Group. He is an adjunct professor (formerly associate professor) at Stanford University. Ng is also the co-founder and chairman of Coursera, an online education platform")
for ent in doc4.ents:
    # ent.label prints as an integer ID, ent.label_ as the label name
    print(ent, ent.label, ent.label_)

# Labels seen in the output:
# PERSON: names of people
# NORP: nationalities, religious or political groups
# DATE: dates
# ORG: organisations
# GPE: countries, cities, states (note: Coursera is labelled GPE in this run)
# Idea: this could be used to recognise entities in log files; find a way to
# add custom recognition for IP addresses and timestamps.

Andrew Yan-Tak Ng 378 PERSON
Chinese 379 NORP
1976 388 DATE
Chinese 379 NORP
American 379 NORP
Baidu 381 ORG
Artificial Intelligence Group 381 ORG
Stanford University 381 ORG
Coursera 382 GPE


In [25]:
# Noun chunk test: print every noun phrase found in doc4.
# The loop variable is called `chunk` rather than `np` so that the cell does
# not leave a kernel-global named `np` (the conventional numpy alias) behind.
for chunk in doc4.noun_chunks:
    print(chunk)

Andrew Yan-Tak Ng
a Chinese American computer scientist
He
the former chief scientist
Baidu
he
the company's Artificial Intelligence Group
He
an adjunct professor
formerly associate professor
Stanford University
Ng
the co-founder and chairman
Coursera
an online education platform


In [37]:
# Word vector test: compute the similarity of two words from their vectors.
Andrew = doc4[0]  # tokens taken from the Doc are already represented as vectors
computer = doc4[17]
scientist =doc4[18]
professor="professor"  # a plain string, so (per the author's note) this does not work
print(Andrew.similarity(scientist))
print(Andrew.similarity(computer))
print(computer.similarity(scientist))
# Disabled on purpose: `professor` is a str, not a token from the Doc.
#print(Andrew.similarity(professor))

-0.040077213
0.16391583
0.6350539


In [40]:
## 提取实体关系
from __future__ import unicode_literals, print_function

import plac
import spacy


# Sample sentences mentioning monetary amounts; main() runs the currency
# relation extraction over each of them.
TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
def main(model='en_core_web_sm'):
    """Load a spaCy model and print the currency relations found in TEXTS.

    model: name of the model handed to spacy.load().

    Every relation is printed as one tab-separated row: the text of the
    related token (left-aligned, padded to 10 characters), the entity type of
    the money token and the text of the money token.
    """
    pipeline = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    row_template = '{:<10}\t{}\t{}'
    for sentence in TEXTS:
        parsed = pipeline(sentence)
        for related, amount in extract_currency_relations(parsed):
            print(row_template.format(related.text, amount.ent_type_, amount.text))


def extract_currency_relations(doc):
    """Pair every MONEY token in `doc` with the token it relates to.

    All entities and noun chunks of `doc` are first merged into single
    tokens (this modifies `doc`). Then, for each token whose entity type is
    'MONEY':

    * if its dependency label is 'attr' or 'dobj', it is paired with the
      first 'nsubj' among the left children of its head (skipped when there
      is none);
    * if it is a 'pobj' whose head is a 'prep', it is paired with the head
      of that preposition.

    Returns a list of (related_token, money_token) tuples in document order.
    """
    # Build the full list of spans before merging anything, then collapse
    # each one into a single token.
    for phrase in list(doc.ents) + list(doc.noun_chunks):
        phrase.merge()

    found = []
    for token in doc:
        if token.ent_type_ != 'MONEY':
            continue
        role = token.dep_
        if role in ('attr', 'dobj'):
            subjects = [left for left in token.head.lefts if left.dep_ == 'nsubj']
            if subjects:
                found.append((subjects[0], token))
        elif role == 'pobj' and token.head.dep_ == 'prep':
            found.append((token.head.head, token))
    return found

# Run the demo with the default model ('en_core_web_sm').
main()

Loaded model 'en_core_web_sm'
Processing 2 texts
Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b


In [41]:
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
based on list of single or multiple-word company names. Companies are
labelled as ORG and their spans are merged into one token. Additionally,
._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
respectively.
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


@plac.annotations(
    text=("Text to process", "positional", None, str),
    companies=("Names of technology companies", "positional", None, str))
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the tech-company recognizer over `text` and print what it found.

    text: the sentence to process.
    companies: company names to recognise; a small built-in list is used
        when none are given.
    """
    # Fall back to the default company names if none were passed in.
    companies = companies or ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.

    # Start from the bare English Language class: no model and no pre-defined
    # pipeline, only the custom component appended at the end.
    pipeline = English()
    recognizer = TechCompanyRecognizer(pipeline, companies)
    pipeline.add_pipe(recognizer, last=True)

    parsed = pipeline(text)
    # The pipeline lists the component under its name.
    print('Pipeline', pipeline.pipe_names)
    # Company names from the list come out merged into single tokens.
    print('Tokens', [tok.text for tok in parsed])
    # True when the Doc contains at least one tech org.
    print('Doc has_tech_org', parsed._.has_tech_org)
    # With the default text, token 0 is "Alphabet Inc." and token 1 is "is".
    print('Token 0 is_tech_org', parsed[0]._.is_tech_org)
    print('Token 1 is_tech_org', parsed[1]._.is_tech_org)
    # Every matched company is reported as an entity.
    print('Entities', [(found.text, found.label_) for found in parsed.ents])


class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on a list of single or multiple-word company names. Companies are
    labelled as ORG (by default) and their spans are merged into one token.
    Additionally, ._.has_tech_org is set on the Doc/Span and ._.is_tech_org
    on the Token."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component.

        nlp: the shared Language instance; it supplies the vocab for the
            matcher, the label ID, and turns each company name into a Doc
            that serves as a phrase-match pattern.
        companies: iterable of company names to recognise.
        label: entity label (string) assigned to every match.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # The PhraseMatcher takes Doc objects as patterns.
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Extensions are registered on the Token/Doc/Span classes, not on
        # this instance, and spaCy releases that reject duplicate names raise
        # when the same extension is registered twice. Guard every
        # registration so the component can be constructed more than once
        # (for example when the notebook cell is re-run).
        #
        # Token attribute: overwritten per match, so only a default is set.
        if not Token.has_extension('is_tech_org'):
            Token.set_extension('is_tech_org', default=False)

        # Doc and Span attributes: computed by a getter that checks whether
        # any contained token has is_tech_org == True. The getter reads no
        # instance state, so keeping an earlier registration is equivalent.
        if not Doc.has_extension('has_tech_org'):
            Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        if not Span.has_extension('has_tech_org'):
            Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans so they can be merged afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
        if spans:
            # Extend doc.ents with all new entities in one assignment; the
            # existing entities are kept, not replaced.
            doc.ents = list(doc.ents) + spans
        for span in spans:
            # Merge each span into one token. This is done after setting the
            # entities, otherwise the indices would no longer match.
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when the attribute is
        accessed, it can refer to the Token's 'is_tech_org' attribute, which
        is set in the processing step."""
        return any(t._.get('is_tech_org') for t in tokens)

# Run the demo with the default text and the default company list.
main()
    # Expected output:
    # Pipeline ['tech_companies']
    # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
    # Doc has_tech_org True
    # Token 0 is_tech_org True
    # Token 1 is_tech_org False
    # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]

Pipeline ['tech_companies']
Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
Doc has_tech_org True
Token 0 is_tech_org True
Token 1 is_tech_org False
Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]


In [42]:
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy


# Training data: a list of (text, annotations) pairs. Each entry under
# 'entities' is (start_char, end_char, label), with end_char exclusive --
# e.g. characters 7..17 of the first text are "Shaka Khan".
TRAIN_DATA = [
    ('Who is Shaka Khan?', {
        'entities': [(7, 17, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    model: name of an existing spaCy model to load; when None a blank 'en'
        Language object is created instead.
    output_dir: directory the trained pipeline is saved to (and reloaded
        from as a check); nothing is saved when None.
    n_iter: number of passes over TRAIN_DATA.

    The losses are printed after every pass, and the entities and tokens
    predicted for each training text are printed once training is done.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels; an example without an 'entities' key contributes none
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is also called when an existing
        # model was loaded -- confirm that this does not reset its weights.
        optimizer = nlp.begin_training()
        # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder
        # the module-level list, so the report below (and any later use of
        # TRAIN_DATA) would come out in a random order.
        examples = list(TRAIN_DATA)
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # create the directory (and any missing parents) if it is not there
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
            print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

# Train with the defaults: blank 'en' model, no output directory, 100 passes.
main()
    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]

Created blank 'en' model
{'ner': 19.558236360549927}
{'ner': 14.745993072603596}
{'ner': 11.542696918196498}
{'ner': 14.38020922677226}
{'ner': 9.85609424795561}
{'ner': 3.5626718958740184}
{'ner': 3.6819251688477435}
{'ner': 6.9809978348852395}
{'ner': 8.36336357941655}
{'ner': 8.520106192316252}
{'ner': 8.797412687945833}
{'ner': 12.88454615854036}
{'ner': 3.72887843955949}
{'ner': 0.7986601515369851}
{'ner': 2.5107453060453935}
{'ner': 1.9720739553377198}
{'ner': 1.0585482862800424}
{'ner': 1.9833813895683194}
{'ner': 0.9036725126217289}
{'ner': 2.481782546589562}
{'ner': 0.00012695567780440168}
{'ner': 0.08297985243381768}
{'ner': 0.4068829501889216}
{'ner': 0.02961837654670605}
{'ner': 1.9997692670523712}
{'ner': 0.0002583125256980492}
{'ner': 0.8201337977707732}
{'ner': 1.801046368404278}
{'ner': 5.517401668892613e-06}
{'ner': 8.493352189590618e-12}
{'ner': 0.0018518639206181102}
{'ner': 0.13071620679875195}
{'ner': 6.538074707778533e-14}
{'ner': 0.0864296929161509}
{'ner': 0.002

In [43]:
# Training your own new entity type: https://github.com/explosion/spacy/blob/master/examples/training/train_new_entity_type.py

### 参考