In [1]:
import os
from sklearn.datasets import fetch_20newsgroups
import spacy
from spacy import displacy
import pandas as pd

In [2]:
# Names of the spaCy pipeline components this notebook works with; iterated
# over right after the language model is loaded.
PIPELINE = ['tagger', 'parser', 'ner']

In [3]:
# Fetch the training split with headers, footers and quoted replies removed,
# keeping only the raw message bodies.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
texts = newsgroups_train.data

# Number of documents available.
print(len(texts))

11314


In [4]:
# Flatten every post onto a single line by turning newlines into spaces.
texts = [post.replace('\n', ' ') for post in texts]

In [5]:
nlp = spacy.load('en')

# The previous loop called nlp.create_pipe() for each name and threw the result
# away, so it never changed the pipeline. Instead, verify that the loaded model
# actually provides every component listed in PIPELINE and fail loudly if not.
missing_components = [name for name in PIPELINE if name not in nlp.pipe_names]
if missing_components:
    raise ValueError(
        'Loaded model lacks pipeline components: {}'.format(missing_components)
    )

In [13]:
# Display the (name, component) pairs currently registered on the pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7f5b7ff68cc0>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f5b7fee8200>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f5b7fee8258>)]

In [6]:
%%time

# Baseline timing: run the pipeline on the first 50 texts, one call per text.
sample_texts = texts[:50]
docs = [nlp(text) for text in sample_texts]

CPU times: user 7.72 s, sys: 6.9 s, total: 14.6 s
Wall time: 3.89 s


In [7]:
%%time

# Same 50 texts, this time fed through nlp.pipe() and materialised into a list.
# NOTE(review): n_threads is deprecated in newer spaCy releases — confirm it is
# still honoured by the installed version.
docs = list(nlp.pipe(texts[:50], n_threads=6))

CPU times: user 5.93 s, sys: 3.89 s, total: 9.82 s
Wall time: 3.15 s


## POS

In [8]:
# Per-token annotations of one sample document (index 10).
# Rows are collected first and the frame is built in a single call:
# DataFrame.append copied the whole frame on every iteration (quadratic) and
# was removed in pandas 2.0.
pos_columns = ['Text', 'Lemma', 'POS', 'TAG', 'DEP', 'Shape', 'Stop Word']

pos_rows = [
    (token.text, token.lemma_, token.pos_, token.tag_,
     token.dep_, token.shape_, token.is_stop)
    for token in docs[10]
]

# Passing `columns` keeps the column order and yields an empty, correctly
# labelled frame when the document has no tokens.
df_pos = pd.DataFrame(pos_rows, columns=pos_columns)

# Display the whole table (the former head(1000000) returned every row anyway).
df_pos

Unnamed: 0,Text,Lemma,POS,TAG,DEP,Shape,Stop Word
0,I,-PRON-,PRON,PRP,nsubj,X,False
1,have,have,VERB,VBP,ROOT,xxxx,True
2,a,a,DET,DT,det,x,True
3,line,line,NOUN,NN,dobj,xxxx,False
4,on,on,ADP,IN,prep,xx,True
5,a,a,DET,DT,det,x,True
6,Ducati,ducati,PROPN,NNP,nmod,Xxxxx,False
7,900GTS,900gts,NUM,CD,compound,dddXXX,False
8,1978,1978,NUM,CD,nummod,dddd,False
9,model,model,NOUN,NN,pobj,xxxx,False


## NER

### [annotation types](https://spacy.io/usage/linguistic-features#entity-types)

In [9]:
# Divider printed after each document so consecutive renderings are easy to
# tell apart.
separator = '============================================================================================================='

for doc in docs:
    # Entity view of the document, rendered into the notebook output.
    displacy.render(doc, style='ent', jupyter=True)
    print(separator)

























  "__main__", mod_spec)


















  "__main__", mod_spec)






























































In [11]:
# Named entities found in one sample document (index 10), with their character
# offsets and labels.
# Rows are collected first and the frame is built in a single call:
# DataFrame.append copied the whole frame on every iteration (quadratic) and
# was removed in pandas 2.0.
ent_columns = ['Text', 'Start Char', 'End Char', 'Label']

ent_rows = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in docs[10].ents
]

# Passing `columns` keeps the column order and yields an empty, correctly
# labelled frame when the document has no entities.
df_ent = pd.DataFrame(ent_rows, columns=ent_columns)

# Display the whole table (the former head(10000000) returned every row anyway).
df_ent

Unnamed: 0,Text,Start Char,End Char,Label
0,Ducati,19,25,GPE
1,900GTS 1978,26,37,DATE
2,17k,49,52,DATE
3,1st,163,166,DATE
4,1,251,252,CARDINAL
5,3495,281,285,MONEY
6,3K.,316,319,MONEY
7,,320,321,NORP
8,Beemer,410,416,PERSON
9,Axis Motors,460,471,ORG


## Dependency Parsing

In [10]:
# Render one dependency diagram per sentence of the sample document (index 10).
dep_options = {'compact': True}

for sentence in docs[10].sents:
    print(sentence)
    # Turn the sentence span into a standalone Doc so it is rendered on its own.
    sentence_doc = sentence.as_doc()
    # Attach the sentence text under the 'title' key of the Doc's user data
    # (presumably picked up by displaCy as the diagram heading — confirm).
    sentence_doc.user_data['title'] = sentence.text
    displacy.render(sentence_doc, style='dep', jupyter=True, options=dep_options)

I have a line on a Ducati 900GTS 1978 model with 17k on the clock.  


Runs very well, paint is the bronze/brown/orange faded out, leaks a bit of oil and pops out of 1st with hard accel.  


The shop will fix trans and oil  leak.  


They sold the bike to the 1 and only owner.  


They want $3495, and I am thinking more like $3K.  Any opinions out there?  


Please email me.


Thanks.  


It would be a nice stable mate to the Beemer.  


Then I'll get a jap bike and call myself Axis Motors!  


--  -----------------------------------------------------------------------


"Tuba" (Irwin)      


"I honk therefore I am"     CompuTrac-Richardson,Tx irwin@cmptrc.lonestar.org    DoD #0826          (R75/6)
