In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [12]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [15]:
from sklearn.feature_extraction import stop_words

sorted(list(stop_words.ENGLISH_STOP_WORDS))[:5]

['a', 'about', 'above', 'across', 'after']

In [16]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hongdouli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
from nltk import stem

In [18]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [19]:
word_list = ['feet', 'foot', 'foots', 'footing']

In [20]:
[wnl.lemmatize(word) for word in word_list]

['foot', 'foot', 'foot', 'footing']

In [25]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [21]:
import spacy
from spacy.lemmatizer import Lemmatizer
lemmatizer = Lemmatizer()
[lemmatizer.lookup(word) for word in word_list]

['feet', 'foot', 'foots', 'footing']

In [None]:
nlp = spacy.load("en_core_web_sm")


In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

In [23]:
vectorizer = CountVectorizer(stop_words='english')

In [26]:
vectors = vectorizer.fit_transform(newsgroups_train.data).todense() # (documents, vocab)
vectors.shape #, vectors.nnz / vectors.shape[0], row_means.shape

(2034, 26576)

In [27]:
vocab = np.array(vectorizer.get_feature_names())

In [4]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [5]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [6]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [7]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [8]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [9]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
#pprint(iob_tagged)

In [10]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
#print(ne_tree)

In [11]:
ne_tree = conlltags2tree(iob_tagged)
#print (ne_tree)

In [100]:
from nltk import word_tokenize, pos_tag, ne_chunk
ne_tree = ne_chunk(pos_tag(word_tokenize(text)))
 
iob_tagged = tree2conlltags(ne_tree)
print (iob_tagged)

[('Another', 'DT', 'O'), ('tech', 'NN', 'O'), ('start-up', 'NN', 'O'), ('made', 'VBD', 'O'), ('a', 'DT', 'O'), ('blockbuster', 'NN', 'O'), ('debut', 'NN', 'O'), ('on', 'IN', 'O'), ('the', 'DT', 'O'), ('US', 'NNP', 'B-ORGANIZATION'), ('stock', 'NN', 'O'), ('market', 'NN', 'O'), ('last', 'JJ', 'O'), ('week', 'NN', 'O'), (',', ',', 'O'), ('with', 'IN', 'O'), ('shares', 'NNS', 'O'), ('in', 'IN', 'O'), ('Slack', 'NNP', 'B-GPE'), ('ending', 'VBG', 'O'), ('the', 'DT', 'O'), ('week', 'NN', 'O'), ('more', 'JJR', 'O'), ('than', 'IN', 'O'), ('40', 'CD', 'O'), ('%', 'NN', 'O'), ('higher.That', 'WDT', 'O'), ('values', 'VBZ', 'O'), ('the', 'DT', 'O'), ('Silicon', 'NNP', 'B-ORGANIZATION'), ('Valley-based', 'JJ', 'O'), ('business', 'NN', 'O'), ('at', 'IN', 'O'), ('$', '$', 'O'), ('20', 'CD', 'O'), ('bn', 'NN', 'O'), (',', ',', 'O'), ('not', 'RB', 'O'), ('bad', 'JJ', 'O'), ('for', 'IN', 'O'), ('a', 'DT', 'O'), ('messaging', 'NN', 'O'), ('app', 'NN', 'O'), ('that', 'WDT', 'O'), ('was', 'VBD', 'O'), ('on

In [101]:
[x for x in iob_tagged if x[2] != 'O']

[('US', 'NNP', 'B-ORGANIZATION'),
 ('Slack', 'NNP', 'B-GPE'),
 ('Silicon', 'NNP', 'B-ORGANIZATION')]

In [84]:
print(ne_chunk(pos_tag(word_tokenize(text))))

(S
  Another/DT
  tech/NN
  start-up/NN
  made/VBD
  a/DT
  blockbuster/NN
  debut/NN
  on/IN
  the/DT
  (ORGANIZATION US/NNP)
  stock/NN
  market/NN
  last/JJ
  week/NN
  ,/,
  with/IN
  shares/NNS
  in/IN
  (GPE Slack/NNP)
  ending/VBG
  the/DT
  week/NN
  more/JJR
  than/IN
  40/CD
  %/NN
  higher.That/WDT
  values/VBZ
  the/DT
  (ORGANIZATION Silicon/NNP)
  Valley-based/JJ
  business/NN
  at/IN
  $/$
  20/CD
  bn/NN
  ,/,
  not/RB
  bad/JJ
  for/IN
  a/DT
  messaging/NN
  app/NN
  that/WDT
  was/VBD
  only/RB
  publicly/RB
  released/VBN
  in/IN
  2014/CD
  and/CC
  has/VBZ
  never/RB
  turned/VBN
  a/DT
  profit/NN
  ./.)


In [59]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [11]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [16]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [18]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html)
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)


172

In [19]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 82,
         'GPE': 16,
         'CARDINAL': 5,
         'ORG': 39,
         'DATE': 23,
         'NORP': 2,
         'ORDINAL': 1,
         'FAC': 1,
         'PRODUCT': 3})

In [20]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

In [29]:
sentences = [x for x in article.sents]
print(sentences[67])

Strzok, 48, a graduate of Georgetown University, served as an officer in the Army before he joined the F.B.I.


In [25]:
displacy.render(nlp(str(sentences[67])), jupyter=True, style='ent')

In [30]:
displacy.render(nlp(str(sentences[67])), style='dep', jupyter = True, options = {'distance': 120})


In [32]:
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')

In [33]:
import pandas as pd

In [34]:
df = pd.read_csv('Combined_News_DJIA.csv')

In [36]:
df.columns

Index(['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7',
       'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
       'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23',
       'Top24', 'Top25'],
      dtype='object')

In [37]:
df

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...
5,2008-08-15,1,"b""Mom of missing gay man: Too bad he's not a 2...","b""Russia: U.S. Poland Missile Deal Won't Go 'U...","b""The government has been accused of creating ...",b'The Italian government has lashed out at an ...,b'Gorbachev: Georgia started conflict in S. Os...,"b""China fakes more than your girlfriend; 'Ethn...","b""The UN's criticism of freedom of expression ...",b'Russian general threatens nuclear strike on ...,...,b'Why are redditors generally supportive of Ru...,b'Johann Hari: We need to stop being such cowa...,b'US officials have said that their military p...,b'Israel clears troops who killed Reuters came...,b'Unenforceable laws encourage cops to escalat...,b'What Chinese pollution really looks like',"b'Hacker Kidnaps and Tortures Informant, Posts...",b'Bush Tells Putin: This Aggression Will Not S...,b'Georgia is all about the oil pipelines',b'Rivals say they plan to remove Georgian pres...
6,2008-08-18,0,"b'In an Afghan prison, the majority of female ...","b""Little girl, you're not ugly; they are""","b""Pakistan's Musharraf to Resign, Leave the Co...","b'Tornado throws a bus in Poland, captured by ...","b""Britain's terror laws have left me and my fa...","b""Iran 'fires satellite into space'""",b'Rights of Non-Muslims restricted by new Mald...,b'Tour of Tskhinvali undercuts Russian version...,...,b'MI5 seeks gay spies',b' New porn channel lets Canadians strut their...,b'The Dangerous Neighbor: Vladimir Putin Takes...,b'Israel opinion page: Russians are saner.',"b""NATO's Hour""",b'Georgian President Saakashvili Eats His Tie ...,b'No Chicken Left Behind: Animal RFID Surveill...,b'Putin has given us an order that everyone mu...,b'National DNA database grows on the genes of ...,b'Mayor Asks Ugly Women To Visit His Town'
7,2008-08-19,0,"b""Man arrested and locked up for five hours af...",b'The US missile defence system is the magic p...,b'Schrder lambasted for blaming Russian confli...,b'Officials: 10 French soldiers killed near Ka...,b'These ten laws make China a totalitarian was...,b'Russia seizes US vehicles',"b""Muslims are only 4% of Denmark's 5.4 million...",b'Taliban Forces Kill 10 French Soldiers and R...,...,b'Brazil Will Play Military War Game to Defend...,"b'16,000 fine for British woman caught sharing...",b'102-year-old grandma is oldest person on Fac...,b'Today 5 years ago - August 19th 2003. Bombin...,"b'US national Ken Haywood, whose computer was ...",b' Taliban kill 10 French troops near Afghan c...,b'Not Everybody Loves Offshore Wind Power in S...,b'Taliban Forces Kill 10 French Soldiers and R...,b'Pakistan is more democratic than America. ',b'Blaze engulfs Egyptian parliament'
8,2008-08-20,1,b'Two elderly Chinese women have been sentence...,b'The Power of Islam: The Human Rights Council...,"b""We had 55 times more military soldiers in th...","b'""I live here on less than a dollar a month"" ...",b'Russia sends aircraft carrier to Syria.',b'The American people should be eternally grat...,b'Abkhazia officially appeals to Russia for in...,"b'Russia warns of response ""beyond diplomacy"" ...",...,b'Grote Markt [PIC]',b'Russia has informed Norway that it plans to ...,"b""'What Are the Aims of this War?': French Opp...",b'Bush Covered up Musharraf Ties with Al Qaeda',b'Mikhail Gorbachev: Russia Never Wanted a War',b'Germans urge tougher laws after new privacy ...,b'The Time of the Wimps: Dialogue with Russia ...,b'1998 Missile Strikes on Bin Laden May Have B...,"b""For a moment let's forget everything else an...",b'The First Solar Radio Station in Argentina'
9,2008-08-21,1,"b""British resident held in Guantanamo Bay wins...",b'Chinese may have killed 140 Tibetans this we...,b'U.S. Navy Ships Head to Georgia',b'Hacker uncovers Chinese olympic fraud',"b""If you've ever wondered what Kim Jong Il was...","b""Russia's Nuclear Threat Is More Than Words""","b'Czech President: ""I must protest aloud again...",b'50% Of All Food Produced Is Wasted Before It...,...,b'Russia condemns US missile deal',b'NATOs decision to freeze relations with Mosc...,"b'Sweet Sixteen or Fraudulent Fourteen, Hacke...","b'If Russias feeling churlish, they can pretty...","b'Chinese Gymnasts 14, Official Document Shows'",b'Suicide attack kills at least 50 at Pakistan...,b'The Abkhazian Parliament has approved an off...,"b'Georgia, Bulgaria and the Second Balkan War ...","b""Terrorist reveals Pak's sinister designs on ...","b""International Olympic Committee launches pro..."


In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [52]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn import cross_val_score
#from sklearn import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

ImportError: cannot import name 'cross_val_score' from 'sklearn' (/Users/hongdouli/anaconda3/lib/python3.7/site-packages/sklearn/__init__.py)

In [42]:
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [43]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

CPU times: user 2.07 s, sys: 115 ms, total: 2.18 s
Wall time: 2.21 s


In [44]:
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [45]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]


In [46]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'melbourne',
 'word[-3:]': 'rne',
 'word[-2:]': 'ne',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'BOS': True,
 '+1:word.lower()': '(',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp'}

In [47]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


CPU times: user 953 ms, sys: 100 ms, total: 1.05 s
Wall time: 1.07 s


In [48]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 30.7 s, sys: 189 ms, total: 30.9 s
Wall time: 30.9 s


In [49]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [50]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.7964686316443963

In [51]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)


NameError: name 'RandomizedSearchCV' is not defined

In [116]:
import spacy

text = "Another tech start-up made a blockbuster debut on the US stock \
market last week, with shares in Slack ending the week more than 40% higher.\
That values the Silicon Valley-based business at $20 bn, not bad for a messaging \
app that was only publicly released in 2014 and has never turned a profit."

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

US GPE
last week DATE
Slack ORG
the week DATE
more than 40% PERCENT
the Silicon Valley LOC
20 MONEY
2014 DATE


In [65]:
labels = [x.label_ for x in doc.ents]
Counter(labels)

Counter({'GPE': 1, 'DATE': 3, 'ORG': 1, 'PERCENT': 1, 'LOC': 1, 'MONEY': 1})

In [68]:
displacy.render(nlp(str(text.split('.')[0])), style='dep', jupyter = True, options = {'distance': 120})

In [73]:
displacy.render(nlp(text), jupyter=True, style='ent')

In [76]:
from nltk.tag import StanfordNERTagger
stanford_ner_tagger = StanfordNERTagger(
    ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz',
    ner_dir + 'stanford-ner-3.9.1.jar')

NameError: name 'ner_dir' is not defined

In [106]:
text += "Basically, workplace communication based around email is ripe for a shake-up - and many believe Slack will lead the way.\
Indeed, headline writers would have you believe that Slack is the 'Email Killer'. But even boss Stewart Butterfield doesn't go that far.\
He does, though, think his Searchable Log of All Communication and Knowledge (Slack) software can revolutionise the way employees communicate."

In [117]:
text += "Mr Butterfield, the entrepreneur behind the Flickr photo app that he sold to Yahoo, is sympathetic if people struggle to get their heads around this new way of communication."

In [118]:
text += "Still, the company has 100,000 paying customers, plus many more using a free basic service, with the number of active daily users put at 10 million. \
The biggest corporate customers pay at least $100,000 a year for the service. But Slack has never made a profit. Although revenue rose 80% to $400m in 2018, losses were $144m."

In [119]:
displacy.render(nlp(text), jupyter=True, style='ent')