In [30]:
import pandas as pd
import numpy as np
from lib.utility import get_text, ProcessPipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import pickle
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

### Part 1: get raw text data 

In [31]:
### Load the raw news articles from the pickle file.
### NOTE(security): pickle.load can execute arbitrary code while unpickling,
### so only load files that come from a trusted source.
### NOTE(review): `texts` looks like a list of article strings (later cells map over it
### and index element 0 as a string) -- confirm.
with open('./data/news.pickle', 'rb') as handle:
    texts = pickle.load(handle) 

In [5]:
### Run the project's preprocessing pipeline (lib.utility.ProcessPipeline) over every article.
pipeline = ProcessPipeline(texts)

### return_str=False: judging by the sample output in the next cells, each document comes back
### as a list of tokens rather than one string -- confirm in lib.utility.
textsProcessed = pipeline.run(return_str=False)

In [32]:
### Peek at the first ten tokens of the first processed document.
textsProcessed[0][:10]

['forum',
 'address',
 'one',
 'press',
 'issu',
 'lifetim',
 'global',
 'energi',
 'climat',
 'chang']

In [33]:
### Number of processed documents.
len(textsProcessed)

2988

### Part 2: Sequence Tagging

### 2.1 NLTK sequence tagging

### POS: part-of-speech tagging

In [8]:
### POS-tag the tokens of the first processed document.
### NOTE: these tokens are already preprocessed (e.g. 'issu', 'energi'), so the tagger is fed
### non-words and the resulting tags are unreliable (see 'energi' tagged VBP in the output);
### compare with tagging the raw sentence in the cells that follow.
print(pos_tag(textsProcessed[0]))

[('forum', 'NN'), ('address', 'NN'), ('one', 'CD'), ('press', 'NN'), ('issu', 'NN'), ('lifetim', 'NN'), ('global', 'JJ'), ('energi', 'NN'), ('climat', 'NN'), ('chang', 'NN'), ('india’', 'NN'), ('develop', 'VB'), ('undoubtedli', 'JJ'), ('fuell', 'NN'), ('increas', 'NNS'), ('energi', 'VBP'), ('consumpt', 'JJ'), ('econom', 'NN'), ('develop', 'VB'), ('beli', 'JJ'), ('grow', 'NN'), ('problem', 'NN'), ('climat', 'NN'), ('chang', 'NN'), ('caus', 'VBP'), ('CO', 'NNP'), ('emiss', 'JJ'), ('includ', 'NN'), ('increas', 'NNS'), ('earth’', 'VBP'), ('mean', 'JJ'), ('surfac', 'NN'), ('temperatur', 'NN'), ('also', 'RB'), ('known', 'VBN'), ('global', 'JJ'), ('warm', 'NN'), ('rise', 'NN'), ('sea', 'NN'), ('level', 'NN'), ('acidif', 'NN'), ('extrem', 'VBP'), ('weather', 'NN'), ('event', 'NN'), ('increas', 'NNS'), ('global', 'JJ'), ('temperatur', 'JJ'), ('loss', 'NN'), ('polar', 'JJ'), ('ice', 'NN'), ('strongli', 'NN'), ('link', 'VBP'), ('anthropogen', 'NN'), ('activ', 'NN'), ('particularli', 'NN'), ('CO',

In [36]:
### Raw (unprocessed) example sentence used to demonstrate tokenization and tagging on natural text.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

# Tokenization: split the sentence into a list of words, which is the input to sequence tagging

In [42]:
### Split the example sentence into word tokens; note that '$' and '5.1' become separate tokens.
nltk.word_tokenize(ex)

['European',
 'authorities',
 'fined',
 'Google',
 'a',
 'record',
 '$',
 '5.1',
 'billion',
 'on',
 'Wednesday',
 'for',
 'abusing',
 'its',
 'power',
 'in',
 'the',
 'mobile',
 'phone',
 'market',
 'and',
 'ordered',
 'the',
 'company',
 'to',
 'alter',
 'its',
 'practices']

In [10]:
def tagSec(sent):
    """Tokenize a raw sentence with NLTK and return its POS-tagged tokens.

    Args:
        sent: the sentence as a plain string.

    Returns:
        Whatever ``nltk.pos_tag`` yields for the tokenized sentence; the
        notebook output shows a list of ``(token, tag)`` tuples.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [11]:
### Tag the raw example sentence; with real words in context the tags are far more sensible.
print(tagSec(ex))

[('European', 'JJ'), ('authorities', 'NNS'), ('fined', 'VBD'), ('Google', 'NNP'), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')]


In [12]:
### Explanation of the tags: print NLTK's built-in description of the Penn Treebank tag set
### (the tag names such as NN, VBD, JJ that pos_tag returns above).
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

### 2.2 spaCy NLP: tagging and entity recognition

Install the English model first: `!python -m spacy download en` <br>
Available models and tutorial: https://spacy.io/models/

In [43]:
from pprint import pprint

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

### Load the English spaCy pipeline through the 'en' shortcut name.
### NOTE(review): the 'en' shortcut only exists once `python -m spacy download en` has created the
### link, and newer spaCy releases dropped shortcut names in favour of the full package name
### (e.g. 'en_core_web_sm') -- confirm against the installed spaCy version.
nlp = spacy.load('en')

In [14]:
### Build a fresh pipeline (no texts bound) and summarize every raw article with it.
### Note that this rebinds the notebook-level name `pipeline` used in Part 1.
pipeline = ProcessPipeline()
textsSummarized = [pipeline.summarize(text) for text in texts]

In [44]:
### Use the first summarized article as the running example for the spaCy demos below.
eg = textsSummarized[0]
eg

'The forum addressed one of the most pressing issues of our lifetimes - global energy and climate change.\nIndia’s development will undoubtedly be fuelled by an increase in energy consumption, but this economic development belies a growing problem - climate change caused by CO2 emissions.\nThis includes increase in Earth’s mean surface temperature (also known as global warming), rise in sea level and acidification, extreme weather events, and so on.\nWhile the increase in global temperature and loss of polar ice has been strongly linked to anthropogenic activities (particularly CO2 emissions), there is no consensus among researchers about the link between extreme weather events such as forest fires, cyclones, droughts etc and anthropogenic causes.\nNow, concerning the timeline, it is expected that global temperatures will increase by over 2 degrees C by 2040 if emissions continue as before, well within our lifetimes for most of us reading this article.\nMillions of Indians live off the

#### Token api

In [45]:
### Run the spaCy pipeline on the example text; the resulting Doc is what the token, entity
### and sentence cells below read from.
doc = nlp(eg)

In [46]:
### Displaying the Doc shows its original text.
doc

The forum addressed one of the most pressing issues of our lifetimes - global energy and climate change.
India’s development will undoubtedly be fuelled by an increase in energy consumption, but this economic development belies a growing problem - climate change caused by CO2 emissions.
This includes increase in Earth’s mean surface temperature (also known as global warming), rise in sea level and acidification, extreme weather events, and so on.
While the increase in global temperature and loss of polar ice has been strongly linked to anthropogenic activities (particularly CO2 emissions), there is no consensus among researchers about the link between extreme weather events such as forest fires, cyclones, droughts etc and anthropogenic causes.
Now, concerning the timeline, it is expected that global temperatures will increase by over 2 degrees C by 2040 if emissions continue as before, well within our lifetimes for most of us reading this article.
Millions of Indians live off the elect

In [47]:
### For every token print: surface text, lemma, stop-word flag, coarse POS (pos_) and
### fine-grained Penn-style tag (tag_).
for token in doc:
    print((token.text,token.lemma_,token.is_stop,token.pos_,token.tag_))

('The', 'the', False, 'DET', 'DT')
('forum', 'forum', False, 'NOUN', 'NN')
('addressed', 'address', False, 'VERB', 'VBD')
('one', 'one', True, 'NUM', 'CD')
('of', 'of', True, 'ADP', 'IN')
('the', 'the', True, 'DET', 'DT')
('most', 'most', True, 'ADV', 'RBS')
('pressing', 'pressing', False, 'ADJ', 'JJ')
('issues', 'issue', False, 'NOUN', 'NNS')
('of', 'of', True, 'ADP', 'IN')
('our', '-PRON-', True, 'ADJ', 'PRP$')
('lifetimes', 'lifetime', False, 'NOUN', 'NNS')
('-', '-', False, 'PUNCT', 'HYPH')
('global', 'global', False, 'ADJ', 'JJ')
('energy', 'energy', False, 'NOUN', 'NN')
('and', 'and', True, 'CCONJ', 'CC')
('climate', 'climate', False, 'NOUN', 'NN')
('change', 'change', False, 'NOUN', 'NN')
('.', '.', False, 'PUNCT', '.')
('\n', '\n', False, 'SPACE', '')
('India', 'india', False, 'PROPN', 'NNP')
('’s', '’s', False, 'PART', 'POS')
('development', 'development', False, 'NOUN', 'NN')
('will', 'will', True, 'VERB', 'MD')
('undoubtedly', 'undoubtedly', False, 'ADV', 'RB')
('be', 'be', 

#### Entity api

In [18]:
### Print each recognized entity with its label.
### NOTE: the output shows newline characters being labelled as GPE entities; such spurious
### hits have to be filtered out before the entities are used.
for X in doc.ents:
    print((X.text, X.label_))

('one', 'CARDINAL')
('\n', 'GPE')
('India', 'GPE')
('CO2', 'ORG')
('\n', 'GPE')
('Earth', 'LOC')
('\n', 'GPE')
('CO2', 'ORG')
('\n', 'GPE')
('Now', 'DATE')
('2 degrees C by', 'QUANTITY')
('2040', 'DATE')
('Millions', 'CARDINAL')
('Indians', 'NORP')
('\n', 'GPE')
('Global Energy Forum', 'ORG')
('\n', 'GPE')
('5', 'CARDINAL')
('6', 'CARDINAL')
('7', 'CARDINAL')


#### Sentence api

In [19]:
### Materialize the sentence spans that spaCy detected in the doc, then display them.
sentences = list(doc.sents)
sentences

[The forum addressed one of the most pressing issues of our lifetimes - global energy and climate change.,
 India’s development will undoubtedly be fuelled by an increase in energy consumption, but this economic development belies a growing problem - climate change caused by CO2 emissions.,
 This includes increase in Earth,
 ’s mean surface temperature (also known as global warming), rise in sea level and acidification, extreme weather events, and so on.,
 While the increase in global temperature and loss of polar ice has been strongly linked to anthropogenic activities (particularly CO2 emissions), there is no consensus among researchers about the link between extreme weather events such as forest fires, cyclones, droughts etc and anthropogenic causes.,
 Now, concerning the timeline, it is expected that global temperatures will increase by over 2 degrees C by 2040 if emissions continue as before, well within our lifetimes for most of us reading this article.,
 Millions of Indians live

#### Visualization

In [20]:
### Render the named entities of the first sentence inline ('ent' style).
### The sentence is converted to plain text and re-parsed so displacy receives a standalone Doc.
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

In [21]:
### Render the dependency parse ('dep' style) of the first sentence.
### 'distance' is forwarded to displacy as a rendering option (word spacing in the arc diagram).
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 100})

In [22]:
### Render the named entities of the whole document inline.
displacy.render(doc,jupyter=True,style='ent')

### Modularize

In [23]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
### Loads the 'en' model again (it was already loaded in section 2.2); presumably repeated so this
### cell can be lifted into a standalone module together with the imports above.
nlp = spacy.load('en')

class entityPipeline:
    """Extract named-entity and part-of-speech counts from texts with spaCy.

    For every text the pipeline builds one feature dictionary that merges:

    * entity features  -- ``{entity_text: {"count": n, "label": ent_label}}``
    * POS features     -- ``{lemma: {"count": n, "POS": coarse_pos}}``

    The two dictionaries are merged with ``dict.update``, so when an entity
    text and a lemma are the same string the POS entry replaces the entity
    entry (behavior kept from the original implementation).
    """

    # Entity labels that are dropped by extract_entity.
    _IGNORED_ENT_LABELS = ("CARDINAL", "DATE", "LAW")

    # Cache for the loaded spaCy model. Loading a model is expensive, so it is
    # done lazily and at most once instead of on every run()/process() call.
    _nlp = None

    def __init__(self,texts,targetPOSs=None):
        """Store the texts and the coarse POS tags to count.

        Args:
            texts: list of strings to be processed by ``run``.
            targetPOSs: coarse POS tags (spaCy ``token.pos_`` values) to count;
                defaults to ``['NOUN']``. A fresh list is created per instance
                so that mutating one instance's list cannot leak into others.
        """
        self.texts = texts
        self.targetPOSs = ['NOUN'] if targetPOSs is None else targetPOSs

    @classmethod
    def _get_nlp(cls):
        """Return the shared spaCy 'en' model, loading it on first use."""
        if cls._nlp is None:
            cls._nlp = spacy.load('en')
        return cls._nlp

    def _build_features(self,doc):
        """Merge the entity and POS dictionaries of one parsed document."""
        featureDic = {}
        featureDic.update(self.extract_entity(doc))
        # POS entries win on key collisions, exactly as in the original code.
        featureDic.update(self.extract_POS(doc,self.targetPOSs))
        return featureDic

    def run(self,n_threads=16):
        """Process every text in ``self.texts`` and return one feature dict per text.

        Args:
            n_threads: forwarded unchanged to ``nlp.pipe``.

        Returns:
            list of feature dictionaries, in the same order as ``self.texts``.
        """
        nlp = self._get_nlp()
        # NOTE(review): newer spaCy releases deprecate/remove `n_threads` in favour
        # of `n_process` -- confirm against the installed spaCy version.
        return [
            self._build_features(doc)
            for doc in nlp.pipe(self.texts, n_threads=n_threads, batch_size=10000)
        ]

    def process(self,text):
        """Process a single string and return its feature dictionary."""
        doc = self._get_nlp()(text)
        return self._build_features(doc)

    def extract_entity(self,doc):
        """Count the named entities of ``doc``.

        Entities whose text is exactly a newline, or whose label is CARDINAL,
        DATE or LAW, are skipped. When the same entity text occurs again only
        its count is increased; the label recorded is that of the first
        occurrence.

        Returns:
            ``{entity_text: {"count": int, "label": str}}``
        """
        entDic = {}
        for ent in doc.ents:
            if ent.text == "\n" or ent.label_ in self._IGNORED_ENT_LABELS:
                continue
            if ent.text in entDic:
                entDic[ent.text]['count'] += 1
            else:
                entDic[ent.text] = {"count":1,"label":ent.label_}
        return entDic

    def extract_POS(self,doc,targetPOSs=None):
        """Count lemmas of non-stop-word tokens whose coarse POS is in ``targetPOSs``.

        Args:
            doc: iterable of tokens exposing ``is_stop``, ``pos_`` and ``lemma_``.
            targetPOSs: coarse POS tags to keep; defaults to ``["VERB"]``.

        Returns:
            ``{lemma: {"count": int, "POS": str}}`` where POS is the tag of the
            first token seen with that lemma.
        """
        if targetPOSs is None:
            targetPOSs = ["VERB"]
        POSDic = {}
        for token in doc:
            if token.is_stop or token.pos_ not in targetPOSs:
                continue
            if token.lemma_ in POSDic:
                POSDic[token.lemma_]['count'] += 1
            else:
                POSDic[token.lemma_] = {"count":1,"POS":token.pos_}
        return POSDic