# Extract Subject-Verb-Object (S, V, O) Triplets : VP Chunker version

In [1]:
import os
path_to_gs = "/usr/bin/gs"
os.environ['PATH'] += os.pathsep + path_to_gs
from nltk.chunk import conlltags2tree, tree2conlltags
# Classifier based tagger
from collections import Iterable
from nltk import ChunkParserI, ClassifierBasedTagger
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import conll2000  # training data for chunker
from nltk import pos_tag, word_tokenize
import random
import time
import re
import pandas as pd
import numpy as np

# =============================================================================
""" trained chunker """ 
# =============================================================================
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')
 
    # Pad the sequence with placeholders
    tokens = [('__START2__', '__START2__'), ('__START1__', '__START1__')] + list(tokens) + [('__END1__', '__END1__'), ('__END2__', '__END2__')]
    history = ['__START2__', '__START1__'] + list(history)
 
    # shift the index with 2, to accommodate the padding
    index += 2
 
    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
 
    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
 
        'next-word': nextword,
        'next-pos': nextpos,
 
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
 
        'prev-word': prevword,
        'prev-pos': prevpos,
 
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
    }
 
class ClassifierChunkParser(ChunkParserI):
    def __init__(self, chunked_sents, **kwargs):
        assert isinstance(chunked_sents, Iterable)
 
        # Transform the trees in IOB annotated sentences [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in chunked_sents]
 
        # Transform the triplets in pairs, make it compatible with the tagger interface [((word, pos), chunk), ...]
        def triplets2tagged_pairs(iob_sent):
            return [((word, pos), chunk) for word, pos, chunk in iob_sent]
        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]
 
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents,
            feature_detector=features,
            **kwargs)
 
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
 
        # Transform the result from [((w1, t1), iob1), ...] 
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
 
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
    

## Train Chunker on 'conll2000' Datasets

CoNLL2000 Datasets: `NP`: 55,081 (51%),  `VP`: 21467 (20%) ...

###  only use 'VP' chunk_type

In [2]:
shuffled_conll_sents = list(conll2000.chunked_sents(chunk_types=['VP']))
random.shuffle(shuffled_conll_sents)
# train, test
train_sents = shuffled_conll_sents[:int(len(shuffled_conll_sents) * 0.9)]
test_sents = shuffled_conll_sents[int(len(shuffled_conll_sents) * 0.9 + 1):]  
# train chunker
VPchunker = ClassifierChunkParser(train_sents)

### use 'NP' and 'VP' chunk_type

In [3]:
shuffled_conll_sents = list(conll2000.chunked_sents(chunk_types=['VP', 'NP']))
random.shuffle(shuffled_conll_sents)
train_sents = shuffled_conll_sents[:int(len(shuffled_conll_sents) * 0.9)]
test_sents = shuffled_conll_sents[int(len(shuffled_conll_sents) * 0.9 + 1):]  
print(len(train_sents), len(test_sents))
NPVPchunker = ClassifierChunkParser(train_sents)

9853 1094


In [4]:
import pandas as pd
dat_path = '/home/junhyuki/DLproject/DAT'
usecols=['connected_url', 'keywords', 'timestamp', 'clean_title', 'location']
df = pd.read_csv(dat_path + '/1-DailyNews/cleaned_news_2012.csv', usecols=usecols, nrows=100)
df

Unnamed: 0,connected_url,keywords,timestamp,clean_title,location
0,http://www.reuters.com/article/2012/01/02/us-f...,Juan Salceto;Stephen Glatstein;US;FOOTBALL;ACC...,20120101,Oakland Raiders fan falls from bleachers after...,
1,http://www.reuters.com/article/2012/01/02/us-n...,Ian Cole;Justin Abdelkader;US;NHL;BLUES;COLE;N...,20120101,Blues' Cole suspended for three games,
2,http://www.reuters.com/article/2012/01/02/us-n...,Barry Trotz;Brendan Morrison;Jarome Iginla;Kev...,20120101,Kostitsyn's hat-trick helps Predators douse Fl...,
3,http://www.reuters.com/article/2012/01/02/us-n...,Mark Leffingwell;Tim Tebow;US;NFL;BRONCOS;Spor...,20120101,"Broncos clinch AFC West, snap playoff drought",
4,http://www.reuters.com/article/2012/01/02/us-c...,Cuba;Mexico;Benedict XVI;Fidel Castro;Raul Cas...,20120101,Pope Benedict XVI to visit Cuba March 26-28,
5,http://www.reuters.com/article/2012/01/02/us-c...,US;CHINA;LAWYER;Crime / Law / Justice;Fundamen...,20120101,China dissident-lawyer Gao jailed in far west,
6,http://www.reuters.com/article/2012/01/02/idUS...,India;N.K. Nanda;Rana Som;Commodities Markets;...,20120101,Govt appoints 3-month interim head for India's...,
7,http://www.reuters.com/article/2012/01/02/us-n...,John Sommers II;Ray Rice;US;NFL;RAVENS;Sport;N...,20120101,Ravens earn AFC North title and first-round bye,
8,http://www.reuters.com/article/2012/01/02/us-c...,Colombia;Panama;United States;Venezuela;Juan d...,20120101,Colombian police kill leader of powerful drug ...,
9,http://www.reuters.com/article/2012/01/02/us-k...,China;Korea;South Korea;Taiwan;United States;C...,20120101,S. Korea manufacturing shrinks most in 3 years,


## Test1: VP chunker -> Ignore NNS, NN in 'VP' case

This test doesn't produce satisfactory results, since TRAINED POS-tagging model cannot caputre real VERB (They reconize them as NNS(pluran nouns) or NN(singular noun))

In [5]:
import numpy as np

def ExtractSVO_usingVPchunker(text):
    """ For simplicity, we assume that if there are more than 2 'VP':
        -> We select First 'VP' 
    
    Error Type:
        text='Jinke Property wins land auctions in Chongqing for 117.9 mln yuan'
        > nltk.pos_tag recognize 'wins' as plural noun (NNS)
    """
    vp = []
    tree = VPchunker.parse(pos_tag(word_tokenize(text)))
    print(text)
    for child in tree:
        if str(type(child)) == "<class 'nltk.tree.Tree'>":
            if re.findall('/N', str(child)) != []:
                print('This is Not the case of Verb Phrase : ' + str(child))
            else:
                vp.append(child)
    if vp == []:
        s, v, o = np.nan, np.nan, np.nan
    else:
        if len(vp) > 1:
            vp = vp[0]  # use first 'VP' if all doesn't have prefix 'N'
            # Tree('VP', [...]) shape
            if len(vp) != 1:
                vp_ = ' '.join([x[0] for x in vp])
            else:
                vp_ = vp[0][0]
        else:
            # [Tree('VP', [...])] shape
            if len(vp) != 1:
                vp_ = ' '.join([x[0] for x in vp[0]])
            else:
                vp_ = vp[0][0][0]
        print(' ------> VP: ' + vp_)
        
        # e.g. +76 can be captured as 'verb' -> then because of special characters in `regex`, we need to use r'
        # But we can think that this type of verb is somewhat error, so pass!
        try:
            m = re.search(vp_, text)
            start = m.start()
            end = m.end()
            s = text[:start].strip()
            v = vp_.strip()
            o = text[end:].strip()            
        except:
            s, v, o = np.nan, np.nan, np.nan
        
    return s, v, o

In [6]:
df['clean_title'].apply(ExtractSVO_usingVPchunker)

Oakland Raiders fan falls from bleachers after game
 ------> VP: falls
Blues' Cole suspended for three games
 ------> VP: suspended
Kostitsyn's hat-trick helps Predators douse Flames
This is Not the case of Verb Phrase : (VP helps/NN)
This is Not the case of Verb Phrase : (VP Predators/NNS)
 ------> VP: douse
Broncos clinch AFC West, snap playoff drought
This is Not the case of Verb Phrase : (VP clinch/NN)
 ------> VP: snap
Pope Benedict XVI to visit Cuba March 26-28
This is Not the case of Verb Phrase : (VP Benedict/NNP)
This is Not the case of Verb Phrase : (VP XVI/NNP)
 ------> VP: to
China dissident-lawyer Gao jailed in far west
This is Not the case of Verb Phrase : (VP dissident-lawyer/NN)
 ------> VP: jailed
Govt appoints 3-month interim head for India's NMDC
This is Not the case of Verb Phrase : (VP appoints/NNS)
 ------> VP: 3-month
Ravens earn AFC North title and first-round bye
 ------> VP: earn
Colombian police kill leader of powerful drug gang
 ------> VP: kill
S. Korea man

0     (Oakland Raiders fan, falls, from bleachers af...
1             (Blues' Cole, suspended, for three games)
2     (Kostitsyn's hat-trick helps Predators, douse,...
3     (Broncos clinch AFC West,, snap, playoff drought)
4       (Pope Benedict XVI, to, visit Cuba March 26-28)
5     (China dissident-lawyer Gao, jailed, in far west)
6     (Govt appoints, 3-month, interim head for Indi...
7     (Ravens, earn, AFC North title and first-round...
8     (Colombian police, kill, leader of powerful dr...
9                                       (nan, nan, nan)
10    (India, lets, foreign individuals invest in st...
11    (Freeport Indonesia union, says, delays return...
12    (Freeport Indonesia union, says, delays return...
13    (Philippine stock exchange, extends, trading h...
14           (Seoul shares, inch, up, tech shares lead)
15    (Hyundai Heavy targets 20 percent order growth...
16                                      (nan, nan, nan)
17    (Hyundai Heavy targets 20 pct order growth

## Test2: VP chunker -> Accept NNS, NN in 'VP' case

In [7]:
def ExtractSVO_usingVPchunker(text):
    """ For simplicity, we assume that if there are more than 2 'VP':
        -> We select First 'VP' 
    
    Error Type:
        text='Jinke Property wins land auctions in Chongqing for 117.9 mln yuan'
        > nltk.pos_tag recognize 'wins' as plural noun (NNS)
    """
    vp = []
    tree = VPchunker.parse(pos_tag(word_tokenize(text)))
    for child in tree:
        if str(type(child)) == "<class 'nltk.tree.Tree'>":
            if re.findall(r'\bVP\b', str(child)) != []:
                vp.append(child)
    if vp == []:
        s, v, o = np.nan, np.nan, np.nan
    else:
        if len(vp) > 1:
            vp = vp[0]  # use first 'VP' if all doesn't have prefix 'N'
            # Tree('VP', [...]) shape
            if len(vp) != 1:
                vp_ = ' '.join([x[0] for x in vp])
            else:
                vp_ = vp[0][0]
        else:
            # [Tree('VP', [...])] shape
            if len(vp[0]) != 1:
                vp_ = ' '.join([x[0] for x in vp[0]])
            else:
                vp_ = vp[0][0][0]
        # e.g. +76 can be captured as 'verb' -> then because of special characters in `regex`, we need to use r'
        # But we can think that this type of verb is somewhat error, so pass!
        try:
            m = re.search(vp_, text)
            start = m.start()
            end = m.end()
            s = text[:start].strip()
            v = vp_.strip()
            o = text[end:].strip()
        except:
            s, v, o = np.nan, np.nan, np.nan
                
    return s, v, o

In [8]:
df['clean_title'].apply(ExtractSVO_usingVPchunker)

0     (Oakland Raiders fan, falls, from bleachers af...
1             (Blues' Cole, suspended, for three games)
2     (Kostitsyn's hat-trick, helps, Predators douse...
3     (Broncos, clinch, AFC West, snap playoff drought)
4       (Pope, Benedict, XVI to visit Cuba March 26-28)
5     (China, dissident-lawyer, Gao jailed in far west)
6     (Govt, appoints, 3-month interim head for Indi...
7     (Ravens, earn, AFC North title and first-round...
8     (Colombian police, kill, leader of powerful dr...
9                                       (nan, nan, nan)
10    (India, lets, foreign individuals invest in st...
11    (Freeport Indonesia union, says, delays return...
12    (Freeport Indonesia union, says, delays return...
13    (Philippine stock exchange, extends, trading h...
14           (Seoul shares, inch, up, tech shares lead)
15    (Hyundai Heavy, targets, 20 percent order grow...
16        (Proud 49ers secure bye with, win, over Rams)
17    (Hyundai Heavy, targets, 20 pct order grow

## Test3: NPVP chunker -> Accept NNS, NN in 'VP' case

In [9]:
def ExtractSVO_usingNPVPchunker(text):
    """ For simplicity, we assume that if there are more than 2 'VP':
        -> We select First 'VP' 
    
    Error Type:
        text='Jinke Property wins land auctions in Chongqing for 117.9 mln yuan'
        > nltk.pos_tag recognize 'wins' as plural noun (NNS)
    """
    vp = []
    tree = NPVPchunker.parse(pos_tag(word_tokenize(text)))
    for child in tree:
        if str(type(child)) == "<class 'nltk.tree.Tree'>":
            if re.findall(r'\bVP\b', str(child)) != []:
                vp.append(child)
    if vp == []:
        s, v, o = np.nan, np.nan, np.nan
    else:
        if len(vp) > 1:
            vp = vp[0]  # use first 'VP' if all doesn't have prefix 'N'
            # Tree('VP', [...]) shape
            if len(vp) != 1:
                vp_ = ' '.join([x[0] for x in vp])
            else:
                vp_ = vp[0][0]
        else:
            # [Tree('VP', [...])] shape
            if len(vp[0]) != 1:
                vp_ = ' '.join([x[0] for x in vp[0]])
            else:
                vp_ = vp[0][0][0]
        # e.g. +76 can be captured as 'verb' -> then because of special characters in `regex`, we need to use r'
        # But we can think that this type of verb is somewhat error, so pass!
        try:
            m = re.search(vp_, text)
            start = m.start()
            end = m.end()
            s = text[:start].strip()
            v = vp_.strip()
            o = text[end:].strip()
        except:
            s, v, o = np.nan, np.nan, np.nan
                
    return s, v, o

In [10]:
df['clean_title'].apply(ExtractSVO_usingVPchunker)

0     (Oakland Raiders fan, falls, from bleachers af...
1             (Blues' Cole, suspended, for three games)
2     (Kostitsyn's hat-trick, helps, Predators douse...
3     (Broncos, clinch, AFC West, snap playoff drought)
4       (Pope, Benedict, XVI to visit Cuba March 26-28)
5     (China, dissident-lawyer, Gao jailed in far west)
6     (Govt, appoints, 3-month interim head for Indi...
7     (Ravens, earn, AFC North title and first-round...
8     (Colombian police, kill, leader of powerful dr...
9                                       (nan, nan, nan)
10    (India, lets, foreign individuals invest in st...
11    (Freeport Indonesia union, says, delays return...
12    (Freeport Indonesia union, says, delays return...
13    (Philippine stock exchange, extends, trading h...
14           (Seoul shares, inch, up, tech shares lead)
15    (Hyundai Heavy, targets, 20 percent order grow...
16        (Proud 49ers secure bye with, win, over Rams)
17    (Hyundai Heavy, targets, 20 pct order grow

## All data: Extraction based on 'NP' & 'VP' Chunker

In [11]:
def ExtractSVO_usingNPVPchunker(text):
    """ For simplicity, we assume that if there are more than 2 'VP':
        -> We select First 'VP' 
    
    Error Type:
        text='Jinke Property wins land auctions in Chongqing for 117.9 mln yuan'
        > nltk.pos_tag recognize 'wins' as plural noun (NNS)
    """
    vp = []
    tree = NPVPchunker.parse(pos_tag(word_tokenize(text)))
    for child in tree:
        if str(type(child)) == "<class 'nltk.tree.Tree'>":
            if re.findall(r'\bVP\b', str(child)) != []:
                vp.append(child)
    if vp == []:
        s, v, o = np.nan, np.nan, np.nan
    else:
        if len(vp) > 1:
            vp = vp[0]  # use first 'VP' if all doesn't have prefix 'N'
            # Tree('VP', [...]) shape
            if len(vp) != 1:
                vp_ = ' '.join([x[0] for x in vp])
            else:
                vp_ = vp[0][0]
        else:
            # [Tree('VP', [...])] shape
            if len(vp[0]) != 1:
                vp_ = ' '.join([x[0] for x in vp[0]])
            else:
                vp_ = vp[0][0][0]
        # e.g. +76 can be captured as 'verb' -> then because of special characters in `regex`, we need to use r'
        # But we can think that this type of verb is somewhat error, so pass!
        try:
            m = re.search(vp_, text)
            start = m.start()
            end = m.end()
            s = text[:start].strip()
            v = vp_.strip()
            o = text[end:].strip()
        except:
            s, v, o = np.nan, np.nan, np.nan
                
    return s, v, o

In [12]:
dat_path = '/home/junhyuki/DLproject/DAT'
usecols=['connected_url', 'keywords', 'timestamp', 'clean_title', 'location']

def deleteNaN(df):
    """eliminate 'nan'"""
    idx_s = df[df['subject'].isna() == True].index
    idx_v = df[df['verb'].isna() == True].index
    idx_o = df[df['object'].isna() == True].index
    idx = list(set(list(set(idx_s)) + list(set(idx_v)) + list(set(idx_o))))
    df_new = df.drop(idx)
    print('original dataframe rows: ' + str(df.shape[0]) +
          ' -> after delete: ' + str(df_new.shape[0]))
    return df_new.reset_index(drop=True)

for year in np.arange(2012, 2019):
    print("processing {} news data...".format(year))
    df = pd.read_csv(dat_path + '/1-DailyNews/cleaned_news_{}.csv'.format(year), usecols=usecols)
    svo = df['clean_title'].apply(ExtractSVO_usingVPchunker)
    append = pd.DataFrame(list(svo), columns=['subject', 'verb', 'object'])
    df = df.join(append)
    df = deleteNaN(df)
    print(df.head())
    print(df.shape)
    df.to_csv(dat_path + '/2-SVO/NPVPchunker/SVO_chunker_{}.csv'.format(year), encoding='utf-8-sig', index=False)

processing 2012 news data...
original dataframe rows: 277843 -> after delete: 234381
                                       connected_url  \
0  http://www.reuters.com/article/2012/01/02/us-f...   
1  http://www.reuters.com/article/2012/01/02/us-n...   
2  http://www.reuters.com/article/2012/01/02/us-n...   
3  http://www.reuters.com/article/2012/01/02/us-n...   
4  http://www.reuters.com/article/2012/01/02/us-c...   

                                            keywords  timestamp  \
0  Juan Salceto;Stephen Glatstein;US;FOOTBALL;ACC...   20120101   
1  Ian Cole;Justin Abdelkader;US;NHL;BLUES;COLE;N...   20120101   
2  Barry Trotz;Brendan Morrison;Jarome Iginla;Kev...   20120101   
3  Mark Leffingwell;Tim Tebow;US;NFL;BRONCOS;Spor...   20120101   
4  Cuba;Mexico;Benedict XVI;Fidel Castro;Raul Cas...   20120101   

                                         clean_title location  \
0  Oakland Raiders fan falls from bleachers after...      NaN   
1              Blues' Cole suspended for thre

processing 2017 news data...
original dataframe rows: 216089 -> after delete: 189041
                                       connected_url  \
0  http://www.reuters.com/article/us-people-bened...   
1  http://www.reuters.com/article/people-benedict...   
2  http://www.reuters.com/article/us-china-econom...   
3  http://www.reuters.com/article/us-health-birdf...   
4  http://www.reuters.com/article/health-birdflu-...   

                                            keywords  timestamp  \
0  United Kingdom;United States;Ancestry;Arthur C...   20170101   
1  United Kingdom;United States;Ancestry;Arthur C...   20170101   
2  China;Zhou Hao;US;CHINA;ECONOMY;PMI;Asia / Pac...   20170101   
3  China;US;HEALTH;BIRDFLU;CHINA;Health / Medicin...   20170101   
4  China;US;HEALTH;BIRDFLU;CHINA;Health / Medicin...   20170101   

                                         clean_title location  \
0  Benedict Cumberbatch and Sherlock Holmes is ma...      NaN   
1  Benedict Cumberbatch and Sherlock Holmes i

## Evaluation

In [13]:
df = pd.read_csv(dat_path + '/2-SVO/NPVPchunker/SVO_chunker_{}.csv'.format(2012))

In [17]:
import numpy as np
idx = np.random.choice(len(df), 20)
df[['clean_title', 'subject', 'verb', 'object']].iloc[idx]

Unnamed: 0,clean_title,subject,verb,object
143821,"Euro extends losses vs dollar, hits session low",Euro,extends,"losses vs dollar, hits session low"
205267,IIF warns of risk from delaying Greek aid paym...,IIF,warns,of risk from delaying Greek aid payments
161964,U.S. swaps regulator says should rely on some ...,U.S. swaps regulator,says,should rely on some foreign rules
153712,Brent crude futures extend gains to $1,Brent crude futures,extend,gains to $1
105252,KKR raises $3 billion for second Asia fund,KKR,raises,$3 billion for second Asia fund
8112,Putin aide says Kremlin must face up to protests,Putin aide,says,Kremlin must face up to protests
110887,Kidnapped Saudi envoy urges king not to forget...,Kidnapped Saudi,envoy,urges king not to forget him
129715,Israel denies report Obama aide shared Iran wa...,Israel,denies report,Obama aide shared Iran war plan
129516,CBS says no fear of TV shows getting drowned o...,CBS,says,no fear of TV shows getting drowned out by mon...
35652,"U.S. wants ""early"" return of U.N. inspectors t...",U.S.,wants,"""early"" return of U.N. inspectors to North Korea"


## Appendix:

Souce: **Tree Syntax of Natural Language (Lecture Note)**

```
"""


TABLE 1.______________________________________
<label> <longname>              <example>
NN      singular                noun pyramid
NNS     plural noun             lectures
NNP     proper noun             Khufu
VBD     past tense verb         claimed
VBZ     3rd person singular     is
        present tense verb
VBP     non-3rd person singular have
        present tense verb
VBN     past participle         found
PRP     pronoun                 they
PRP$    possessive pronoun      their
JJ      adjective               public
IN      preposition             in
        complementizer          that
DT      determiner              the

TABLE 2._________________________________________________
<label> <longname>              <example>(represented by terminal string)
NP      noun phrase             their public lectures
VP      verb phrase             built the pyramid
PP      prepositional           in the five chambers
        phrase
S       sentence                Khufu built the pyramid
SBAR    sbar                    that Khufu built the pyramid


Part of speech tags for verbs____________________________________
TABLE 3._________________________________________________________
<Tag>   <Long name>         <Example>
VBD     past tense          He ate/VBD the cookies.
                            She answered/VBD the question.
                            
VBZ     present tense       He likes/VBZ cookies.

VBP     present tense       They like/VBP cookies.
        3rd person          They answer/VBP such questions.
        plural              They are/VBP tired.
        
VB      base                He may like/VB cookies.
                            I heard her answer/VB the question.
                            They may be/VB tired.
                            
VBG     present participle, Eating/VG cookies is unhealthy.
        Gform               He likes eating/VG cookies.

VBN     past participle,    He has eaten/VBN the cookies.
        N-form              She has ansered/VBN the questions.
                            My question was not answered/VBN.

MD      modal               She will/MD prevail.
TO      auxiliary to        She expects to/TO prevail.

"""
```

Some other References:
    **Introduction to the CoNLL-2000 Shared Task: Chunking (2000, Tjong et al.)**