In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
# Text preprocessing/analysis
import re
import nltk
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", context='talk', 
        palette=['#D44D5C', '#43AA8B'])

In [2]:
while not os.getcwd().endswith("CauseEffect_SharedTask"):
    os.chdir("../")
os.getcwd()

'/home/ifajcik/research/CauseEffect_SharedTask'

In [3]:
subtask_data = "data/train_subtask2.csv"

In [4]:
subtask_df = pd.read_csv(subtask_data)
subtask_df.head()

Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents
0,cnc,train_01_10,0,0,cnc_train_01_10_0_0,The farmworkers ' strike resumed on Tuesday wh...,<ARG1>The farmworkers ' strike resumed on Tues...,1,1,,1
1,cnc,train_01_17,0,0,cnc_train_01_17_0_0,The attack on Karayogams had sparked sharp rea...,<ARG0>The attack on Karayogams</ARG0> <ARG1>ha...,1,1,,1
2,cnc,train_01_47,0,0,cnc_train_01_47_0_0,"The strike began on September 9 , with Numsa d...","<ARG1>The strike began on September 9</ARG1> ,...",1,1,,1
3,cnc,train_01_70,0,0,cnc_train_01_70_0_0,Some police personnel intervened and urged the...,<ARG0>Some police personnel intervened</ARG0> ...,1,1,,1
4,cnc,train_01_83,0,0,cnc_train_01_83_0_0,Two bus drivers were hurt in the attacks .,<ARG1>Two bus drivers were hurt</ARG1> in <ARG...,1,1,,1


In [5]:
#did this just to check if there are more than 1 sentence in each sample
subtask_df.describe()

Unnamed: 0,sent_id,eg_id,seq_label,pair_label,context,num_sents
count,183.0,183.0,183.0,183.0,0.0,183.0
mean,0.0,0.147541,1.0,1.0,,1.0
std,0.0,0.425921,0.0,0.0,,0.0
min,0.0,0.0,1.0,1.0,,1.0
25%,0.0,0.0,1.0,1.0,,1.0
50%,0.0,0.0,1.0,1.0,,1.0
75%,0.0,0.0,1.0,1.0,,1.0
max,0.0,3.0,1.0,1.0,,1.0


In [6]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [7]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w.lower()) for w in w_tokenizer.tokenize(text)]

In [8]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/ifajcik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ifajcik/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
subtask_df['lemmas'] = subtask_df.text.apply(lemmatize_text)
subtask_df["tokens"] = subtask_df.text.apply(nltk.word_tokenize)
subtask_df["posR"] =  subtask_df.tokens.apply(lambda x: nltk.pos_tag(x))
tags = [[tag for word, tag in _] for _ in subtask_df["posR"].to_list()]
subtask_df["pos"] =  tags

In [10]:
subtask_df.head()

Unnamed: 0,corpus,doc_id,sent_id,eg_id,index,text,text_w_pairs,seq_label,pair_label,context,num_sents,lemmas,tokens,posR,pos
0,cnc,train_01_10,0,0,cnc_train_01_10_0_0,The farmworkers ' strike resumed on Tuesday wh...,<ARG1>The farmworkers ' strike resumed on Tues...,1,1,,1,"[the, farmworkers, ', strike, resumed, on, tue...","[The, farmworkers, ', strike, resumed, on, Tue...","[(The, DT), (farmworkers, NNS), (', POS), (str...","[DT, NNS, POS, NN, VBD, IN, NNP, WRB, PRP$, NN..."
1,cnc,train_01_17,0,0,cnc_train_01_17_0_0,The attack on Karayogams had sparked sharp rea...,<ARG0>The attack on Karayogams</ARG0> <ARG1>ha...,1,1,,1,"[the, attack, on, karayogams, had, sparked, sh...","[The, attack, on, Karayogams, had, sparked, sh...","[(The, DT), (attack, NN), (on, IN), (Karayogam...","[DT, NN, IN, NNP, VBD, VBN, JJ, NNS, IN, DT, N..."
2,cnc,train_01_47,0,0,cnc_train_01_47_0_0,"The strike began on September 9 , with Numsa d...","<ARG1>The strike began on September 9</ARG1> ,...",1,1,,1,"[the, strike, began, on, september, 9, ,, with...","[The, strike, began, on, September, 9, ,, with...","[(The, DT), (strike, NN), (began, VBD), (on, I...","[DT, NN, VBD, IN, NNP, CD, ,, IN, NNP, VBG, DT..."
3,cnc,train_01_70,0,0,cnc_train_01_70_0_0,Some police personnel intervened and urged the...,<ARG0>Some police personnel intervened</ARG0> ...,1,1,,1,"[some, police, personnel, intervened, and, urg...","[Some, police, personnel, intervened, and, urg...","[(Some, DT), (police, NN), (personnel, NNS), (...","[DT, NN, NNS, VBD, CC, VBD, PRP, TO, VB, RP, P..."
4,cnc,train_01_83,0,0,cnc_train_01_83_0_0,Two bus drivers were hurt in the attacks .,<ARG1>Two bus drivers were hurt</ARG1> in <ARG...,1,1,,1,"[two, bus, driver, were, hurt, in, the, attack...","[Two, bus, drivers, were, hurt, in, the, attac...","[(Two, CD), (bus, NN), (drivers, NNS), (were, ...","[CD, NN, NNS, VBD, VBN, IN, DT, NNS, .]"


In [11]:
def replace_labels(s):
    s = s.replace("<ARG0>","")
    s = s.replace("</ARG0>","")
    s = s.replace("<ARG1>","")
    s = s.replace("</ARG1>","")
    s = s.replace("<SIG0>","")
    s = s.replace("</SIG0>","")
    return s

In [12]:
args0=[]
args1=[]
sigs0=[]
arg0_regex = re.compile(r"(<ARG0>)(.*)(<\/ARG0>)")
arg1_regex = re.compile(r"(<ARG1>)(.*)(<\/ARG1>)")
sig0_regex = re.compile(r"(<SIG0>)(.*)(<\/SIG0>)")

for s in subtask_df.text_w_pairs.to_list():
    #print (s)
    match_arg0 = arg0_regex.search(s.strip())
    match_arg1 = arg1_regex.search(s.strip())
    match_sig0 = sig0_regex.search(s.strip())
    if match_arg0:
        arg0 = match_arg0.group()
        arg0 = replace_labels(arg0)
        args0.append(arg0)
        #print('Match found: {}\n: '.format(arg0))
    if match_arg1:
        arg1 = match_arg1.group()
        arg1 = replace_labels(arg1)
        args1.append(arg1)
    if match_sig0:
        sig0 = match_sig0.group()
        sig0 = replace_labels(sig0)
        sigs0.append(sig0)
    else:
        sigs0.append('')
        
#print(args0,'\n')
#print(args1,'\n')
#print(sigs0,'\n')
subtask_df['args0'] = args0
subtask_df['args1'] = args1
subtask_df['sigs0'] = sigs0
columns = ['args0','args1','sigs0']
subtask_df[columns].head(10) 

Unnamed: 0,args0,args1,sigs0
0,their demands were not met,The farmworkers ' strike resumed on Tuesday,
1,The attack on Karayogams,had sparked sharp reactions from the CPM leaders,sparked
2,with Numsa demanding a double-digit percentage...,The strike began on September 9,demanding
3,Some police personnel intervened,urged them to give up their protest,
4,the attacks,Two bus drivers were hurt,
5,Hartal supporters pelted stones at a KSRTC bus...,injuring driver Babu,injuring
6,The violence in Yuen Long,heightened tensions and fears of further attacks,has heightened
7,he could provide clues in resolving the blast ...,The NIA wanted to catch Akram alive,as
8,allegedly being involved in the blast,Two more youths were arrested later,for
9,use of the field,The clash took place,over


In [13]:
args0_string = " ".join(subtask_df.args0)
args1_string = " ".join(subtask_df.args1)
sigs0_string = " ".join(subtask_df.sigs0)
args0_splits = args0_string.split()
args1_splits = args1_string.split()
sigs0_splits = sigs0_string.split()
print("Number of strings (tokens) in ARGS0:{}, ARGS1:{}, SIGS0:{}".format(len(args0_splits),len(args1_splits),len(sigs0_splits)))
print("Number of unique strings (vocabulary) in ARGS0:{}, ARGS1:{}, SIGS0:{}".format(len(set(args0_splits)), len(set(args1_splits)),len(set(sigs0_splits))))

Number of strings (tokens) in ARGS0:977, ARGS1:1264, SIGS0:176
Number of unique strings (vocabulary) in ARGS0:508, ARGS1:632, SIGS0:78


In [14]:
args0_freq_splits = FreqDist(args0_splits)
args1_freq_splits = FreqDist(args1_splits)
sigs0_freq_splits = FreqDist(sigs0_splits)
print(F"***** 10 most common strings ARGS0 ***** \n{args0_freq_splits.most_common(10)}", "\n")
print(F"***** 10 most common strings ARGS1 ***** \n{args1_freq_splits.most_common(10)}", "\n")
print(F"***** 10 most common strings SIGS0 ***** \n{sigs0_freq_splits.most_common(10)}", "\n")

***** 10 most common strings ARGS0 ***** 
[('the', 90), ('to', 35), ('a', 28), ('of', 28), ('in', 23), ('The', 15), ('and', 14), ('their', 13), ('on', 12), ('protest', 12)] 

***** 10 most common strings ARGS1 ***** 
[('the', 54), ('to', 30), ('were', 25), ('a', 25), ('The', 23), ('on', 22), ('was', 22), ('in', 21), ('and', 20), ('of', 20)] 

***** 10 most common strings SIGS0 ***** 
[('to', 21), ('for', 12), ('of', 10), ('demanding', 8), ('by', 7), ('as', 6), ('in', 5), ('following', 4), ('left', 4), ('protesting', 3)] 



In [15]:
def summarise(pattern, strings, freq):
    """Summarise strings matching a pattern."""
    # Find matches
    compiled_pattern = re.compile(pattern)
    matches = [s for s in strings if compiled_pattern.search(s)]
    
    # Print volume and proportion of matches
    print("{} strings, that is {:.2%} of total".format(len(matches), len(matches)/ len(strings)))
    
    # Create list of tuples containing matches and their frequency
    output = [(s, freq[s]) for s in set(matches)]
    output.sort(key=lambda x:x[1], reverse=True)
    
    return output

In [16]:
#how frequent are numbers?
print("ARGS0")
print(summarise(r"\d", args0_splits, args0_freq_splits),'\n')
print("ARGS1")
print(summarise(r"\d", args1_splits, args1_freq_splits),'\n')
print("SIGS0")
print(summarise(r"\d", sigs0_splits, sigs0_freq_splits),'\n')

ARGS0
9 strings, that is 0.92% of total
[('2008', 2), ('26', 1), ('R9000', 1), ('<SIG1>to</SIG1>', 1), ('200', 1), ('R16,000', 1), ('50', 1), ('48', 1)] 

ARGS1
26 strings, that is 2.06% of total
[('12', 2), ('<SIG1>condemned</SIG1>', 1), ('250', 1), ('65', 1), ('30,000', 1), ('1956', 1), ('<SIG1>embolden</SIG1>', 1), ('25', 1), ('10,000', 1), ('76', 1), ('69', 1), ('4:45am', 1), ('20,000', 1), ('58', 1), ('9', 1), ('55', 1), ('166', 1), ('<SIG2>stirring</SIG2>', 1), ('120', 1), ('13', 1), ('17,200', 1), ('700', 1), ('60,000', 1), ('417', 1), ('38', 1)] 

SIGS0
0 strings, that is 0.00% of total
[] 



In [17]:
from collections import defaultdict

texts = list(subtask_df.text)
annotations = subtask_df.text_w_pairs.to_list()

multiann_dict = defaultdict(lambda:[])

sent_counts=[]
for i,s in enumerate(texts):
    sent_counts.append(texts.count(s))
    multiann_dict[s].append(annotations[i])
    
assert 0 not in sent_counts
print(f"Average # of annotations per sentence is: {sum(sent_counts)/len(sent_counts):.2f}")

Average # of annotations per sentence is: 1.30


In [18]:
total_multianncases = 0
for k,v in multiann_dict.items():
    if len(v)>1:
        total_multianncases+=1
        print(f"Sentence:\n{k}")
        print("*"*20)
        joined_annotations = '\n----\n'.join(v)
        print(f"Annotations:\n{joined_annotations}")
        print("\n\n")

Sentence:
Shiv Sena workers , including some legislators , today staged demonstrations against Karnataka Chief Minister B S Yeddyurappa here , protesting the ' forcible installation ' of a Kannadiga mayor in Marathi-dominated Belgaum city .
********************
Annotations:
<ARG1>Shiv Sena workers , including some legislators , today staged demonstrations against Karnataka Chief Minister B S Yeddyurappa here</ARG1> , <ARG0><SIG0>protesting</SIG0> the ' forcible installation ' of a Kannadiga mayor in Marathi-dominated Belgaum city</ARG0> .
----
<ARG1>Shiv Sena workers , including some legislators , today staged demonstrations</ARG1> <ARG0>against Karnataka Chief Minister B S Yeddyurappa</ARG0> here , protesting the ' forcible installation ' of a Kannadiga mayor in Marathi-dominated Belgaum city .



Sentence:
Following the incident , the labourers went on a dharna and stopped work .
********************
Annotations:
<SIG0>Following</SIG0> <ARG0>the incident</ARG0> , <ARG1>the labourers 

In [19]:
print(f"Total multiannotation cases: {total_multianncases}/{len(multiann_dict)}")

Total multiannotation cases: 20/160


There is significant amount of multiple annotations per sentence! Every 8th example contains more than 1 triplet!