In [99]:
import sys
sys.path.append("../utils")
import pdtb_utils
import pandas as pd
import json
from os.path import join

base_dir = '../resources/conll16st-en-zh-dev-train-test_LDC2016E50/'

test_types = {'test': 'conll16st-en-03-29-16-test',
              'train': 'conll16st-en-03-29-16-train',
              'blind_test': 'conll15st-en-03-29-16-blind-test',
              'dev': 'conll16st-en-03-29-16-dev'}

def get_relations(test_type, separate_dual_classes=False):
    with open(join(base_dir, test_types[test_type], 'relations.json')) as f:
        for line in f:
            rel = pdtb_utils.DiscourseRelation(json.loads(line.strip()))
            if separate_dual_classes:
                senses = rel.senses().copy()
                for sense in senses:
                    rel.set_senses([sense])
                    if len(rel.senses()) > 1:
                        yield rel
            else:
                yield rel

relations = get_relations('train', separate_dual_classes=False)

df = pd.DataFrame([{'id': rel.relation_id(), 
                    'senses': tuple(rel.senses()), 
                    'relation_type': rel.relation_type(), 
                    'connective_token': rel.connective_token()} 
                   for rel in relations]).set_index('id')

In [93]:
type_distribution = pd.concat([df.groupby('relation_type').size(), pd.Series(df.count()['relation_type'], index=['Total'])])
type_distribution

AltLex        524
EntRel       4133
Explicit    14722
Implicit    13156
Total       32535
dtype: int64

In [100]:
rels_separate_sense = get_relations('train', separate_dual_classes=True)
df = pd.DataFrame([{'id': rel.relation_id(), 
                    'senses': tuple(rel.senses()), 
                    'relation_type': rel.relation_type(), 
                    'connective_token': rel.connective_token()} 
                   for rel in rels_separate_sense]).set_index('id')
df

TypeError: unorderable types: list() > int()

In [70]:
freq = pd.DataFrame(df.groupby('senses').count().sort_values('connective_token', ascending=False)['connective_token'])
freq['ratio'] = freq['connective_token'] / freq['connective_token'].sum()
with open('../paper/tables/sense_frequency.tex', 'w') as w:
    freq.head(20).to_latex(w)

## Initial thoughts

- Temporal contains typical temporal words, "years", "25 days later", etc. This should be fairly easy to solve by asking if it contains a certain set of temporal words.

## Most common discourse connective token

In [71]:
df.connective_token = df.connective_token.apply(str.lower)
df.senses = df.senses.apply(lambda x: "+".join(x))

In [72]:
token_rel_freq = df.groupby(['connective_token', 'senses']).count().sort_values('relation_type', ascending=False).rename(columns={'relation_type': 'freq'})
token_tot = df.groupby('connective_token').count()['senses']
token_ratio = token_rel_freq.divide(token_tot, axis=0, level=0).rename(columns={'freq': 'ratio'})
pd.concat([token_rel_freq, token_ratio], axis=1).sort_values('freq', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,freq,ratio
connective_token,senses,Unnamed: 2_level_1,Unnamed: 3_level_1
,EntRel,4133,1.000000
and,Expansion.Conjunction,3192,0.931971
but,Comparison.Contrast,2290,0.702885
because,Contingency.Cause,2141,0.990745
also,Expansion.Conjunction,1793,0.997219
specifically,Expansion.Restatement,911,0.968119
so,Contingency.Cause,879,0.985426
if,Contingency.Condition,874,0.988688
for example,Expansion.Instantiation,744,0.917386
however,Comparison.Contrast,690,0.727081
