In [1]:
%load_ext autoreload
%autoreload 2  # Autoreload all modules
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# libs
import glob
import pandas as pd
import random
import pprint
pp = pprint.PrettyPrinter(indent=4)
#import importlib

import itertools
from collections import Counter
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

import networkx as nx

from tqdm import tqdm

# perso
import sys
sys.path.append('utils/')
sys.path.append('argdisc/')

sys.path.insert(1, '/Users/lhuber/Documents/Research/Phd/workdir/SciDTB/code/probing/')

from data_mgmt import *
from graphs_utils import *
from moves_utils import *
from corpus_utils import Corpus
from preprocessing_utils import *
from arg_ana import *
from gspan import *
from arg_disc_ana import *
# Import corpus to dataframe (text and units)

# the corpus is organized following 3 directories
# in each directory, some files are annotated by multiple annotators
# so we take all the uniq documents (just 1 annotator)
pathtodirtrain = "data/train/"
pathtodirtest  = "data/test/gold/"
pathtodirdev   = "data/dev/gold/"
pathtographs   = "data/graphimages/"
path2properties = "data/SciDTB_articles_properties.csv"


# considere only 1 annotator (taken randomly)
# /!\ sera à regarder proprement selon les exéps à venir
allano = "*.dep"
files = glob.glob(pathtodirtrain+allano) # for train keep only one annotator
# remove duplicate documents (other annotators)
files = [x for x in files if "anno2" not in x and "anno3" not in x] 
files += glob.glob(pathtodirtest+allano)
files += glob.glob(pathtodirdev+allano)


c = Corpus(files=files)

# Load global corpus

In [3]:
movesdtf = moves_dtf(c.corpus)

In [4]:
movesdtf["macrotype"] = movesdtf.apply(lambda row: meta_type(row.gbg, row.gmethod, row.geval),  axis=1)

In [5]:
names = list(movesdtf.index)

# Prepare Arg data

In [6]:
arg_annots = "data/scidtb_argmin_annotations/*.conll.good"
basename = "data/scidtb_argmin_annotations/"
endname = "-fexp-corpus.conll.good"
arg_files = [x for x in glob.glob(arg_annots)]

In [7]:
docs = []
for file in arg_files:
    docname = file.replace(basename, "")
    docname = docname.replace(endname, "")
    docs.append(docname)

In [8]:
argdtf = movesdtf.loc[docs]

In [9]:
argdtf["arg_annot"] = argdtf.apply(lambda x: get_arg_annot(basename+x.name+endname), axis=1)
argdtf["arg_graph"] = argdtf.apply(lambda x: get_arg_graph(x.arg_annot), axis=1)
argdtf["arg_types"] = argdtf.apply(lambda x: get_arg_types(get_arg_annot(basename+x.name+endname)), axis=1)

In [10]:
argdiscdtf = argdtf[["segments","edus_sent_ids","graph", "arg_graph", "arg_annot", "arg_types"]]

# Predict Arg segmentation from sentence segmentation

In [11]:
argdiscdtf["arg_sents"] = argdiscdtf.apply(lambda x: get_arg_sents(get_arg_annot(basename+x.name+endname)), axis=1)
argdiscdtf["segtxt"] = argdiscdtf.apply(lambda row: preprocess(row.segments), axis=1)
argdiscdtf["argtxt"] = argdiscdtf.apply(lambda row: preprocess(row.arg_sents), axis=1)
argdiscdtf["edus_adus_mapping"] = argdiscdtf.apply(lambda row: to_edus_adus_mapping(row.argtxt, row.segtxt), axis=1)
argdiscdtf["adus_edus_mapping"] = argdiscdtf.apply(lambda row: remove_nones(to_adus_edus_mapping(row.edus_adus_mapping)), axis=1)
argdiscdtf["sents_edus_mapping"] = argdiscdtf.apply(lambda row: to_sent_edus_mapping(row.edus_sent_ids), axis=1)

In [12]:
argdiscdtf["true"] = argdiscdtf.apply(lambda row: list(row.adus_edus_mapping.values()), axis=1)
argdiscdtf["pred"] = argdiscdtf.apply(lambda row: list(row.sents_edus_mapping.values()), axis=1)
evaluate = argdiscdtf[["true", "pred"]]

In [13]:
def get_tp(true, pred):
    tp = 0
    for p in pred:
        if p in true:
            tp+=1
    return tp

def get_fn(true, pred):
    fn = 0
    for t in true:
        if t not in pred:
            fn+=1
    return fn

def get_fp(true, pred):
    fp = 0
    for p in pred:
        if p not in true:
            fp+=1
    return fp
            
def precision(true, pred):
    tp = get_tp(true, pred)
    fp = get_fp(true, pred)
    return tp/(tp+fp)
    

def rappel(true, pred):
    tp = get_tp(true, pred)
    fn = get_fn(true, pred)
    return tp/(tp+fn)

def jacc(list1, list2):
    list1 = list(map(lambda x: "".join([str(y) for y in x]), list1))
    list2 = list(map(lambda x: "".join([str(y) for y in x]), list2))

    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [14]:
evaluate["prec"] = evaluate.apply(lambda row : precision(row.true, row.pred), axis=1)
evaluate["rappel"] = evaluate.apply(lambda row : rappel(row.true, row.pred), axis=1)
evaluate["jacc"] = evaluate.apply(lambda row : jacc(row.true, row.pred), axis=1)

In [15]:
evaluate["prec"].describe()

count    60.000000
mean      0.875516
std       0.171686
min       0.333333
25%       0.800000
50%       1.000000
75%       1.000000
max       1.000000
Name: prec, dtype: float64

In [16]:
evaluate["rappel"].describe()

count    60.000000
mean      0.838353
std       0.226509
min       0.200000
25%       0.666667
50%       1.000000
75%       1.000000
max       1.000000
Name: rappel, dtype: float64

# Predict Arg structure from discourse structure

In [17]:
def get_sent_edus(edussents):
    out = {}
    for edu, sent in enumerate(edussents):
        if sent not in out.keys():
            out[sent] = [edu]
        else:
            out[sent].append(edu)
    return out

def get_sent_parent_rel(sentsub, graph):
    #draw_graph(sentsub)
    for node in sentsub.nodes():
        succ = list(graph.successors(node))
        # if node is source of arg rel
        if succ != [] and succ[0] not in sentsub.nodes():
            #print("yes, arg from : " + str(node))
            source_edu = node
            trg = list(graph.successors(source_edu))
            parent_edu = trg[0]
            #print("to : " +str(parent_edu))
            return (source_edu, parent_edu)
    return (None, None)

def get_adu(edu, mapping):
    return mapping[edu]

def get_arg_rel(discrelname):
    mapping = {"attribution": "detail",
               "bg-compare": "support", 
               "bg-general": "support", 
               "bg-goal": "support",
               "cause": "detail", 
               "comparison": None,
               "contrast": "additional",
               "elab-addition": "detail", 
               "elab-aspect": "detail",
               "elab-enum_member": "detail", 
               "elab-example": "detail", 
               "elab-process_step": "detail", 
               "enablement": "detail", 
               "evaluation": "support",
               "exp-evidence": "support", 
               "exp-reason": "detail", 
               "joint": "detail", 
               "manner-means": "detail", 
               "progression": "detail", 
               "result": "additionnal", 
               "summary": "detail"}
    return mapping[discrelname]
    
def predict_struct(dg, edussentmapping, labels=False):
    """
     predicts argumentation structure from a graph
     and a edussentmapping
    """
    sent_edus = get_sent_edus(edussentmapping)
    sent_subs = [dg.subgraph(edus) for k,edus in sent_edus.items()]
    
    out_g = nx.DiGraph()
    out_g.add_nodes_from(range(0,len(sent_subs)-1))
    
    for sentid, sentsub in enumerate(sent_subs):
        (edusource, eduparent) = get_sent_parent_rel(sentsub, dg) # first recovers source, and then parent node
        if eduparent is not None and edusource is not None:
            aduparent = get_adu(eduparent, edussentmapping)
            # calculate arg relation if labels==true
            if labels==True:
                label = get_arg_rel(dg[edusource][eduparent]["label"])
            else:
                label = dg[edusource][eduparent]["label"]
            out_g.add_edge(sentid, aduparent, label=label)
    return out_g

In [18]:
argdiscdtf["arg_pred"] = argdiscdtf.apply(lambda row: predict_struct(row.graph, row.edus_sent_ids), axis=1)

In [19]:
# ici le score est très fort
# on pourrait considérer assigner quelques points de précision lorsque l'argument a au moins un 
# ancêtre commun, ce qui veut dire qu'il est rattaché aux mêmes unités mais pas forcément dans
# le même ordre
argdiscdtf["sim_rels"] = argdiscdtf.apply(lambda row: prop_sim_rels(row.arg_graph, row.arg_pred), axis=1)
argdiscdtf["sim_dirrels"] = argdiscdtf.apply(lambda row: prop_sim_rels(row.arg_graph, row.arg_pred, direc=True), axis=1)
argdiscdtf["prec_dir"] = argdiscdtf.apply(lambda row: precision(row.arg_pred.edges(), row.arg_graph.edges()), axis=1)
argdiscdtf["recall_dir"] = argdiscdtf.apply(lambda row: rappel(row.arg_pred.edges(), row.arg_graph.edges()), axis=1)



argdiscdtf["prec_dir"] = argdiscdtf.apply(lambda row: precision(row.arg_graph.edges(), row.arg_pred.edges()), axis=1)
argdiscdtf["recall_dir"] = argdiscdtf.apply(lambda row: rappel(row.arg_graph.edges(), row.arg_pred.edges()), axis=1)


argdiscdtf["prec_dir"] = argdiscdtf.apply(lambda row: precision(row.arg_graph.edges(), row.arg_pred.edges()), axis=1)
argdiscdtf["recall_dir"] = argdiscdtf.apply(lambda row: rappel(row.arg_graph.edges(), row.arg_pred.edges()), axis=1)


argdiscdtf["prec_undir"] = argdiscdtf.apply(lambda row: precision([set(x) for x in row.arg_graph.edges()], [set(x) for x in row.arg_pred.edges()]), axis=1)
argdiscdtf["recall_undir"] = argdiscdtf.apply(lambda row: rappel([set(x) for x in row.arg_graph.edges()], [set(x) for x in row.arg_pred.edges()]), axis=1)

#argdisc["sim_labels"] = argdisc.apply(lambda row: prop_sim_rels(row.arg_graph, row.arg_pred, direc=True), axis=1)

In [20]:
argdiscdtf["prec_dir"].describe()

count    60.000000
mean      0.565714
std       0.319563
min       0.000000
25%       0.296429
50%       0.500000
75%       0.875000
max       1.000000
Name: prec_dir, dtype: float64

In [21]:
argdiscdtf["recall_dir"].describe()

count    60.000000
mean      0.535754
std       0.327063
min       0.000000
25%       0.276786
50%       0.500000
75%       0.781250
max       1.000000
Name: recall_dir, dtype: float64

In [22]:
## look @ rslts
idx = 11
docname = argdiscdtf.index[idx]
pred = argdiscdtf["arg_pred"][idx]
true = argdiscdtf["arg_graph"][idx]
dg = argdiscdtf["graph"][idx]
mapp = argdiscdtf["edus_sent_ids"][idx]
sim = argdiscdtf["sim_rels"][idx]
simdir = argdiscdtf["sim_dirrels"][idx]
simdir = argdiscdtf["sim_dirrels"][idx]

In [23]:
# Predict labels from discourse structure

In [24]:
argdiscdtf.columns

Index(['segments', 'edus_sent_ids', 'graph', 'arg_graph', 'arg_annot',
       'arg_types', 'arg_sents', 'segtxt', 'argtxt', 'edus_adus_mapping',
       'adus_edus_mapping', 'sents_edus_mapping', 'true', 'pred', 'arg_pred',
       'sim_rels', 'sim_dirrels', 'prec_dir', 'recall_dir', 'prec_undir',
       'recall_undir'],
      dtype='object')

In [28]:
argdiscdtf["arg_labels_pred"] = argdiscdtf.apply(lambda row: predict_struct(row.graph, row.edus_sent_ids, labels=True), axis=1)

In [29]:
argdiscdtf["sim_rels"] = argdiscdtf.apply(lambda row: prop_sim_rels(row.arg_graph, row.arg_labels_pred, labels=True), axis=1)
argdiscdtf["prec_dir"] = argdiscdtf.apply(lambda row: precision(row.arg_labels_pred.edges(data=True), row.arg_graph.edges(data=True)), axis=1)
argdiscdtf["recall_dir"] = argdiscdtf.apply(lambda row: rappel(row.arg_labels_pred.edges(data=True), row.arg_graph.edges(data=True)), axis=1)

In [30]:
argdiscdtf["recall_dir"].describe()

count    60.000000
mean      0.470119
std       0.297309
min       0.000000
25%       0.250000
50%       0.500000
75%       0.666667
max       1.000000
Name: recall_dir, dtype: float64

In [31]:
argdiscdtf["prec_dir"].describe()

count    60.000000
mean      0.451587
std       0.304562
min       0.000000
25%       0.200000
50%       0.464286
75%       0.666667
max       1.000000
Name: prec_dir, dtype: float64

## Draft below

In [32]:
## Look at docs that have diff nb args

In [33]:
def has_same_nb_args(g1, g2):
    return len(g1.nodes()) == len(g2.nodes())

In [34]:
argdisc["diff_nb_args"] =  argdisc.apply(lambda row: not(has_same_nb_args(row.arg_pred, row.arg_graph)), axis=1)
argdisc["same_nb_args"] =  argdisc.apply(lambda row: has_same_nb_args(row.arg_pred, row.arg_graph), axis=1)

NameError: name 'argdisc' is not defined

In [35]:
len(argdisc[argdisc["same_nb_args"]]["arg_graph"][0])

NameError: name 'argdisc' is not defined

In [None]:
len(argdisc[argdisc["diff_nb_args"]]["arg_graph"][0])

In [None]:
draw_graph(argdisc[argdisc["same_nb_args"]]["arg_graph"][0])
draw_graph(argdisc[argdisc["same_nb_args"]]["arg_pred"][0])

In [None]:
argdisc[argdisc["same_nb_args"]]["sim_rels"].describe()

In [None]:
argdisc[argdisc["same_nb_args"]]["sim_dirrels"].describe()

In [None]:
argdisc[argdisc["same_nb_args"]]["sim_rels"].describe()

In [None]:
argdisc[argdisc["diff_nb_args"]]["sim_rels"].describe()

In [None]:
argdisc[argdisc["diff_nb_args"]]["sim_dirrels"].describe()