In [1]:
from DataHandler import DataHandler
from CentralityScorer import CentralityScorer
from ArgumentativenessScorer import ArgumentativenessScorer
from ContrastivenessScorer import ContrastivenessScorer
from WordEmbeddingTransformer import WordEmbeddingTransformer

from sklearn.base import BaseEstimator
import numpy as np

# Load data

In [2]:
# Load the argument dataset from the binary dump and report how many arguments it holds.
# NOTE(review): the ".pickle" suffix suggests load_bin unpickles the file; unpickling can
# execute arbitrary code, so only load dumps from a trusted source. TODO confirm against DataHandler.
# Alternative loader kept from the original cell:
#   data.load_json('../../not-gitted/dataset_as_json_file.json')
DATASET_PICKLE_PATH = '../../not-gitted/dataset_as_json_file.pickle'

data = DataHandler()
data.load_bin(DATASET_PICKLE_PATH)
len(data.get_arguments())

100

In [3]:
# Sanity check: print the `sentences` attribute of the first loaded argument.
print(data.get_arguments()[0].sentences)

['In 2011 there were about 730,322 abortions reported to the centers for disease control.', "There are about 1.7% of abortion of women's ages from 15-44 each year.", 'Women who already had abortion earlier in there life time have abortion again.', 'At the age of 45 a women will have at least one abortion.', 'By the 12th week of pregnancies 88.7% of women have abortion.', 'In the U.S. black women are 3.3 times likely to have an abortion than white women.']


# Contra LexRank

LexRank is a graph-based model (based on PageRank) for assessing the centrality of sentences. Each node represents one sentence, and the weighted edge between two nodes gives the similarity of the two sentences. The second term of the formula below accounts for the argumentativeness of the sentences, which is neglected at the moment.
$$
P(s_i) = (1-\alpha) \cdot \sum_{s_j \ne s_i} \frac{sim(s_i, s_j)}{\sum_{s_k \ne s_j} sim(s_j, s_k)} P(s_j) + \alpha \cdot \frac{arg(s_i)}{\sum_{s_k}arg(s_k)}
$$
Now, the idea is to account for contrastiveness between a sentence and its context, so that the summary of an argument is dissimilar to other summaries of the same stance. A first approach is to extend the above formula by a dissimilarity term $\delta_C(s_i)$:
$$P(s_i) = \alpha_0 \cdot \sum_{s_j \ne s_i} \frac{sim(s_i, s_j)}{\sum_{s_k \ne s_j} sim(s_j, s_k)} P(s_j) + \alpha_1 \cdot \frac{arg(s_i)}{\sum_{s_k}arg(s_k)} + \alpha_2\cdot\delta_C(s_i)
$$
where $\delta_C$ denotes the dissimilarity towards some context, i.e., a collection of other arguments, and $\alpha_l\in [0,1]$ with $\sum_{l=0}^2 \alpha_l = 1$. One possible choice is
* $\delta_C(s_i)=\sum_{c \in C}\bigl(1-sim(s_i, c)\bigr)$

In [7]:
# Scratch cell: shows the type of an int literal. It assigns nothing, so no later cell depends on it.
type(1)

int

In [12]:
class ContraLexRank(BaseEstimator):
    """Pick the top-scoring sentences of each argument as its excerpt.

    Every sentence is scored as
    ``(1 - alpha) * centrality + alpha * argumentativeness`` and the
    highest-scoring sentences (at most two) are stored, best first, in the
    argument's ``excerpt`` attribute.

    Parameters
    ----------
    centrality_scorer : CentralityScorer
        Its ``transform`` is called in :meth:`predict`; the returned arguments
        must expose ``centrality_scores``.
    arg_scorer : ArgumentativenessScorer
        Its ``transform`` is called in :meth:`fit`.
    contra_scorer : ContrastivenessScorer
        Stored, but not used by ``fit`` or ``predict`` yet.
    alpha : float
        Weight of the argumentativeness score; centrality is weighted with
        ``1 - alpha``.
    """

    # Maximum number of sentences placed in an excerpt.
    _EXCERPT_SIZE = 2

    def __init__(self,
                 centrality_scorer: CentralityScorer,
                 arg_scorer: ArgumentativenessScorer,
                 contra_scorer: ContrastivenessScorer,
                 alpha: float):
        self.centrality_scorer = centrality_scorer
        self.arg_scorer = arg_scorer
        self.contra_scorer = contra_scorer
        self.alpha = alpha

    def fit(self, X, y=None):
        """Run the argumentativeness scorer over ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        """
        # Todo: Refactor as pipeline https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
        # The return value is not needed here. Presumably transform() annotates
        # the argument objects in place with `argumentativeness_scores`, which
        # predict() reads -- TODO confirm against ArgumentativenessScorer.
        self.arg_scorer.transform(X)
        return self

    def predict(self, X):
        """Attach an ``excerpt`` (list of sentences) to every argument.

        Returns the arguments as returned by ``centrality_scorer.transform``,
        each with its ``excerpt`` attribute set.

        Raises
        ------
        ValueError
            If an argument's centrality and argumentativeness scores do not
            have the same shape.
        """
        X = self.centrality_scorer.transform(X)
        for argument in X:
            centrality = np.asarray(argument.centrality_scores)
            argumentativeness = np.asarray(argument.argumentativeness_scores)
            if centrality.shape != argumentativeness.shape:
                # Fail with an explicit message instead of numpy's generic
                # broadcasting error.
                raise ValueError(
                    "centrality_scores and argumentativeness_scores must have "
                    "the same shape, got {} and {}".format(
                        centrality.shape, argumentativeness.shape))

            total_scores = ((1 - self.alpha) * centrality
                            + self.alpha * argumentativeness)
            # argsort is ascending: reverse it so the best sentence comes first.
            # Slicing also covers arguments with fewer than two sentences,
            # including none at all.
            best_first = np.argsort(total_scores)[::-1][:self._EXCERPT_SIZE]
            argument.excerpt = [argument.sentences[i] for i in best_first]
        return X

In [13]:
# Build the summariser with default-constructed scorers; centrality and
# argumentativeness are weighted equally (alpha = 0.5).
clr = ContraLexRank(
    centrality_scorer=CentralityScorer(),
    arg_scorer=ArgumentativenessScorer(),
    contra_scorer=ContrastivenessScorer(),
    alpha=0.5,
)

ContrastivenessScorer initialized.


In [14]:
# Fit and predict on the same small subset of arguments.
# The subset is sliced once, so fit() and predict() receive the very same list
# and the sample size is defined in a single place.
# NOTE(review): the recorded output of this cell is a broadcast ValueError
# (6 vs. 12 scores). Presumably scores were accumulated on the same argument
# objects by repeated runs; re-run on a fresh kernel to confirm.
N_ARGUMENTS = 20
arguments_subset = data.get_arguments()[:N_ARGUMENTS]

clr.fit(arguments_subset)
result = clr.predict(arguments_subset)

ValueError: operands could not be broadcast together with shapes (6,) (12,) 

In [70]:
# Show the generated excerpt of the fifth argument next to its `snippet` attribute
# (presumably the reference snippet shipped with the dataset -- TODO confirm).
result[4].excerpt, result[4].snippet

(['What this means is that: 1) Abortion is proposed as a singular type 2) However, according to my opponent, there is a difference in the morality/ethics of abortion, varying based on the time of incubation (meaning that there are two types of abortion) 3) Not getting pregnant is a singular type 4) My opponent judged that abortion is (ethically/morally) the equivalent not getting pregnant in the first place, despite there being black-and-white differences (i.e. singular term) in regards to the abortion in question C) Therefore, my opponent has to keep the morality/ethics of abortion either wrong or "okay" (due to the construction of the resolution requiring abortion to equal not getting pregnant), yet my opponent has argued that the ethics/morality of abortion varies!',
  'Having an abortion is the equivalent of not getting pregnant in the first place".'],
 ['There is no child death involved in not getting pregnant, yet there is always child death with abortion.',
  'Thus, abortion is 

In [29]:
# Index of the second-highest centrality score of the first argument
# (argsort sorts ascending, so position -2 is the runner-up).
np.argsort(result[0].centrality_scores)[-2]

2