In [3]:
import json
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from lexrank import LexRank
from multiprocessing import Process, Pool
from joblib import Parallel, delayed
import pickle

In [2]:
import requests
import urllib.parse

In [2]:
with open('data/filtered-w-reference-snippets-r_0.1-args-me.json', 'r') as f:
    d = json.load(f)

__Features__
* Text features (as TS-ISF)
    * unigrams, stemmed, stopwords removed
    * bigrams, stemmed, stopwords removed
    * entities, DBpedia spotlight
* Surface features
    * position, 1st, 2nd, 3rd, or later position
    * number of words
    * number of nouns (spacy)
    * tfisf, sum of the TS-ISF scores for unigrams composing the sentence.
    * btfisf, tfisf multiplied with 2(3) if a word appear in the first sentence
    * LexRank scores, How?
    
__Additional__
* Indicator whether the first token is a pronoun (snippet should be self-contained, [BarHaim.2020.b])
* Argumentativeness
* position, last, second-last, third-last, or earlier (arguments seem to have their sum-up at the end)

__Collect all words__ [Maybe for later](https://github.com/dwyl/english-words/)

In [3]:
%%time
vocabulary = set()
for argument in tqdm(d):
    tokens = word_tokenize(argument['premises'][0]['text'])
    vocabulary.update(tokens)

100%|███████████████████████████████████████████████████████████████████████████| 44279/44279 [03:00<00:00, 245.07it/s]

Wall time: 3min





In [4]:
len(vocabulary)

267809

# Text Features

## Uni- and bigrams

In [5]:
vectorizer = TfidfVectorizer(lowercase=True, 
                             preprocessor = lambda s: re.sub('[^A-Za-z,.?!]', '', s),
                             tokenizer=word_tokenize, 
                             stop_words='english', 
                             #ngram_range=(1,2), # Bigrams only for text features
                             #vocabulary=vocabulary
                            )

vectorizer

TfidfVectorizer(preprocessor=<function <lambda> at 0x0000025E13B1B8B8>,
                stop_words='english',
                tokenizer=<function word_tokenize at 0x0000025E70313AF8>)

In [6]:
vectorizer.fit(vocabulary)



TfidfVectorizer(preprocessor=<function <lambda> at 0x0000025E13B1B8B8>,
                stop_words='english',
                tokenizer=<function word_tokenize at 0x0000025E70313AF8>)

%%time
def trnsfrm(argument):
    sentences = argument['premises'][0]['sentences']
    ub_grams = vectorizer.transform(sentences)
    argument['premises'][0]['ub_grams'] = ub_grams
    
Parallel(n_jobs=4, require='sharedmem')(delayed(trnsfrm)(arg) for arg in d)

In [7]:
%%time
for argument in tqdm(d):
    sentences = argument['premises'][0]['sentences']
    ub_grams = vectorizer.transform(sentences)
    argument['premises'][0]['ub_grams'] = ub_grams

100%|████████████████████████████████████████████████████████████████████████████| 44279/44279 [08:16<00:00, 89.16it/s]

Wall time: 8min 16s





# Entities

See [DBpedia API](https://www.dbpedia-spotlight.org/api)

In [20]:
def spotting(argument):
    sentences = argument['premises'][0]['sentences']
    spotted = list()
    for i, s in enumerate(sentences):
        encoded = urllib.parse.quote(s)
        response = requests.get(f'https://api.dbpedia-spotlight.org/en/spot?text={encoded}',  headers={"accept":"application/json"})
        json_response = json.loads(response.text)
        if 'surfaceForm' in json_response['annotation']:
            spotted.append((i, json_response['annotation']['surfaceForm']['@name']))
    argument['premises'][0]['entities'] = spotted

## Surface Features

In [8]:
def position(argument):
    number_of_sents = len(argument['premises'][0]['sentences'])
    values = [3 if i > 2 else i for i in range(number_of_sents)]
    argument['premises'][0]['position'] = np.array(values)

In [9]:
def count_words(argument):
    counts = list()
    for s in argument['premises'][0]['sentences']:
        counts.append(len(word_tokenize(s)))
    argument['premises'][0]['word_counts'] = np.array(counts)

In [10]:
def count_nouns(argument):
    counts = list()
    for s in argument['premises'][0]['sentences']:
        tags = pos_tag(word_tokenize(s))
        count = sum([1 if 'NN' in t[1] else 0 for t in tags])
        counts.append(count)
    argument['premises'][0]['noun_counts'] = np.array(counts)

In [11]:
def tfisf(argument):
    n = len(argument['premises'][0]['sentences'])
    values = np.full(n, 0.0)
    for i in range(n):
        values[i] = np.sum(d[0]['premises'][0]['ub_grams'][:, i])
        
    argument['premises'][0]['tfisf'] = values

In [12]:
def btfisf(argument):
    n = len(argument['premises'][0]['sentences'])
    values = np.full(n, 0.0)
    first_sent = d[0]['premises'][0]['ub_grams'][:, 0]
    for i in range(n):
        if i == 0:
            values[i] = 3 * np.sum(d[0]['premises'][0]['ub_grams'][:, i])
        else:
            np.sum([w*3 if w in first_sent else w for w in d[0]['premises'][0]['ub_grams'][:, i]])
        
    argument['premises'][0]['btfisf'] = values

In [13]:
def lr(argument):
    sentences = argument['premises'][0]['sentences']
    lxr = LexRank(sentences)
    scores_cont = lxr.rank_sentences(
        sentences,
        threshold=None,
        fast_power_method=False,
    )
    assert len(sentences) == len(scores_cont), f'Scores do not match sentences. sents = {len(sentences)}, scores = {scores_cont}'
    argument['premises'][0]['lr'] = scores_cont

In [14]:
%%time
for argument in tqdm(d):
    position(argument)
    count_words(argument)
    count_nouns(argument)
    tfisf(argument)
    #btfisf(argument)
    lr(argument)

100%|██████████████████████████████████████████████████████████████████████████| 44279/44279 [1:19:34<00:00,  9.27it/s]

Wall time: 1h 19min 34s





In [15]:
import pickle

In [18]:
with open('data/features.json', 'wb') as f:
    pickle.dump(d, f)

In [None]:
%%time
for argument in tqdm(d):
    p0=Process(target=position, args=(argument,))
    p0.start()
    #p1=Process(target=count_words, args=(argument,))
    #p1.start()
    #p2=Process(target=count_nouns, args=(argument,))
    #p2.start()
    #p3=Process(target=tfisf, args=(argument,))
    #p3.start()
    #p4=Process(target=lr, args=(argument,))
    #p4.start()
    
    p0.join()
    #p1.join()
    #p2.join()
    #p3.join()
    #p4.join()

# Reversing

In [33]:
d[0]['premises'][0]['sentences']

['Why is it that so-called christians, Because there is no such a thing as a christian, Have serious trouble as READING and COMPREHENDING?',
 'Its not that difficult, Nor is it that hard.',
 'It was stated unto you a very simple "* "You are asking why God would forgive the murderer. "',
 'OK we"re done.',
 'You paid absolutely no attention whatsoever to the verses presented and instead went off into your own la la land. "',
 'But nah, All you did was babble on and on and on.',
 'So in this sense, It was YOU that forfeited.',
 'Sheesh!',
 'Bye.']

In [51]:
np.sum(d[0]['premises'][0]['ub_grams'][:, 0])

0.7208751658653264

In [46]:
vec = d[0]['premises'][0]['ub_grams']

In [31]:
vec.shape

(9, 26991)

In [32]:
vectorizer.inverse_transform(vec)

[array(['trouble', 'thing', 'so-called', 'reading', 'christians',
        'christian', '?', ','], dtype='<U129'),
 array(['hard', 'difficult', '.', ','], dtype='<U129'),
 array(['unto', 'stated', 'simple', 'murderer.', 'god', 'forgive',
        'asking', '``', '*'], dtype='<U129'),
 array(['ok', '.', "''"], dtype='<U129'),
 array(['whatsoever', 'went', 'verses', 'presented', 'paid', 'land.', 'la',
        'instead', 'attention', 'absolutely', '``'], dtype='<U129'),
 array(['nah', 'did', 'babble', '.', ','], dtype='<U129'),
 array(['sense', 'forfeited', '.', ','], dtype='<U129'),
 array(['sheesh', '!'], dtype='<U129'),
 array(['bye', '.'], dtype='<U129')]

In [34]:
fi = vec[0,:].nonzero()[1]
scores = zip(fi, [vec[0,x] for x in fi])

In [36]:
for w, s in [(vectorizer.get_feature_names()[i], s) for (i, s) in scores]:
    print(w,s)

trouble 0.320270212381216
thing 0.2991483041507577
so-called 0.320270212381216
reading 0.2991483041507577
christians 0.2991483041507577
christian 0.3079146881227244
? 0.320270212381216
, 0.5735855597845322
