In [6]:
import os.path as p
import pandas as pd
import nltk
import pickle
import spacy as s
from spacy import displacy
import plotly.express as px
import plotly.graph_objects as go

from nltk.tokenize import word_tokenize
from definitions import *
from dataset_helper_functions import *

In [7]:
nlp = s.load('en_core_web_lg')

sent = 'My car is faster than the other cars.'
# sent = 'You have said that it would undermine who we are as Americans, shutting our doors.'
# sent = 'There is a pretty excruciating process that refugees go through.'
sent = 'There are two economic realities in America today.'
sent = 'We must invest in our people.'
# sent = 'I prefer the morning flight through Denver.'
nlpd = nlp(sent)

# for t in nlpd:
#     print(t.text, ': ', t.pos_, ' | ', t.tag_)
for t in nlpd:
    print(t.text, ': ', t.dep_, ' | head: ', t.head.text)

# displacy.serve(nlpd, style="dep")


We :  nsubj  | head:  invest
must :  aux  | head:  invest
invest :  ROOT  | head:  invest
in :  prep  | head:  invest
our :  poss  | head:  people
people :  pobj  | head:  in
. :  punct  | head:  invest


In [8]:
tags = nlp.get_pipe('parser').labels

for t in tags:
    print(t, ': ', s.explain(t))

ROOT :  None
acl :  clausal modifier of noun (adjectival clause)
acomp :  adjectival complement
advcl :  adverbial clause modifier
advmod :  adverbial modifier
agent :  agent
amod :  adjectival modifier
appos :  appositional modifier
attr :  attribute
aux :  auxiliary
auxpass :  auxiliary (passive)
case :  case marking
cc :  coordinating conjunction
ccomp :  clausal complement
compound :  compound
conj :  conjunct
csubj :  clausal subject
csubjpass :  clausal subject (passive)
dative :  dative
dep :  unclassified dependent
det :  determiner
dobj :  direct object
expl :  expletive
intj :  interjection
mark :  marker
meta :  meta modifier
neg :  negation modifier
nmod :  modifier of nominal
npadvmod :  noun phrase as adverbial modifier
nsubj :  nominal subject
nsubjpass :  nominal subject (passive)
nummod :  numeric modifier
oprd :  object predicate
parataxis :  parataxis
pcomp :  complement of preposition
pobj :  object of preposition
poss :  possession modifier
preconj :  pre-correlati

Global switches.

In [9]:
try:
    IS_MASTER
except: 
    IS_MASTER = False

if not IS_MASTER:
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')
    dev_tsv_path = p.join(dev_path, 'dev.tsv')
    dev_pkl_path = p.join(dev_path, 'dev_spacy.pkl')

    # if p.exists(dev_pkl_path):
    #     dev = pd.read_pickle(dev_pkl_path)
    # else:
    #     if not p.exists(dev_tsv_path):
    #         sample_development_set()
    # # train_path = os.path.join(POLIT_DATA_DIR_PATH, 'train', 'after_leak_train_combined.tsv')
    #     dev = pd.read_csv(dev_tsv_path, sep='\t', index_col=False)
    dev = sample_development_set(should_return=True, train_filename='after_leak_train_combined.tsv')

print(len(dev[dev['label'] == 1]))
print(len(dev[dev['label'] == 0]))

440
440


Configure spacy, create nltk stopwords set.

***might be interesting to try POS features with and without negation words like no, not, n't***

In [10]:
spacy = s.load('en_core_web_lg') # en_core_web_trf for accuracy
stopwords = set(nltk.corpus.stopwords.words('english'))
# print(stopwords)

Apply spacy to content tokenized with nltk and joined. Extra step is done to utilise more sensitive nltk tokenizer.

In [11]:
# print(spacy.get_pipe('parser').labels)
# print(spacy.get_pipe('tagger').labels)

sent = dev.loc[0, 'content']
xx = spacy(sent)
print(sent)
print(len([t for t in xx if not t.is_punct]))
print(len(xx))
# x = [t.pos_ for t in spacy(sent)]
# print(x)
# y = [t.pos_ for t in spacy(' '.join([w for w in nltk.word_tokenize(sent) if w not in stopwords]))]
# print(y)

Thirty-six years ago, Ronald Reagan and George H.W.
9
11


In [12]:
if 'spacy' not in dev.columns:
    dev['spacy'] = dev['content'].apply(
        lambda x: spacy(
            ' '.join([t for t in word_tokenize(x)])
        )
    ).values
    print('temp')
    dev.to_pickle(p.join(dev_path, 'dev_spacy.pkl'))

temp


POS tags analysis
- `pos_` represents coarse-grained POS tag described [here](https://universaldependencies.org/u/pos/)
- `tag_` represents fine-grained POS tag

Initialize pos and tag dataframes and dataframe holding number of sentences each tag appears in.

In [13]:
pos_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)
tag_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)
dep_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)

# this might not make much sense since almost all sentences have most of the tags
# TODO: ask about this
# pos_sent_count = dict()
# tag_sent_count = dict()
# sent_count = pd.DataFrame(index=['pos', ''])

Result of counting occurances of tags in sentences

```
{
    'ADP': 654, 'NUM': 654, 'PROPN': 651, 'VERB': 654, 
    'SCONJ': 654, 'PRON': 654, 'NOUN': 654, 'AUX': 654, 
    'DET': 654, 'ADJ': 654, 'ADV': 652, 'PART': 653, 
    'CCONJ': 653, 'INTJ': 623, 'SYM': 643
}
```

Fill pos and tag dataframes.

To make `'wstop'` represent all the words including stopwords, `'wostop'` needs to be added.

In [14]:
# sentence_count = 1
for _, sent, label in dev[['spacy', 'label']].itertuples():
    worthiness = 'worthy' if label == 1 else 'unworthy'

    for t in sent:
        # ignore punctuation
        if t.is_punct or t.pos_ == 'PUNCT' or t.pos_ == 'X':
            continue

        # pos_sent_count[t.pos_] = sentence_count
        # tag_sent_count[t.tag_] = sentence_count

        stop = 'wstop' if t.is_stop or t.lower_ in stopwords else 'wostop'

        if t.pos_ in pos_df.columns:
            pos_df.loc[(worthiness, stop), t.pos_] += 1
        else:
            pos_df.loc[:, t.pos_] = 0
            pos_df.loc[(worthiness, stop), t.pos_] = 1

        if t.tag_ in tag_df.columns:
            tag_df.loc[(worthiness, stop), t.tag_] += 1
        else:
            tag_df.loc[:, t.tag_] = 0
            tag_df.loc[(worthiness, stop), t.tag_] = 1

        if t.dep_ in dep_df.columns:
            dep_df.loc[(worthiness, stop), t.dep_] += 1
        else:
            dep_df.loc[:, t.dep_] = 0
            dep_df.loc[(worthiness, stop), t.dep_] = 1
    
    # sentence_count += 1

pos_df.loc['worthy', 'wstop'] = pos_df.loc['worthy', 'wstop'] + pos_df.loc['worthy', 'wostop']
tag_df.loc['worthy', 'wstop'] = tag_df.loc['worthy', 'wstop'] + tag_df.loc['worthy', 'wostop']
dep_df.loc['worthy', 'wstop'] = dep_df.loc['worthy', 'wstop'] + dep_df.loc['worthy', 'wostop']

Create simple dataframe 

In [15]:
# print(set(pos_df.index.get_level_values(1)))

Rearange data for easier usage for graphing.

In [16]:
comp_dfs = {'pos': pos_df, 'tag': tag_df, 'dep': dep_df}

for cat, df in comp_dfs.items():
    for cat_s in set(df.index.get_level_values(1)):
        im_df = df.xs(cat_s, level=1, drop_level=True).T.reset_index(level=0).rename(columns={'index': cat})
        im_df['sum'] = im_df['worthy'] + im_df['unworthy']
        im_df['worthy_ratio'] = im_df['worthy'] / im_df['sum']
        im_df['unworthy_ratio'] = im_df['unworthy'] / im_df['sum']
        im_df = im_df.fillna(0)

        if type(comp_dfs[cat]) != dict:
            comp_dfs[cat] = {cat_s: im_df}
        else:
            comp_dfs[cat][cat_s] = im_df

Select those POS tags which are the best discriminants between worthy and unworthy sentences.

Conditions:
- at least `min_occ` occurances
- at least `min_ratio_diff` ratio difference

In [73]:
selected_feats = {
    'pos': {
        'wstop': None,
        'wostop': None,
    },
    'tag': {
        'wstop': None,
        'wostop': None,
    },
    'dep': {
        'wstop': None,
        'wostop': None,
    }
}
thresholds = {
    'min_occ': {
        'wstop': .02,
        'wostop': .005,
    },
    'min_ratio_diff': {
        'wstop': .75,
        'wostop': .5,
    }
}

Create figures depicting counts and ratios of each coarse- and fine-grained POS tags in both worthy and unworthy sentences.

In [85]:
x = comp_dfs['pos']['wstop']
x['Worthy Percentage'] = x['worthy_ratio']
x['Unworthy Percentage'] = x['unworthy_ratio']
x['Check-worthy/-unworthy Fractions Difference'] = (x['worthy_ratio'] - x['unworthy_ratio']).abs()
print(x)
x['Occurrence Percentage'] = x['occ_frac']
# for feat_type, stop_dict in comp_dfs.items():
#     # for stop_type, df in stop_dict.items():
#     df = stop_dict['wstop']
min_occ = thresholds['min_occ']['wstop']
min_ratio_diff = thresholds['min_ratio_diff']['wstop']
fig_count = px.bar(
    x,
    x='POS Tags',
    y='Occurrence Percentage',
    title=None,
    text_auto='.3f'
) # title=f'{feat_type} - {stop_type} count'
fig_ratio = px.bar(
    x,
    x='POS Tags',
    # y=['unworthy_ratio', 'worthy_ratio'],
    y='Check-worthy/-unworthy Fractions Difference', #['Unworthy Percentage', 'Worthy Percentage'],
    title=None,
    text_auto='.2f'
)
fig_count.add_shape(type="line",
    x0=-0.5, y0=min_occ, x1=14.5, y1=min_occ,
    line=dict(
        color="DarkOrange",
        width=2,
        dash="dash",
    )
)
fig_ratio.add_shape(type="line",
    x0=-0.5, y0=min_ratio_diff, x1=14.5, y1=min_ratio_diff,
    line=dict(
        color="DarkOrange",
        width=2,
        dash="dash",
    )
)
fig_count.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig_ratio.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig_count.update(layout_title_text=None,#'Class imbalance in political debates data.',
           layout_showlegend=False)
fig_count.show()
fig_ratio.show()
#         # break
#     break
# fig_w_sum = px.bar(comp_dfs['pos']['wstop'], x='pos', y=['worthy', 'unworthy', 'sum'], title='pos tags')
# fig_percent = 
# # fig_w_sum.show()
# fig_percent.show()

      pos  worthy  unworthy   sum  worthy_ratio  unworthy_ratio  occ_frac  \
0     NUM     278        31   309      0.899676        0.100324  0.026280   
1    NOUN    1503        21  1524      0.986220        0.013780  0.129614   
2     ADV     426       223   649      0.656394        0.343606  0.055196   
3   PROPN     516        14   530      0.973585        0.026415  0.045076   
4   CCONJ     264       232   496      0.532258        0.467742  0.042184   
5    PRON     886       805  1691      0.523950        0.476050  0.143817   
6    VERB    1192       295  1487      0.801614        0.198386  0.126467   
7     DET     818       536  1354      0.604136        0.395864  0.115156   
8     AUX     509       418   927      0.549083        0.450917  0.078840   
9     ADJ     535        57   592      0.903716        0.096284  0.050349   
10    ADP     868       503  1371      0.633115        0.366885  0.116601   
11   PART     207       228   435      0.475862        0.524138  0.036996   

results `r` for thresholds `t`:
- `t`:
```json
    {
        'min_occ': {
            'wstop': .01,
            'wostop': .005,
        },
        'min_ratio_diff': {
            'wstop': .8,
            'wostop': .5,
        }
    }
```
- `r`:
```py
    pos
        wstop
            ['NUM', 'PROPN', 'NOUN', 'ADJ']
        wostop
            ['NUM', 'SYM']
    tag
        wstop
            ['CD', 'NNP', 'VBG', 'NN', 'NNS', 'JJ']
        wostop
            ['CD', '$']
    dep
        wstop
            ['pobj', 'compound', 'amod', 'nummod']
        wostop
            ['pcomp', 'quantmod', 'nummod', 'nsubjpass']
```


In [74]:
for pos_type, stop_dict in comp_dfs.items():
    for stop_type, df in stop_dict.items():
        df['ratio_diff'] = np.abs(df['worthy_ratio'] - df['unworthy_ratio'])
        df['occurances'] = df['sum'] / (df['sum'].sum())

        selected_feats[pos_type][stop_type] = df.loc[
            (df['ratio_diff'] >= thresholds['min_ratio_diff'][stop_type]) & 
            (df['occurances'] >= thresholds['min_occ'][stop_type]),
            pos_type
        ].values

# for k, v in selected_feats.items():
#     print(k)
#     for key, value in v.items():
#         print(key)
#         print(list(value))
print(selected_feats)
# with open(p.join(PROC_DATA_DIR_PATH, 'selected_spacy_features.pkl'), 'wb') as f:
#     pickle.dump(selected_feats, f)

{'pos': {'wstop': array(['NUM', 'NOUN', 'PROPN', 'ADJ'], dtype=object), 'wostop': array(['NUM', 'SYM'], dtype=object)}, 'tag': {'wstop': array(['CD', 'NNS', 'NNP', 'NN', 'JJ'], dtype=object), 'wostop': array(['CD', 'JJS', '$'], dtype=object)}, 'dep': {'wstop': array(['compound', 'amod', 'pobj'], dtype=object), 'wostop': array(['nummod', 'quantmod', 'nsubjpass', 'acl'], dtype=object)}}


***dep should be analyzed more indepth***

In [23]:
for _, sent, label in dev[['spacy', 'label']].itertuples():
    for token in sent:
        print(token.pos_, token.tag_)
        # print(token.head.pos_, token.dep_, token.pos_)
        # print(token.head.pos, token.dep, token.pos)
        # print([int(b) if b != ' ' else 0 for b in f'{token.dep:10b}'])
        # print([int(b) if b != ' ' else 0 for b in f"{int(f'{token.dep}'[-3:]):10b}"])
        # print(type(token.dep))

    break


ADP IN
NUM CD
PUNCT ,
PROPN NNP
VERB VBD
PROPN NNP
PROPN NNP
PUNCT ,
VERB VBG
SCONJ IN
PROPN NNP
VERB VBD
PRON PRP$
NOUN NN
SCONJ IN
ADP IN
NOUN NN
PUNCT HYPH
NOUN NN
PUNCT .


In [28]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

lb = LabelBinarizer()
oh = OneHotEncoder()
lb.fit(spacy.get_pipe('parser').labels)
# oh.fit(spacy.get_pipe('parser').labels)
i = 0
for _, sent, label in dev[['spacy', 'label']].itertuples():

    tokens_deps = []
    for token in sent:
        tokens_deps.append(token.dep_)
        # print(lb.transform([token.dep_]))
    print(len(lb.transform(tokens_deps)[0]))
    print(oh.fit(tokens_deps))
    print(len(sent))
    break
    # if i == 2:
    #     break

    # i += 1


45


AttributeError: 'OneHotEncoder' object has no attribute 'fit_transorm'

In [31]:
x = np.asarray([i for i in range(10)])

print(type(x))
print(type(x + 2))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
