In [1]:
import os.path as p
import pandas as pd
import nltk
import pickle
import spacy as s
import plotly.express as px
import plotly.graph_objects as go

from nltk.tokenize import word_tokenize
from definitions import *
from dataset_helper_functions import *

Global switches.

In [2]:
try:
    IS_MASTER
except: 
    IS_MASTER = False

if not IS_MASTER:
    # train_path = p.join(POLIT_DATA_DIR_PATH, 'train')
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')
    dev_tsv_path = p.join(dev_path, 'dev.tsv')
    dev_pkl_path = p.join(dev_path, 'dev_spacy.pkl')

    if p.exists(dev_pkl_path):
        dev = pd.read_pickle(dev_pkl_path)
    else:
        if not p.exists(dev_tsv_path):
            sample_development_set()

        dev = pd.read_csv(dev_tsv_path, sep='\t', index_col=False)
    

Configure spacy, create nltk stopwords set.

***might be interesting to try POS features with and without negation words like no, not, n't***

In [96]:
spacy = s.load('en_core_web_lg') # en_core_web_trf for accuracy
stopwords = set(nltk.corpus.stopwords.words('english'))
# print(stopwords)

Apply spacy to content tokenized with nltk and joined. Extra step is done to utilise more sensitive nltk tokenizer.

In [84]:
if 'spacy' not in dev.columns:
    dev['spacy'] = dev['content'].apply(
        lambda x: spacy(
            ' '.join([t for t in word_tokenize(x)])
        )
    ).values
    print('temp')
    dev.to_pickle(p.join(dev_path, 'dev_spacy.pkl'))

POS tags analysis
- `pos_` represents coarse-grained POS tag described [here](https://universaldependencies.org/u/pos/)
- `tag_` represents fine-grained POS tag

Initialize pos and tag dataframes and dataframe holding number of sentences each tag appears in.

In [85]:
pos_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)
tag_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)
dep_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [
            ['worthy', 'unworthy'],
            ['wstop', 'wostop'],
        ]
    ),
)

# this might not make much sense since almost all sentences have most of the tags
# TODO: ask about this
# pos_sent_count = dict()
# tag_sent_count = dict()
# sent_count = pd.DataFrame(index=['pos', ''])

Result of counting occurances of tags in sentences

```
{
    'ADP': 654, 'NUM': 654, 'PROPN': 651, 'VERB': 654, 
    'SCONJ': 654, 'PRON': 654, 'NOUN': 654, 'AUX': 654, 
    'DET': 654, 'ADJ': 654, 'ADV': 652, 'PART': 653, 
    'CCONJ': 653, 'INTJ': 623, 'SYM': 643
}
```

Fill pos and tag dataframes.

To make `'wstop'` represent all the words including stopwords, `'wostop'` needs to be added.

In [86]:
# sentence_count = 1
for _, sent, label in dev[['spacy', 'label']].itertuples():
    worthiness = 'worthy' if label == 1 else 'unworthy'

    for t in sent:
        # ignore punctuation
        if t.is_punct or t.pos_ == 'PUNCT' or t.pos_ == 'X':
            continue

        # pos_sent_count[t.pos_] = sentence_count
        # tag_sent_count[t.tag_] = sentence_count

        stop = 'wstop' if t.is_stop or t.lower_ in stopwords else 'wostop'

        if t.pos_ in pos_df.columns:
            pos_df.loc[(worthiness, stop), t.pos_] += 1
        else:
            pos_df.loc[:, t.pos_] = 0
            pos_df.loc[(worthiness, stop), t.pos_] = 1

        if t.tag_ in tag_df.columns:
            tag_df.loc[(worthiness, stop), t.tag_] += 1
        else:
            tag_df.loc[:, t.tag_] = 0
            tag_df.loc[(worthiness, stop), t.tag_] = 1

        if t.dep_ in dep_df.columns:
            dep_df.loc[(worthiness, stop), t.dep_] += 1
        else:
            dep_df.loc[:, t.dep_] = 0
            dep_df.loc[(worthiness, stop), t.dep_] = 1
    
    # sentence_count += 1

pos_df.loc['worthy', 'wstop'] = pos_df.loc['worthy', 'wstop'] + pos_df.loc['worthy', 'wostop']
tag_df.loc['worthy', 'wstop'] = tag_df.loc['worthy', 'wstop'] + tag_df.loc['worthy', 'wostop']
dep_df.loc['worthy', 'wstop'] = dep_df.loc['worthy', 'wstop'] + dep_df.loc['worthy', 'wostop']

Create simple dataframe 

In [7]:
# print(set(pos_df.index.get_level_values(1)))

Rearange data for easier usage for graphing.

In [87]:
comp_dfs = {'pos': pos_df, 'tag': tag_df, 'dep': dep_df}

for cat, df in comp_dfs.items():
    for cat_s in set(df.index.get_level_values(1)):
        im_df = df.xs(cat_s, level=1, drop_level=True).T.reset_index(level=0).rename(columns={'index': cat})
        im_df['sum'] = im_df['worthy'] + im_df['unworthy']
        im_df['worthy_ratio'] = im_df['worthy'] / im_df['sum']
        im_df['unworthy_ratio'] = im_df['unworthy'] / im_df['sum']
        im_df = im_df.fillna(0)

        if type(comp_dfs[cat]) != dict:
            comp_dfs[cat] = {cat_s: im_df}
        else:
            comp_dfs[cat][cat_s] = im_df

Create figures depicting counts and ratios of each coarse- and fine-grained POS tags in both worthy and unworthy sentences.

In [88]:
for feat_type, stop_dict in comp_dfs.items():
    for stop_type, df in stop_dict.items():
        # print(df)
        fig_count = px.bar(df, x=feat_type, y=['worthy', 'unworthy'], title=f'{feat_type} - {stop_type} count')
        fig_ratio = px.bar(
            df,
            x=feat_type,
            y=['worthy_ratio', 'unworthy_ratio'],
            title=f'{feat_type} - {stop_type} ratio'
        )
        fig_count.show()
        fig_ratio.show()
# fig_w_sum = px.bar(comp_dfs['pos']['wstop'], x='pos', y=['worthy', 'unworthy', 'sum'], title='pos tags')
# fig_percent = 
# # fig_w_sum.show()
# fig_percent.show()

Select those POS tags which are the best discriminants between worthy and unworthy sentences.

Conditions:
- at least `min_occ` occurances
- at least `min_ratio_diff` ratio difference

In [93]:
selected_feats = {
    'pos': {
        'wstop': None,
        'wostop': None,
    },
    'tag': {
        'wstop': None,
        'wostop': None,
    },
    'dep': {
        'wstop': None,
        'wostop': None,
    }
}
thresholds = {
    'min_occ': {
        'wstop': .01,
        'wostop': .005,
    },
    'min_ratio_diff': {
        'wstop': .8,
        'wostop': .5,
    }
}

results `r` for thresholds `t`:
- `t`:
```json
    {
        'min_occ': {
            'wstop': .01,
            'wostop': .005,
        },
        'min_ratio_diff': {
            'wstop': .8,
            'wostop': .5,
        }
    }
```
- `r`:
```py
    pos
        wstop
            ['NUM', 'PROPN', 'NOUN', 'ADJ']
        wostop
            ['NUM', 'SYM']
    tag
        wstop
            ['CD', 'NNP', 'VBG', 'NN', 'NNS', 'JJ']
        wostop
            ['CD', '$']
    dep
        wstop
            ['pobj', 'compound', 'amod', 'nummod']
        wostop
            ['pcomp', 'quantmod', 'nummod', 'nsubjpass']
```


In [95]:
for pos_type, stop_dict in comp_dfs.items():
    for stop_type, df in stop_dict.items():
        df['ratio_diff'] = np.abs(df['worthy_ratio'] - df['unworthy_ratio'])
        df['occurances'] = df['sum'] / (df['sum'].sum())

        selected_feats[pos_type][stop_type] = df.loc[
            (df['ratio_diff'] >= thresholds['min_ratio_diff'][stop_type]) & 
            (df['occurances'] >= thresholds['min_occ'][stop_type]),
            pos_type
        ].values

# for k, v in selected_feats.items():
#     print(k)
#     for key, value in v.items():
#         print(key)
#         print(list(value))
with open(p.join(PROC_DATA_DIR_PATH, 'selected_spacy_features.pkl'), 'wb') as f:
    pickle.dump(selected_feats, f)

***dep should be analyzed more indepth***

In [31]:
for _, sent, label in dev[['spacy', 'label']].itertuples():
    for t in sent:
        print(t.dep_)
    break

nsubj
ROOT
det
amod
attr
punct
advmod
det
nsubj
prep
pobj
aux
relcl
prt
aux
advcl
prep
pobj
prep
det
compound
pobj
cc
conj
prep
det
pobj
npadvmod
punct
