Necessary imports

In [71]:
import os
import os.path as p
import pickle

import nltk
import pandas as pd
import plotly.express as px
import spacy as s
from nltk.tokenize import word_tokenize

from definitions import *
from dataset_helper_functions import *

Pre-process control switch.

In [72]:
# Pre-processing control switch; the data-loading cell branches on this flag.
process_again = True
# process_again = False

Global switches.

In [73]:
# Global switches. A value that already exists in the namespace is kept;
# otherwise the default below is used.
# NOTE(review): presumably a driver/master notebook defines these before
# running this notebook - confirm.
# Only NameError is caught so that unrelated errors (and KeyboardInterrupt)
# are not silently swallowed.
try:
    IS_MASTER
except NameError:
    IS_MASTER = False

try:
    ANALYZE
except NameError:
    ANALYZE = True

Configure spaCy and create the NLTK stopword set.

***It might be interesting to try POS features with and without negation words such as "no", "not" and "n't".***

In [74]:
# Load the large English spaCy pipeline ('en_core_web_trf' is the alternative
# to try for accuracy).
spacy = s.load('en_core_web_lg')

# English stopwords from the NLTK corpus, stored as a set.
stopwords = set(nltk.corpus.stopwords.words('english'))

The following code only needs to be run once at the start.

In [75]:
# combine_debates()
# create_validation_subset()
# sample_development_set()

In [76]:
if not IS_MASTER:
    data = {}

    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    # dataset name -> [source TSV path, pickle path of the frame with the 'spacy' column]
    data_paths = {
        'dev': [p.join(dev_path, 'dev.tsv'), p.join(dev_path, 'dev_spacy.pkl')],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'test', 'test_spacy.pkl')
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'train', 'train_spacy.pkl')
        ],
        # 'val': [
        #     p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        #     p.join(PROC_DATA_DIR_PATH, 'val', 'val_spacy.pkl')
        # ],
    }

    for dtype, dpaths in data_paths.items():
        tsv_path, pkl_path = dpaths

        # Create the development set if its TSV is missing, regardless of the
        # switch (previously a missing file crashed the re-processing branch).
        # NOTE(review): assumes sample_development_set() writes dev.tsv - confirm.
        if dtype == 'dev' and not p.exists(tsv_path):
            sample_development_set()

        if not process_again and p.exists(pkl_path):
            # Reuse the cached frame, which already holds the 'spacy' column,
            # so the expensive spaCy pass is skipped.
            # NOTE: unpickling can execute arbitrary code; only load pickles
            # written by this project.
            data[dtype] = pd.read_pickle(pkl_path)
        else:
            data[dtype] = pd.read_csv(tsv_path, sep='\t', index_col=False)

Apply spaCy to the content after tokenizing it with NLTK and re-joining the tokens with spaces. This extra step is done to make use of the more sensitive NLTK tokenizer.

In [77]:
# Add a 'spacy' column to every frame that does not have one yet and pickle
# the extended frame to the second path registered for that dataset.
for dtype, df in data.items():
    if 'spacy' in df.columns:
        continue

    # NLTK tokens are re-joined with single spaces before spaCy parses them.
    df['spacy'] = [
        spacy(' '.join(word_tokenize(text)))
        for text in df['content'].values
    ]
    df.to_pickle(data_paths[dtype][1])

Deserialize the pickled spaCy features that were selected with the following thresholds:
```python
    {
        'min_occ': {
            'wstop': .01,
            'wostop': .005,
        },
        'min_ratio_diff': {
            'wstop': .8,
            'wostop': .5,
        }
    }
```

In [78]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
selected_features_file = p.join(PROC_DATA_DIR_PATH, 'selected_spacy_features.pkl')
with open(selected_features_file, 'rb') as pkl_file:
    selected_features = pickle.load(pkl_file)

print(selected_features)

{'pos': {'wstop': array(['NUM', 'PROPN', 'NOUN', 'ADJ'], dtype=object), 'wostop': array(['NUM', 'SYM'], dtype=object)}, 'tag': {'wstop': array(['CD', 'NNP', 'VBG', 'NN', 'NNS', 'JJ'], dtype=object), 'wostop': array(['CD', '$'], dtype=object)}, 'dep': {'wstop': array(['pobj', 'compound', 'npadvmod', 'amod', 'nummod'], dtype=object), 'wostop': array(['pcomp', 'quantmod', 'nummod', 'nsubjpass'], dtype=object)}}


Function for creating stylometric features (POS, tag or dependency, depending on `feat_type`).

- `selection` is a dict mapping each selected attribute value (e.g. a POS tag) to its position in the feature vector.
- `is_one_hot` switches between one-hot encoding and counts.

In [79]:
def create_stylo_feat(feat_type, sent, selection, is_one_hot, count_bits=6):
    """Build a stylometric feature vector for one parsed sentence.

    Args:
        feat_type: Token attribute to inspect, without the trailing underscore
            (e.g. 'pos' reads ``token.pos_``).
        sent: Iterable of tokens exposing the ``<feat_type>_`` attribute.
        selection: Dict mapping an attribute value to its position in the
            feature vector.
        is_one_hot: If true, flag the presence (0/1) of every selected value;
            otherwise count occurrences and encode each count in binary.
        count_bits: Number of bits used per encoded count. Counts that do not
            fit are saturated at ``2**count_bits - 1`` so that every vector
            has the same length.

    Returns:
        A list of ints: ``len(selection)`` flags when ``is_one_hot`` is true,
        otherwise ``len(selection) * count_bits`` bits, most significant bit
        first within each count.
    """
    # feat type with underscore to match token attributes
    ftwu = f'{feat_type}_'
    feature = [0] * len(selection)

    for t in sent:
        t_attr = getattr(t, ftwu)

        if t_attr in selection:
            if is_one_hot:
                feature[selection[t_attr]] = 1
            else:
                feature[selection[t_attr]] += 1

    # encode counts to fixed-width binary and flatten
    if not is_one_hot:
        # Without saturation a count >= 2**count_bits would be rendered with
        # extra digits and produce a longer (ragged) feature vector.
        max_count = 2 ** count_bits - 1
        feature = [
            int(b)
            for count in feature
            for b in f'{min(count, max_count):0{count_bits}b}'
        ]

    return feature

Prepare path for features.

In [80]:
features_path = p.join(PROC_DATA_DIR_PATH, 'features')
# makedirs(exist_ok=True) creates missing parent directories too and does
# nothing when the directory already exists, so no separate existence check
# is needed.
os.makedirs(features_path, exist_ok=True)

Prepare feature selection map used in creating stylometric features.

In [81]:
# feat_type -> stop_type -> {attribute value: position in the feature vector}
selected_features_map = {
    feat_type: {
        stop_type: {value: position for position, value in enumerate(selection)}
        for stop_type, selection in by_stop_type.items()
    }
    for feat_type, by_stop_type in selected_features.items()
}

Create and save stylometric features.

In [82]:
# (is_one_hot flag, column-name suffix) of the two encodings, in output order.
encodings = ((True, 'one_hot'), (False, 'count'))

for dtype, df in data.items():
    spacy_col = df['spacy'].values
    features_df = df.loc[:, ['id']]

    for feat_type, by_stop_type in selected_features.items():
        for stop_type in by_stop_type:
            selection_map = selected_features_map[feat_type][stop_type]

            for is_one_hot, suffix in encodings:
                # one row per sentence, one column per feature element
                encoded_df = pd.DataFrame([
                    create_stylo_feat(feat_type, sent, selection_map, is_one_hot)
                    for sent in spacy_col
                ])
                col_base = f'{feat_type}_{stop_type}_{suffix}'
                encoded_df.columns = [
                    f'{col_base}_{i}' for i in range(encoded_df.shape[1])
                ]

                # index-aligned join onto the frame that carries the 'id' column
                features_df = features_df.merge(
                    encoded_df, left_index=True, right_index=True
                )

    # TODO: decide whether tsv or pickle is better
    tsv_target = p.join(features_path, f'{dtype}_stylometric_features.tsv')
    features_df.to_csv(tsv_target, sep='\t', index=False)

    pkl_target = p.join(features_path, f'{dtype}_stylometric_features.pkl')
    features_df.to_pickle(pkl_target)

In [33]:
# temp = dev.loc[0, 'content']
# x = spacy(temp)
# temp = dev[['id', 'content']]
# temp['spacied'] = temp['content'].apply(lambda x: spacy(x))
# # temp = ['tag_ | pos_ | dep_ | lemma_ | norm_']
# # for t in x:
# #     temp.append(f'{t.tag_} | {t.pos_} | {t.dep_} | {t.lemma_} | {t.norm_}')
# print(temp.loc[0, 'spacied'][0].tag_)
# [(i.text, i.pos_, i.) for i in x]

# x = [0, 9, 2, 1, 5, 67, 54]

# feature = [int(b) for binary in x for b in f'{binary:06b}']
# # len(x)
# feature

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0]