Necessary imports

In [6]:
import os
import os.path as p
import pickle

import nltk
import pandas as pd
import plotly.express as px
import spacy as s
from nltk.tokenize import word_tokenize

from definitions import *
from dataset_helper_functions import *

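Global switches. `IS_MASTER` and `ANALYZE` keep whatever value is already defined in the kernel; if they are not defined yet, they default to `False` and `True` respectively.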

In [2]:
# Keep any switch value that is already defined in the kernel namespace and
# fall back to the default only when the name is missing. Catching NameError
# specifically (instead of a bare `except:`) avoids swallowing unrelated
# errors such as KeyboardInterrupt.

# IS_MASTER: when False, this notebook loads the development set itself.
try:
    IS_MASTER
except NameError:
    IS_MASTER = False

# ANALYZE: defaults to True when not already defined.
try:
    ANALYZE
except NameError:
    ANALYZE = True


Configure spacy, create nltk stopwords set.

***TODO: it might be interesting to try the POS features both with and without negation words such as "no", "not" and "n't".***

In [3]:
# Load the 'en_core_web_lg' pipeline; 'en_core_web_trf' is the alternative to use for accuracy.
spacy = s.load('en_core_web_lg')
# English stopwords from NLTK, stored as a set.
stopwords = set(nltk.corpus.stopwords.words('english'))
# print(stopwords)

{'did', "mightn't", 'such', "shouldn't", "doesn't", 'above', 'why', 'll', 'didn', 'whom', 'her', "you'll", 'them', 'then', 'so', 'needn', 'all', 'was', 'himself', 'd', 'can', 'he', 'and', 'there', 'she', 'when', 'who', "shan't", 'ourselves', 'of', 'most', 'while', "should've", 'how', 'once', 'haven', "didn't", 'any', "mustn't", 'only', "haven't", 'where', "aren't", 'just', 'each', 'through', 'don', 'ma', 'which', "couldn't", 'very', 'isn', 'their', 'have', 'mightn', 'no', 'o', "she's", 'from', 'before', 'me', 'those', 'my', 'at', 'm', 'shan', "weren't", "needn't", 'off', 'on', 'own', "hasn't", 't', 'should', 'do', 'ours', 'am', 'yours', 'our', "you've", 'few', 'him', 'being', 'over', 'other', 'not', 'y', 'more', 'theirs', 'same', 'these', 'to', 'hadn', 'your', 'hers', 'that', 'again', 'into', 's', 'will', 're', 'shouldn', 'against', 'were', 'its', 'by', "isn't", 'they', "that'll", 'doing', 'are', 'about', 'had', 'but', 'mustn', 'weren', 'yourselves', 'nor', 'itself', "won't", 'after', 

The following code only needs to be run once at the start.

In [4]:
# One-time dataset preparation calls, kept commented out; uncomment and run
# them only once at the start.
# combine_debates()
# create_validation_subset()
# sample_development_set()

In [12]:
# Load the development set unless IS_MASTER is set.
# The pickled frame is preferred when it exists; otherwise the TSV is read,
# sampling the development set first if the TSV is missing.
if not IS_MASTER:
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')
    dev_tsv_path = p.join(dev_path, 'dev.tsv')
    dev_pkl_path = p.join(dev_path, 'dev_spacy.pkl')

    if not p.exists(dev_pkl_path):
        # No pickle yet: make sure the TSV exists, then read it.
        if not p.exists(dev_tsv_path):
            sample_development_set()

        dev = pd.read_csv(dev_tsv_path, sep='\t', index_col=False)
    else:
        dev = pd.read_pickle(dev_pkl_path)

Apply spaCy to the content after tokenizing it with NLTK and re-joining the tokens with spaces. This extra step is done to take advantage of the more sensitive NLTK tokenizer.

In [13]:
# Parse the content only when the 'spacy' column is missing, then pickle the
# frame so the parsing does not have to be repeated on the next run.
if 'spacy' not in dev.columns:
    # Tokenize with NLTK, re-join the tokens with single spaces, and feed
    # the resulting string to the spaCy pipeline.
    dev['spacy'] = dev['content'].apply(
        lambda text: spacy(' '.join(word_tokenize(text)))
    ).values
    dev.to_pickle(p.join(dev_path, 'dev_spacy.pkl'))

Deserialize the pickled selected spaCy features with thresholds:
```python
    {
        'min_occ': {
            'wstop': .01,
            'wostop': .005,
        },
        'min_ratio_diff': {
            'wstop': .8,
            'wostop': .5,
        }
    }
```

In [8]:
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# files produced by this project.
selected_features_path = p.join(PROC_DATA_DIR_PATH, 'selected_spacy_features.pkl')
with open(selected_features_path, 'rb') as pkl_file:
    selected_features = pickle.load(pkl_file)

print(selected_features)
# pos_sel = selected_features['pos']
# pos_sel = {v: i for i, v in enumerate(pos_sel)}

{'pos': {'wstop': array(['NUM', 'PROPN', 'NOUN', 'ADJ'], dtype=object), 'wostop': array(['NUM', 'SYM'], dtype=object)}, 'tag': {'wstop': array(['CD', 'NNP', 'VBG', 'NN', 'NNS', 'JJ'], dtype=object), 'wostop': array(['CD', '$'], dtype=object)}, 'dep': {'wstop': array(['pobj', 'compound', 'npadvmod', 'amod', 'nummod'], dtype=object), 'wostop': array(['pcomp', 'quantmod', 'nummod', 'nsubjpass'], dtype=object)}}


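Function for creating stylometric (`pos`, `tag`, `dep`) features.

- `feat_type` is the token attribute to read, given without the trailing underscore (e.g. `pos` reads `token.pos_`).
- `sent` is the sequence of tokens to process.
- `selection` is a dict mapping each selected tag to its position in the feature vector.
- `is_one_hot` switches between one-hot encoding and counts.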

In [29]:
def create_stylo_feat(feat_type, sent, selection, is_one_hot, n_bits=6):
    """Build one stylometric feature vector for a sentence.

    Args:
        feat_type: Token attribute name without the trailing underscore
            (e.g. 'pos' reads ``token.pos_``).
        sent: Iterable of tokens, each exposing the ``<feat_type>_`` attribute.
        selection: Dict mapping each selected attribute value to its index in
            the feature vector; values not in the dict are ignored.
        is_one_hot: If True, mark presence with 0/1 ints; if False, count
            occurrences and encode each count as a binary string.
        n_bits: Minimum width of each binary-encoded count (zero-padded).
            Defaults to 6, the width that was previously hard-coded.

    Returns:
        A list with ``len(selection)`` entries: ints (0 or 1) when
        ``is_one_hot`` is True, otherwise zero-padded binary strings.
        Counts of ``2 ** n_bits`` or more are not truncated, so they produce
        strings longer than ``n_bits``.

    Raises:
        AttributeError: If a token has no ``<feat_type>_`` attribute.
    """
    # feat type with underscore to match token attributes
    attr_name = f'{feat_type}_'
    feature = [0] * len(selection)

    for token in sent:
        value = getattr(token, attr_name)
        if value not in selection:
            continue

        slot = selection[value]
        if is_one_hot:
            feature[slot] = 1
        else:
            feature[slot] += 1

    if is_one_hot:
        return feature

    # encode counts to zero-padded binary strings
    return [format(count, f'0{n_bits}b') for count in feature]

Prepare path for features.

In [10]:
# Directory where the generated feature files are written.
features_path = p.join(PROC_DATA_DIR_PATH, 'features')
# makedirs with exist_ok=True creates missing parent directories as well and
# does nothing when the directory already exists, so no separate existence
# check is needed.
os.makedirs(features_path, exist_ok=True)

Initialize features_df and prepare feature selection map used in creating stylometric features.

In [21]:
# Parsed documents, one per row of `dev`.
spacy_col = dev['spacy'].values
# Start the feature frame from the 'id' column only.
features_df = dev.loc[:, ['id']]

# For every feature type and stop type, map each selected value to its
# position in the feature vector.
selected_features_map = {
    feat_type: {
        stop_type: {value: position for position, value in enumerate(selection)}
        for stop_type, selection in by_stop_type.items()
    }
    for feat_type, by_stop_type in selected_features.items()
}

Create and save stylometric features.

In [30]:
# Add two columns per (feature type, stop type) pair: a one-hot vector and a
# binary-encoded count vector, each computed per parsed document.
for feat_type, stop_maps in selected_features_map.items():
    for stop_type, selection_map in stop_maps.items():
        col_prefix = f'{feat_type}_{stop_type}'

        # one-hot
        features_df[f'{col_prefix}_one_hot'] = [
            create_stylo_feat(feat_type, sent, selection_map, True)
            for sent in spacy_col
        ]
        # counts
        features_df[f'{col_prefix}_count'] = [
            create_stylo_feat(feat_type, sent, selection_map, False)
            for sent in spacy_col
        ]

# TODO: decide whether tsv or pickle is better
tsv_path = p.join(features_path, 'stylometric_features.tsv')
pkl_path = p.join(features_path, 'stylometric_features.pkl')
features_df.to_csv(tsv_path, sep='\t', index=False)
features_df.to_pickle(pkl_path)

In [16]:
# Scratch cell: commented-out experiments for inspecting spaCy token
# attributes; nothing in this cell runs.
# NOTE(review): the last line is incomplete (`i.` has no attribute name) and
# would not parse if uncommented.
# temp = dev.loc[0, 'content']
# x = spacy(temp)
# temp = dev[['id', 'content']]
# temp['spacied'] = temp['content'].apply(lambda x: spacy(x))
# # temp = ['tag_ | pos_ | dep_ | lemma_ | norm_']
# # for t in x:
# #     temp.append(f'{t.tag_} | {t.pos_} | {t.dep_} | {t.lemma_} | {t.norm_}')
# print(temp.loc[0, 'spacied'][0].tag_)
# [(i.text, i.pos_, i.) for i in x]