# Notebook for extracting features for authorship attribution.
In this notebook, all authorship attribution features are collected and saved after extraction. Make sure you define each feature extraction in its own function so that it can easily be reused.

Author: lkt259@alumni.ku.dk

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer

#### Sentence length
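A simple feature: the number of sentences in a document, plus the mean, standard deviation, and median sentence length, measured in both characters and words.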

In [2]:
# Sample paragraph used to smoke-test the feature extractors below.
# The backslash in "\\cite" is escaped explicitly: a bare "\c" is an
# unrecognized escape sequence, which newer Python versions warn about.
string = "This list has overlapping features with content features. For example, word n-grams will capture the content of the text along with stylometric tendencies. Content features consist of word frequencies, word and character n-grams, hapax legomena etc. This overlap is not of concern, however, as Sari et al. \\cite{Sari2018} show, using content features is beneficial when performing authorship attribution of news articles because journalists often have certain topics they prefer writing about. They argue that using only stylometric features is beneficial when attributing authors to texts of the same topic or genre, e.g. law text or movie reviews."

In [32]:
def split_sentences(text):
    '''Returns an array with text split into sentences.

    The text is split at a single whitespace character that directly follows
    a sentence terminator ('.', '?' or '!'). Two negative lookbehinds keep
    common abbreviations together: dotted forms such as "e.g." and
    capitalised two-letter forms such as "Mr." do not end a sentence.

    Parameters
    ----------
    text : str
        The document to split.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype str with one sentence per element. The terminator
        stays attached to its sentence. An empty text yields a single empty
        string.
    '''
    # '!' is part of the terminator set so that exclamations end a sentence
    # just like '.' and '?' do.
    return np.array(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text), dtype=str)

def remove_dots(word):
    '''Returns word with every , . : ! ? and ; character removed'''
    punctuation = re.compile(r',|\.|:|!|\?|;')
    return punctuation.sub('', word)

def split_words(text):
    '''Returns an array with text split into words.

    The text is split on whitespace and the punctuation handled by
    remove_dots is stripped from every token.

    Parameters
    ----------
    text : str
        The text to tokenise.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype str with one word per element.
    '''
    stripped = (remove_dots(token) for token in text.split())
    # Tokens that consisted only of punctuation (e.g. "...") are empty after
    # stripping; drop them so they are not counted as zero-length words.
    words = [token for token in stripped if token]
    # Explicit dtype keeps the result a string array even when it is empty
    # (numpy would otherwise infer float64 for an empty list).
    return np.array(words, dtype=str)

def get_sentence_lengths(text):
    '''Returns dictionary with sentence length in chars and words.

    Parameters
    ----------
    text : str
        The document to analyse.

    Returns
    -------
    dict
        'chars'     : numpy int array, number of characters per sentence
        'words'     : list of int, number of words per sentence
        'num_sents' : int, number of sentences
    '''
    sentences = split_sentences(text)
    num_sentences = len(sentences)
    num_chars = np.array([len(x) for x in sentences], dtype=int)
    # Count words in the sentences of `text` itself. Reusing the sentences
    # split above (rather than re-splitting some other string) keeps
    # 'words' aligned one-to-one with 'chars'.
    num_words = [split_words(x).size for x in sentences]
    return {'chars' : num_chars, 'words' : num_words, 'num_sents' : num_sentences}

def get_sentence_length_stats(text):
    '''Summarises the sentence lengths of text.

    Returns a dictionary holding the number of sentences together with the
    mean, standard deviation and median sentence length, once measured in
    characters and once in words.
    '''
    lengths = get_sentence_lengths(text)
    per_sentence_chars = lengths['chars']
    per_sentence_words = lengths['words']

    stats = {}
    stats['number_of_sentences'] = lengths['num_sents']

    # Character-based statistics
    stats['avg_sent_len_chars'] = np.mean(per_sentence_chars)
    stats['std_sent_len_chars'] = np.std(per_sentence_chars)
    stats['med_sent_len_chars'] = np.median(per_sentence_chars)

    # Word-based statistics
    stats['avg_sent_len_words'] = np.mean(per_sentence_words)
    stats['std_sent_len_words'] = np.std(per_sentence_words)
    stats['med_sent_len_words'] = np.median(per_sentence_words)
    return stats

# Smoke test: run the sentence statistics on the sample paragraph `string`.
# The call is the last expression of the cell, so the returned dictionary is
# displayed as the cell output.
print("Stats for our test document")
get_sentence_length_stats(string)

Stats for our test document


{'number_of_sentences': 6,
 'avg_sent_len_chars': 107.33333333333333,
 'std_sent_len_chars': 48.65410796862093,
 'med_sent_len_chars': 95.0,
 'avg_sent_len_words': 16.166666666666668,
 'std_sent_len_words': 6.618576550554926,
 'med_sent_len_words': 14.0}

#### Word length
The number of words in the entire text, plus the mean, standard deviation, and median word length in characters.
Like sentence length, this is a very simple feature.

In [40]:
def get_word_lengths(split_text):
    '''Returns an integer array with the character count of each word in split_text'''
    lengths = list(map(len, split_text))
    return np.array(lengths, dtype=int)

def get_word_length_stats(text):
    '''Returns the word count and the mean, std and median word length (in chars) of text'''
    # Tokenise once and reuse the result for both the count and the lengths.
    words = split_words(text)
    lengths = get_word_lengths(words)

    stats = {}
    stats['number_of_words'] = len(words)
    stats['avg_word_len_chars'] = np.mean(lengths)
    stats['std_word_len_chars'] = np.std(lengths)
    stats['med_word_len_chars'] = np.median(lengths)
    return stats

# Smoke test: run the word statistics on the sample paragraph `string`.
# The call is the last expression of the cell, so the returned dictionary is
# displayed as the cell output.
print("Test word lengths")
get_word_length_stats(string)

Test word lengths


{'number_of_words': 97,
 'avg_word_len_chars': 5.546391752577319,
 'std_word_len_chars': 2.853901719013402,
 'med_word_len_chars': 5.0}

In [None]:
# split_words requires the text to tokenise; calling it without an argument
# raises a TypeError. Preview the first ten tokens of the sample paragraph
# instead of dumping the whole array.
split_words(string)[:10]

In [None]:
# Scratch check: '.' is not an alphanumeric character, so this evaluates to False.
'.'.isalnum()