Text descriptives such as word complexity, readability, etc.

In [1]:
import textdescriptives as td
import spacy



In [2]:
# Load the spaCy pipeline named 'en_core_web_sm'; every later cell uses `nlp`.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Add the "textdescriptives/all" component to the pipeline. Left as a bare
# expression so the notebook displays the returned component object.
nlp.add_pipe("textdescriptives/all")

<textdescriptives.load_components.TextDescriptives at 0x20d389b58d0>

In [18]:
# Short sample passage (note the embedded newline) that is run through the
# pipeline; the resulting `doc` is what the metric cells below inspect.
sample_text = "Amazon Web Services (AWS) Lambda is a usage-based computing infrastructure service that can execute\nPython 3 code"
doc = nlp(sample_text)

In [19]:
# Readability scores for the document.
# Overview of the available attributes:
# https://hlasse.github.io/TextDescriptives/usingthepackage.html#available-attributes
# Details on the readability metrics:
# https://hlasse.github.io/TextDescriptives/readability.html
# NOTE(review): 'smog' is nan in the recorded output -- presumably the sample
# text is too short for that metric; confirm against the readability docs.
doc._.readability

{'flesch_reading_ease': 58.42368421052632,
 'flesch_kincaid_grade': 9.830526315789477,
 'smog': nan,
 'gunning_fog': 11.810526315789474,
 'automated_readability_index': 11.619999999999997,
 'coleman_liau_index': 12.042105263157893,
 'lix': 45.315789473684205,
 'rix': 5.0}

In [20]:
# Token length statistics for the document (mean, median and std).
doc._.token_length

{'token_length_mean': 5.0,
 'token_length_median': 5.0,
 'token_length_std': 3.1455900626281967}

In [21]:
# Syllables-per-token statistics for the document (mean, median and std).
doc._.syllables

{'syllables_per_token_mean': 1.5263157894736843,
 'syllables_per_token_median': 1.0,
 'syllables_per_token_std': 0.8187552203212656}

In [22]:
# The coherence component calculates the coherence of the document, based on
# word-embedding cosine similarity between sentences.
# See https://www.sciencedirect.com/science/article/pii/S0920996422002742?via%3Dihub and https://www.nature.com/articles/npjschz201530
# NOTE(review): both values are nan in the recorded output -- presumably
# because the sample text is a single sentence, so there are no sentence pairs
# to compare; confirm.
doc._.coherence

{'first_order_coherence': nan, 'second_order_coherence': nan}

In [23]:
# Quality component docs: https://hlasse.github.io/TextDescriptives/quality.html
# Gauges text quality using heuristics such as repetition, the proportion of
# bullet points, etc.
# Heuristics are taken from https://arxiv.org/abs/1910.10683 and https://arxiv.org/abs/2112.11446
# Could be useful for deriving filtering metrics; for instance, it may make it
# easier to identify whether a text contains code.
doc._.quality

QualityOutput(
	passed=True, 
	n_stop_words=ThresholdsOutput(value=4.0, passed=True, threshold=(2.0, None)), 
	alpha_ratio=ThresholdsOutput(value=0.77, passed=True, threshold=(0.7, None)), 
	mean_word_length=ThresholdsOutput(value=4.45, passed=True, threshold=(3.0, 10.0)), 
	doc_length=ThresholdsOutput(value=22.0, passed=True, threshold=(10.0, 100000.0)), 
	symbol_to_word_ratio={'#': ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.1))}, 
	proportion_ellipsis=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.3)), 
	proportion_bullet_points=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.8)), 
	contains={'lorem ipsum': ThresholdsOutput(value=0.0, passed=True, threshold=False)}, 
	duplicate_line_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), 
	duplicate_paragraph_chr_fraction=ThresholdsOutput(value=0.0, passed=True, threshold=(None, 0.2)), 
	duplicate_ngram_chr_fraction={'5': ThresholdsOutput(value=0.0, passed=True, threshol

In [24]:
# Information theory docs: https://hlasse.github.io/TextDescriptives/information_theory.html
# Calculates measures used to describe the complexity of a text: the higher
# the entropy, the more complex the text.
# One could imagine filtering text on per-word perplexity, on the assumption
# that highly surprising text is in fact made up of non-coherent pieces.
doc._.information_theory

{'entropy': 0.2897752284598386,
 'perplexity': 1.336127130917804,
 'per_word_perplexity': 0.060733051405354725}