A couple different functions to analyse sentence structure

In [16]:
#  A high TTR indicates a high degree of lexical variation while a low TTR indicates the opposite.
TTR_LIMITS = [0.3, 0.7]

def textToTokenRatio(content: str):
    """Equal to number of unique words divided by total number of words.
    
    0  indicates large redundancy, 1 indicates low redundancy of words
    """
    ttr = len(set(content.split())) / len(content.split())

    ttr_score = (ttr -TTR_LIMITS[0] / TTR_LIMITS[1] - TTR_LIMITS[0])
    return min(max(ttr_score, 0), 1)

import re

def text_top_token_ratio(text):
    # Remove punctuation and split the text into a list of words
    words = re.findall(r'\w+', text.lower())

    # Count the number of occurrences of each word
    word_counts = {}
    for word in words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

    # Sort the words by frequency in descending order
    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Calculate the Text Top Token Ratio
    top_tokens = sum(count for word, count in sorted_words[:10])
    total_tokens = sum(count for word, count in word_counts.items())
    ttr = top_tokens / total_tokens if total_tokens > 0 else 0

    return ttr

In [17]:
list = []
sentence = """is	was	are	be	have
had	were	can	said	use
do	will	would	make	like
has	look	write	go	see
could	been	call	am	find
did	get	come	made	may
take	know	live	give	think
say	help	tell	follow	came
want	show	set	put	does
must	ask	went	read	need
move	try	change	play	spell
found	study	learn	should	add
keep	start	thought	saw	turn
might	close	seem	open	begin
got	run	walk	began	grow
took	carry	hear	stop	miss
eat	watch	let	cut	talk
being	leave"""

list.append(textToTokenRatio(sentence))
list.append(text_top_token_ratio(sentence))

print(list)

[0.2714285714285714, 0.11494252873563218]


In [18]:
## An calculate the average word length

WORDLENGTH = [4.5, 5.5]

def evaluate_word_length_text(content: str):
    """Evaluates the average length of words in the content.
    
    Score betwen 0 and 1, 0 = short words, 1 = longer words
    """
    word_length = sum(len(word) for word in content.split()) / len(content.split())

    score = (word_length - WORDLENGTH[0]) / (WORDLENGTH[1] - WORDLENGTH[0])
    return min(max(score, 0), 1)

In [23]:
list.append(evaluate_word_length_text("Utterance of a loquacious vocabulary. Exceedingly sophsiticated terminological apparatus."))
list.append(evaluate_word_length_text("small word not much big word no"))

print(list)

[0.2714285714285714, 0.11494252873563218, 1, 0, 1, 0]
