In [None]:
# install relevant packages
!pip install textstat better_profanity

In [None]:
# import relevant packages

import pandas as pd
import numpy as np

import re
import textstat
import nltk
from nltk.tokenize import word_tokenize
from better_profanity import profanity

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# download prerequisites: tokenizer models needed by word_tokenize
# Older NLTK releases load 'punkt'; newer ones load 'punkt_tab' instead.
# Requesting both keeps the notebook runnable on either; an identifier that the
# installed NLTK does not know is reported by the downloader without raising.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# read in data
data = pd.read_csv("/kaggle/input/all-comments-sample/all_comments_sample_137K.csv")

# read_csv parses empty cells (and literal texts such as "NA" or "null") as
# NaN floats. Every text measure below expects a str, so normalise the column
# once here: missing bodies become empty strings instead of crashing later.
data['body'] = data['body'].fillna('').astype(str)

# Compute Measures of Text Complexity

In [None]:
# Surface-level size measures, each computed from the raw comment text.
# Every entry is (target column, textstat counter, extra keyword arguments).
count_specs = [
    ('nr_sentences', textstat.sentence_count, {}),
    ('nr_words', textstat.lexicon_count, {'removepunct': True}),      # punctuation excluded
    ('nr_characters', textstat.char_count, {'ignore_spaces': True}),
    ('nr_letters', textstat.letter_count, {'ignore_spaces': True}),
    ('nr_syllables', textstat.syllable_count, {}),
    ('nr_words_one_syllable', textstat.monosyllabcount, {}),          # monosyllables
    ('nr_words_more_syllables', textstat.polysyllabcount, {}),        # polysyllables
]

# Columns are created in the order listed above.
for column, counter, options in count_specs:
    # Bind counter/options as defaults so each lambda keeps its own pair.
    data[column] = data['body'].apply(
        lambda text, fn=counter, kw=options: fn(text, **kw)
    )

## Semantic Text Complexity

In [None]:
# since some functions will need the tokens, compute tokens
# (lower-cased so that "The" and "the" count as the same type)
data['tokens'] = data['body'].apply(lambda text: word_tokenize(text.lower()))


def _word_tokens(tokens):
    """Return only the tokens that contain at least one letter or digit.

    The tokenizer emits punctuation marks as separate tokens; those are not
    words and must not be counted as such.
    """
    return [tok for tok in tokens if any(ch.isalnum() for ch in tok)]


# word-like tokens per text (kept out of the frame; only the counts are stored)
word_tokens = data['tokens'].apply(_word_tokens)

# obtain number of unique words per text, punctuation tokens excluded
data['nr_unique_words'] = word_tokens.apply(lambda toks: len(set(toks)))

# and compute type-token-ratio
# Numerator and denominator come from the same tokenisation, so the ratio
# stays within [0, 1]; texts without any word token yield NaN (0 / 0).
# NOTE: the column name keeps its original spelling (with a space) so that
# existing consumers of the output file still find it.
data['share_unique words'] = data['nr_unique_words'] / word_tokens.apply(len)

**Detect Swear Words:** Using the `better_profanity` library to check for another dimension of "quality"

In [None]:
def count_swear_words(tokens):
    """Count how many of the given tokens are flagged as profane.

    Each token is checked on its own with ``profanity.contains_profanity``.

    Args:
        tokens: iterable of token strings.

    Returns:
        Number of tokens for which the profanity check is truthy
        (0 for an empty iterable).
    """
    hits = 0
    for token in tokens:
        # the check's result is added as-is (True counts as 1, False as 0)
        hits += profanity.contains_profanity(token)
    return hits

In [None]:
# swear-word tally per reddit post, derived from its token list
data['nr_swear_words'] = data['tokens'].map(count_swear_words)

In [None]:
# swear words relative to the word count of the same post
data['share_swear_words'] = data['nr_swear_words'].div(data['nr_words'])

## Syntactic Text Complexity

**Gunning Fog:** "Returns the FOG index of the given text. This is a grade formula in that a score of 9.3 means that a ninth grader would be able to read the document." (https://pypi.org/project/textstat/, 29.07.2024)


**Fog Index	Reading level by grade**
- 17	College graduate
- 16	College senior
- 15	College junior
- 14	College sophomore
- 13	College freshman
- 12	High school senior
- 11	High school junior
- 10	High school sophomore
- 9	High school freshman
- 8	Eighth grade
- 7	Seventh grade
- 6	Sixth grade

(https://en.wikipedia.org/wiki/Gunning_fog_index, 31.07.2024)

In [None]:
# Gunning Fog index of each comment body
data['Gunning_Fog'] = data['body'].map(textstat.gunning_fog)

In [None]:
# write the enriched frame to disk; the row index is left out of the file
data.to_csv(path_or_buf='all_comments_complexity.csv', index=False)