# Text Segments Processor
 *  Jacob Yousif

## Importing the libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%%capture
!pip install --upgrade textstat

In [3]:
import pandas as pd
import nltk
import textstat
import textblob
import scipy.stats
from collections import Counter
import math
import string
from textstat.textstat import textstatistics
import collections as coll
import numpy as np
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

# Fetch the NLTK resources used below: the stopword list read by
# count_functional_words and the Punkt models behind word/sentence tokenizing.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt'; fetching both keeps the tokenizers working on either version.
nltk.download('punkt_tab', quiet=True)

True

## Loading the data

In [4]:
# Read the segmented corpus into memory.
# NOTE(review): this reads from 'datasets/' while the final cell writes to
# 'Datasets/' — on a case-sensitive filesystem these are different folders;
# confirm which spelling is intended.
file_path = 'datasets/SegmentedLiterature.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# True when at least one cell anywhere in the input frame is missing.
has_nan = df.isna().to_numpy().any()
print(f"DataFrame has NaN values: {has_nan}")

DataFrame has NaN values: False


## Predefined functions

In [6]:
def tokenize_words(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

In [7]:
def tokenize_sents(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The string to segment.

    Returns:
        The list of sentence strings produced by ``nltk.sent_tokenize``.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

In [8]:
def flesch_reading(text):
    """Return textstat's Flesch Reading Ease score for ``text``.

    Args:
        text: The passage to score.

    Returns:
        The value computed by ``textstatistics().flesch_reading_ease``.
    """
    analyzer = textstatistics()
    return analyzer.flesch_reading_ease(text)

In [9]:
def grade_level(text):
    """Return textstat's Flesch-Kincaid grade level for ``text``.

    Args:
        text: The passage to score.

    Returns:
        The value computed by ``textstatistics().flesch_kincaid_grade``.
    """
    analyzer = textstatistics()
    return analyzer.flesch_kincaid_grade(text)

In [10]:
def gunning_fog(text):
    """Return textstat's Gunning Fog index for ``text``.

    Args:
        text: The passage to score.

    Returns:
        The value computed by ``textstatistics().gunning_fog``.
    """
    analyzer = textstatistics()
    return analyzer.gunning_fog(text)

In [11]:
def average_word_length(text):
    """Return the mean number of characters per token in ``text``.

    Tokens are whatever ``tokenize_words`` returns; every token contributes
    its full length to the average.

    Args:
        text: The passage to measure.

    Returns:
        Total token characters divided by the token count, or ``0.0`` when
        the text yields no tokens (instead of raising ZeroDivisionError).
    """
    words = tokenize_words(text)
    if not words:
        return 0.0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

In [12]:
def average_sentence_length_by_word(text):
    """Return the mean number of tokens per sentence in ``text``.

    Args:
        text: The passage to measure.

    Returns:
        Token count (from ``tokenize_words``) divided by sentence count
        (from ``tokenize_sents``), or ``0.0`` when no sentence is found
        (instead of raising ZeroDivisionError).
    """
    sentences = tokenize_sents(text)
    if not sentences:
        return 0.0
    words = tokenize_words(text)
    return len(words) / len(sentences)

In [13]:
def average_syllables_per_word(text):
    """Return the mean number of syllables per token in ``text``.

    The syllable total comes from ``textstat.syllable_count`` applied to the
    whole text; the divisor is the token count from ``tokenize_words``.

    Args:
        text: The passage to measure.

    Returns:
        Syllables divided by tokens, or ``0.0`` when the text yields no
        tokens (instead of raising ZeroDivisionError).
    """
    words = tokenize_words(text)
    if not words:
        return 0.0
    return textstat.syllable_count(text) / len(words)

In [14]:
def count_punctuation(text):
    """Return the share of characters in ``text`` that are punctuation marks.

    Only these marks are counted: , . ' ! " ; ? :

    Args:
        text: The passage to measure.

    Returns:
        Number of tracked marks divided by the length of ``text``. The
        divisor is clamped to at least 1, so an empty string gives 0.
    """
    tracked_marks = frozenset({',', '.', "'", '!', '"', ';', '?', ':'})
    hits = 0
    for symbol in text:
        if symbol in tracked_marks:
            hits += 1
    # Same effect as max(1, len(text)): never divide by zero.
    divisor = len(text) if len(text) > 1 else 1
    return hits / divisor

In [15]:
def count_functional_words(text):
    """Return the fraction of tokens in ``text`` that are English stopwords.

    Tokens are lower-cased before being compared with NLTK's English
    stopword list.

    Args:
        text: The passage to measure.

    Returns:
        Stopword tokens divided by all tokens, or ``0.0`` when the text
        yields no tokens (instead of raising ZeroDivisionError).
    """
    words = tokenize_words(text)
    if not words:
        return 0.0
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once per token.
    functional_words = set(nltk.corpus.stopwords.words('english'))
    num_functional_words = sum(1 for word in words if word.lower() in functional_words)
    return num_functional_words / len(words)

In [16]:
def dale_chall_readability(text):
    """Return textstat's Dale-Chall readability score for ``text``.

    Args:
        text: The passage to score.

    Returns:
        The value computed by ``textstat.dale_chall_readability_score``.
    """
    score = textstat.dale_chall_readability_score(text)
    return score

In [17]:
def simpsons_index(text):
    """Return one minus the sum of squared relative token frequencies.

    Args:
        text: The passage to measure.

    Returns:
        ``1 - sum(p_i ** 2)`` where ``p_i`` is each distinct token's share of
        all tokens returned by ``tokenize_words``.
    """
    tokens = tokenize_words(text)
    total = len(tokens)
    squared_share_sum = 0
    for occurrences in Counter(tokens).values():
        share = occurrences / total
        squared_share_sum += share ** 2
    return 1 - squared_share_sum

In [18]:
def shannon_entropy(text):
    """Return the base-2 entropy of the token frequency distribution.

    Args:
        text: The passage to measure.

    Returns:
        ``scipy.stats.entropy`` of the relative token frequencies, in bits.
    """
    tokens = tokenize_words(text)
    total = len(tokens)
    distribution = []
    for occurrences in Counter(tokens).values():
        distribution.append(occurrences / total)
    return scipy.stats.entropy(distribution, base=2)

In [19]:
def yules_characteristic_k(text):
    """Return Yule's characteristic K for the tokens of ``text``.

    K = 10^4 * (sum(f_i ** 2) - N) / N ** 2, where ``f_i`` is the frequency
    of each distinct token and ``N`` the total token count. Because the
    frequencies sum to ``N``, the numerator equals ``sum(f_i * (f_i - 1))``.

    The previous implementation added an extra ``1 / N`` term on top of that
    numerator, which cancelled the ``- N`` and produced
    ``10^4 * sum(f_i ** 2) / N ** 2`` instead of K.

    Args:
        text: The passage to measure.

    Returns:
        Yule's K as a float, or ``0.0`` when the text yields no tokens
        (instead of raising ZeroDivisionError).
    """
    words = tokenize_words(text)
    N = len(words)
    if N == 0:
        return 0.0
    word_freq = Counter(words)
    # sum f*(f-1) == sum f^2 - N, i.e. the numerator of K.
    M2 = sum(freq * (freq - 1) for freq in word_freq.values())
    return 10**4 * M2 / (N ** 2)

In [20]:
def brunets_measure_w(text):
    """Return Brunet's W for the tokens of ``text``.

    W = N ** (V ** -0.172), where ``N`` is the token count and ``V`` the
    number of distinct tokens. The previous implementation multiplied ``N``
    by ``V ** -0.172`` instead of raising ``N`` to that power.

    Args:
        text: The passage to measure.

    Returns:
        Brunet's W as a float, or ``0.0`` when the text yields no tokens
        (``0 ** -0.172`` would otherwise raise ZeroDivisionError).
    """
    words = tokenize_words(text)
    N = len(words)
    if N == 0:
        return 0.0
    V = len(set(words))
    return N ** (V ** -0.172)

In [21]:
def type_token_ratio(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    Args:
        text: The passage to measure.

    Returns:
        ``V / N`` where ``V`` is the number of distinct tokens and ``N`` the
        token count, or ``0.0`` when the text yields no tokens (instead of
        raising ZeroDivisionError).
    """
    words = tokenize_words(text)
    N = len(words)
    if N == 0:
        return 0.0
    V = len(set(words))
    return V / N

In [22]:
def hapax_dis_legomena(text):
    """Count tokens occurring exactly twice and relate them to hapaxes.

    Args:
        text: The passage to measure.

    Returns:
        A ``(s, h)`` pair: ``h`` is the number of distinct tokens that occur
        exactly twice; ``s`` is ``2 * h`` divided by the number of distinct
        tokens that occur exactly once, or ``0`` when there are none.
    """
    # NOTE(review): the caller stores ``s`` as "SichelesMeasureS"; Sichel's S
    # is usually given as V2 / V (twice-occurring types over all types) —
    # confirm that the ratio to once-occurring types used here is intended.
    once = 0
    twice = 0
    for tally in Counter(tokenize_words(text)).values():
        if tally == 1:
            once += 1
        elif tally == 2:
            twice += 1

    ratio = 2 * twice / once if once != 0 else 0
    return ratio, twice

In [23]:
def hapax_legomena(text):
    """Return Honoré's R and the hapax-legomena ratio for ``text``.

    R = 100 * ln(N) / (1 - V1 / V), where ``N`` is the token count, ``V`` the
    number of distinct tokens and ``V1`` the number of distinct tokens that
    occur exactly once. The previous implementation divided ``V1`` by ``N``
    in the denominator instead of by ``V``.

    Args:
        text: The passage to measure.

    Returns:
        A ``(honore_r, h)`` pair, where ``h`` is ``V1 / N``. Returns
        ``(0, 0)`` when the text yields no tokens. ``honore_r`` is
        ``float('inf')`` when every distinct token occurs exactly once,
        because the denominator of R is then zero.
    """
    words = tokenize_words(text)
    N = len(words)

    if N == 0:
        return 0, 0

    word_freq = Counter(words)
    V = len(word_freq)
    V1 = sum(1 for count in word_freq.values() if count == 1)
    h = V1 / N

    # Integer comparison avoids relying on float equality for the zero
    # denominator case.
    if V1 == V:
        honore_r = float('inf')
    else:
        honore_r = (100 * math.log(N)) / (1 - V1 / V)

    return honore_r, h

## The preprocessing

In [24]:
# Output schema, in the order the columns are written to disk.
columns = [
    # Fields copied from the input rows
    "Book",
    "Text",
    "Author",
    "AuthorCode",
    # Length and surface features
    "AverageWordLength",
    "AverageSentenceLength",
    "AverageSyllablePerWord",
    "PunctuationCount",
    "FunctionalWordsCount",
    # Vocabulary richness features
    "TypeTokenRatio",
    "HonoreMeasureR",
    "Hapax",
    "SichelesMeasureS",
    "Dihapax",
    "YulesCharacteristicK",
    "SimpsonsIndex",
    "BrunetsMeasureW",
    "ShannonEntropy",
    # Readability scores
    "FleschReadingEase",
    "FleschKincaidGradeLevel",
    "DaleChallReadability",
    "GunningFog",
]

# Start from an empty frame that already carries the schema.
result = pd.DataFrame(columns=columns)

In [25]:
# Compute every feature for each segment and collect the rows in a plain list.
# The frame is built once after the loop: concatenating inside the loop copied
# the whole accumulated frame on every iteration.
records = []
for _, row in df.iterrows():
    text = row['Text']
    # Each helper returns a pair, so call it once and unpack both values.
    honore_measure_r, hapax = hapax_legomena(text)
    sicheles_measure_s, dihapax = hapax_dis_legomena(text)
    records.append({
        "Book": row['Book'],
        "Text": text,
        "Author": row['Author'],
        "AuthorCode": row['AuthorCode'],
        "AverageWordLength": average_word_length(text),
        "AverageSentenceLength": average_sentence_length_by_word(text),
        "AverageSyllablePerWord": average_syllables_per_word(text),
        "PunctuationCount": count_punctuation(text),
        "FunctionalWordsCount": count_functional_words(text),
        "TypeTokenRatio": type_token_ratio(text),
        "HonoreMeasureR": honore_measure_r,
        "Hapax": hapax,
        "SichelesMeasureS": sicheles_measure_s,
        "Dihapax": dihapax,
        "YulesCharacteristicK": yules_characteristic_k(text),
        "SimpsonsIndex": simpsons_index(text),
        "BrunetsMeasureW": brunets_measure_w(text),
        "ShannonEntropy": shannon_entropy(text),
        "FleschReadingEase": flesch_reading(text),
        "FleschKincaidGradeLevel": grade_level(text),
        "DaleChallReadability": dale_chall_readability(text),
        "GunningFog": gunning_fog(text),
    })

# Passing `columns` keeps the declared column order (and the schema when the
# input has no rows).
result = pd.DataFrame(records, columns=columns)

In [26]:
# Report how many feature rows were produced.
print('The size of the dataset is:', result.shape[0], 'rows')

The size of the dataset is: 106792 rows


In [27]:
# True when at least one cell anywhere in the feature frame is missing.
has_nan = result.isna().to_numpy().any()
print(f"DataFrame has NaN values: {has_nan}")

DataFrame has NaN values: False


In [28]:
# Discard every row that has a missing value in any column.
# NOTE(review): hapax_legomena can return float('inf') for HonoreMeasureR;
# infinities are presumably not treated as missing here and so would be kept —
# confirm whether such rows should also be removed.
result = result.dropna(axis=0, how='any')

In [29]:
# Re-run the missing-value check after dropping incomplete rows.
has_nan = result.isna().to_numpy().any()
print(f"DataFrame has NaN values: {has_nan}")

DataFrame has NaN values: False


In [30]:
# Report the row count again after the missing-value filter.
print('The size of the dataset is:', result.shape[0], 'rows')

The size of the dataset is: 106792 rows


In [31]:
# Persist the feature table without the pandas row index.
# NOTE(review): the input was read from 'datasets/' but this writes to
# 'Datasets/' — on a case-sensitive filesystem these are different folders;
# confirm which spelling is intended.
csv_file_path = 'Datasets/ProcessedSegmentedLiterature.csv'
result.to_csv(path_or_buf=csv_file_path, index=False)