In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re


In [8]:
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hadrienstrichard/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
import spacy
nlp_fr = spacy.load("fr_core_news_sm") 

In [5]:
texts = pd.read_parquet('../Data/excerpts_df.parquet')

texts.head()

Unnamed: 0,Author,Title,URL,Excerpt_ID,Excerpt_Text,Cleaned_Text,Tokens,Tokenized_Text,of_which_generated
0,Charles Nodier,Smarra ou les démons de la nuit: Songes romant...,https://www.gutenberg.org/ebooks/18083,18083_1,n'est souvent déterminée que par un mot. En ce...,n'est souvent déterminée que par un mot. en ce...,"[n'est, souvent, déterminée, que, par, un, mot...",,
1,Charles Nodier,Smarra ou les démons de la nuit: Songes romant...,https://www.gutenberg.org/ebooks/18083,18083_2,Le reste ne me regarde point. J'ai dit de qui ...,le reste ne me regarde point. j'ai dit de qui ...,"[le, reste, ne, me, regarde, point, ., j'ai, d...",,
2,Charles Nodier,Smarra ou les démons de la nuit: Songes romant...,https://www.gutenberg.org/ebooks/18083,18083_3,"Les sylphes, tout étourdis du bruit de la veil...","les sylphes, tout étourdis du bruit de la veil...","[les, sylphes, ,, tout, étourdis, du, bruit, d...",,
3,Charles Nodier,Smarra ou les démons de la nuit: Songes romant...,https://www.gutenberg.org/ebooks/18083,18083_4,"À peine mes yeux sont fermés, à peine cesse la...","à peine mes yeux sont fermés, à peine cesse la...","[à, peine, mes, yeux, sont, fermés, ,, à, pein...",,
4,Charles Nodier,Smarra ou les démons de la nuit: Songes romant...,https://www.gutenberg.org/ebooks/18083,18083_5,"C'est en vain que le jour s'éteindrait, tant q...","c'est en vain que le jour s'éteindrait, tant q...","[c'est, en, vain, que, le, jour, s'éteindrait,...",,


In [6]:
texts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14680 entries, 0 to 14679
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Author              14680 non-null  object
 1   Title               14680 non-null  object
 2   URL                 14680 non-null  object
 3   Excerpt_ID          14680 non-null  object
 4   Excerpt_Text        14680 non-null  object
 5   Cleaned_Text        14680 non-null  object
 6   Tokens              14680 non-null  object
 7   Tokenized_Text      55 non-null     object
 8   of_which_generated  55 non-null     object
dtypes: object(9)
memory usage: 1.0+ MB


In [9]:
# Load French function words (common stopwords in French) as our function word list.
french_function_words = set(stopwords.words('french'))
print(f"Number of French function words: {len(french_function_words)}")
# Optionally, inspect a few function words
print("Sample function words:", list(french_function_words)[:10])

Number of French function words: 157
Sample function words: ['s', 'du', 'ayante', 'ayant', 'eurent', 'étés', 'aies', 'serez', 'l', 'avez']


In [12]:
# Define a set of punctuation marks we will consider for certain features.
punct_set = {'.', ',', ';', '!', '?', ':'}

## Function word frequencies

In [14]:
# Compute total number of words (excluding punctuation) for each text, to use in normalization
texts['WordCount'] = texts['Tokens'].apply(lambda tokens: sum(1 for t in tokens if t not in punct_set))

# Add one feature column for each function word in the list
for fw in french_function_words:
    col_name = f"fw_{fw}"
    # Normalized frequency: count of fw divided by number of words (avoid division by zero)
    texts[col_name] = texts.apply(
        lambda row: row['Tokens'].count(fw) / row['WordCount'] if row['WordCount'] > 0 else 0.0,
        axis=1
    )

# Example: check a few function word feature columns for the first text
print(texts.loc[0, ['Author'] + [f"fw_{w}" for w in list(french_function_words)[:5]]])


Author       Charles Nodier
fw_s                    0.0
fw_du              0.012085
fw_ayante               0.0
fw_ayant                0.0
fw_eurent               0.0
Name: 0, dtype: object


## Pos tagging

In [None]:
# Initialize lists to collect POS frequency features for each text
pos_noun_freq = []
pos_verb_freq = []
pos_adj_freq = []

for tokens in texts['Tokens']:
    # Join tokens back into a text string for POS tagging
    doc = nlp_fr(" ".join(tokens))
    # Count total number of word tokens (non-punctuation) in this text
    total_words = sum(1 for token in doc if token.pos_ != "PUNCT")
    # Count specific POS categories
    noun_count = sum(1 for token in doc if token.pos_ in ["NOUN", "PROPN"])  # treat proper nouns as nouns
    verb_count = sum(1 for token in doc if token.pos_ == "VERB")
    adj_count  = sum(1 for token in doc if token.pos_ == "ADJ")
    # Compute relative frequencies (avoid division by zero)
    if total_words > 0:
        pos_noun_freq.append(noun_count / total_words)
        pos_verb_freq.append(verb_count / total_words)
        pos_adj_freq.append(adj_count / total_words)
    else:
        pos_noun_freq.append(0.0)
        pos_verb_freq.append(0.0)
        pos_adj_freq.append(0.0)

# Add POS distribution features to the DataFrame
texts['POS_NOUN'] = pos_noun_freq
texts['POS_VERB'] = pos_verb_freq
texts['POS_ADJ']  = pos_adj_freq

# Example: view POS distribution for first few texts
print(texts.loc[:4, ['Author', 'POS_NOUN', 'POS_VERB', 'POS_ADJ']])


## Lexical features

In [None]:
def compute_lexical_metrics(token_list):
    # Remove punctuation and non-word tokens from the list
    words = []
    for token in token_list:
        if token in punct_set:
            continue  # skip punctuation tokens
        # Remove any punctuation attached to the token (e.g., apostrophes or hyphens) for word length calculation
        cleaned_token = re.sub(r'[\W_]+', '', token, flags=re.UNICODE)  # keep only alphanumeric characters
        if cleaned_token == "" or cleaned_token.isdigit():
            continue  # skip if token is empty or purely numeric after cleaning
        words.append(cleaned_token)
    if len(words) == 0:
        # If no valid word tokens, return zeros
        return pd.Series({"TTR": 0.0, "Hapax_Ratio": 0.0, "AvgWordLen": 0.0})
    total_words = len(words)
    unique_words = set(words)
    # Type-Token Ratio
    ttr = len(unique_words) / total_words
    # Hapax Legomena Ratio
    word_freqs = Counter(words)
    hapax_count = sum(1 for count in word_freqs.values() if count == 1)
    hapax_ratio = hapax_count / total_words
    # Average word length (in characters)
    total_chars = sum(len(w) for w in words)
    avg_word_len = total_chars / total_words
    return pd.Series({"TTR": ttr, "Hapax_Ratio": hapax_ratio, "AvgWordLen": avg_word_len})

# Apply the lexical metrics function to each text
lexical_df = texts['Tokens'].apply(compute_lexical_metrics)
# Merge the resulting metrics columns into the main DataFrame
texts = pd.concat([texts, lexical_df], axis=1)

# Example: show lexical richness metrics for a text
print(texts.loc[0, ['author', 'WordCount', 'TTR', 'Hapax_Ratio', 'AvgWordLen']])


## Punctuation

In [None]:
# Define a mapping of punctuation symbols to descriptive column names
punct_marks = {'.': 'Period', ',': 'Comma', ';': 'Semicolon', '?': 'QuestionMark', '!': 'ExclamationMark', ':': 'Colon'}

for symbol, name in punct_marks.items():
    texts[f"{name}_Count"] = texts['Tokens'].apply(lambda tokens: tokens.count(symbol))

# Example: show punctuation counts for the first text
print(texts.loc[0, ['author'] + [f"{name}_Count" for name in punct_marks.values()]])


## Sentence metrics

In [None]:
def compute_sentence_metrics(token_list):
    # Count sentence-ending punctuation marks
    sentence_enders = {'.', '!', '?'}
    sent_count = sum(token_list.count(sym) for sym in sentence_enders)
    if sent_count == 0:
        # If no explicit end markers but there are words, assume at least one sentence
        sent_count = 1 if any(t not in punct_set for t in token_list) else 0
    # Count words (exclude all punctuation tokens)
    word_count = sum(1 for t in token_list if t not in punct_set)
    avg_sent_len = word_count / sent_count if sent_count > 0 else 0.0
    return pd.Series({"Sentence_Count": sent_count, "AvgSentenceLength": avg_sent_len})

sentence_df = texts['Tokens'].apply(compute_sentence_metrics)
texts = pd.concat([texts, sentence_df], axis=1)

# Example: show sentence metrics for first few texts
print(texts.loc[:4, ['author', 'Sentence_Count', 'AvgSentenceLength']])


## Character Metrics

In [None]:
def reconstruct_text(token_list):
    """Reconstruct text string from tokens, inserting spaces appropriately."""
    text = ""
    for token in token_list:
        if token in punct_set:
            # Attach punctuation directly to the text (no preceding space)
            text += token
        else:
            # If not the first token, add a space before adding the word
            if text:
                text += " "
            text += token
    return text

# List to collect n-gram frequency dictionaries for each text
char_ngram_features = []

for token_list in texts['Tokens']:
    text = reconstruct_text(token_list)
    # Initialize a dict for this text's n-grams
    ngram_counts = {}
    # Character bigrams (2-grams)
    for i in range(len(text) - 1):
        bigram = text[i:i+2]
        # Use an underscore to represent spaces in feature names for clarity
        feat_name = "bi_" + bigram.replace(" ", "_")
        ngram_counts[feat_name] = ngram_counts.get(feat_name, 0) + 1
    # Character trigrams (3-grams)
    for i in range(len(text) - 2):
        trigram = text[i:i+3]
        feat_name = "tri_" + trigram.replace(" ", "_")
        ngram_counts[feat_name] = ngram_counts.get(feat_name, 0) + 1
    char_ngram_features.append(ngram_counts)

# Convert list of dicts to DataFrame (each n-gram becomes a column, fill missing with 0)
char_ngram_df = pd.DataFrame(char_ngram_features).fillna(0).astype(int)
# Merge the n-gram features into the main DataFrame
texts = pd.concat([texts, char_ngram_df], axis=1)

# Example: number of character n-gram feature columns
print(f"Total character n-gram features: {char_ngram_df.shape[1]}")
# Show a small sample of character n-gram features for the first text
print(texts.loc[0, [col for col in texts.columns if col.startswith('bi_')][:5]])


## Final DF

In [None]:
# Drop the Tokens column and the intermediate WordCount column from the DataFrame
feature_columns = [col for col in texts.columns if col not in ('Tokens', 'WordCount')]
features_df = texts[feature_columns].copy()

# Show the final feature DataFrame shape and columns
print("Final feature DataFrame shape:", features_df.shape)
print("Sample columns:", list(features_df.columns[:10]))
# Display the first few rows of the feature DataFrame
features_df.head()
