## Text Preprocessing for Italian Documents

### Import packages

In [None]:
%pip install bs4 nltk seaborn wordcloud autocorrect

In [None]:
import os
import pickle
import pandas as pd
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import itertools
from autocorrect import Speller

# Set a clean visual style
sns.set_theme(style="whitegrid", context="notebook")  # or "talk" for presentations



### Load NLTK resources (from the **resources** folder)

In [None]:
# nltk.download('punkt') # Tokenizer models
# nltk.download('punkt_tab') # Tokenizer models

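This manual setup is needed because of a known issue when loading NLTK's **Italian** resources, so the tokenizer and stopwords are read directly from the local **resources** folder.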

In [None]:
# Locate the bundled resources folder relative to the current working directory
base_dir = os.getcwd()
resources_path = os.path.join(base_dir, "resources")

# Unpickle the Italian tokenizer shipped with the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file here.
tokenizer_file = os.path.join(resources_path, "italian_py3.pickle")
with open(tokenizer_file, 'rb') as f:
    italian_tokenizer = pickle.load(f)

# Read the Italian stopword list: one entry per line, blank lines skipped
stopwords_file = os.path.join(resources_path, "stopwords_it.txt")
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stop_words = {entry for entry in map(str.strip, f) if entry}

### Import dataset from **Hugging Face** (optional)

In [None]:
#pip install datasets

# from datasets import load_dataset

# Load the CHANGE-IT dataset from Hugging Face
# dataset = load_dataset("gsarti/change_it", split="train")

# Convert Hugging Face dataset to Pandas DataFrame
# df = dataset.to_pandas()


### Sample **change-it** public dataset (optional)

In [None]:
# The sample datasets were generated by sampling the change-it datasets (you don't need to run this code)

# SAMPLE_FRAC = 0.01 # 1% sample

# Load datasets
# df_repubblica = pd.read_csv("change-it/change-it.repubblica.train.csv", sep=',')
# df_ilgiornale = pd.read_csv("change-it/change-it.ilgiornale.train.csv", sep=',')

# Extract a 1% sample
# df_repubblica_sample = df_repubblica.sample(frac=SAMPLE_FRAC, random_state=42)
# df_ilgiornale_sample = df_ilgiornale.sample(frac=SAMPLE_FRAC, random_state=42)

# Save the sample
# df_repubblica_sample.to_csv("data/repubblica_sample.csv", index=False)
# df_ilgiornale_sample.to_csv("data/ilgiornale_sample.csv", index=False)

### Load dataset (stored in **data** folder)

In [None]:
# Read the sampled articles and tag every row with its source newspaper
df = pd.read_csv("data/repubblica_sample.csv").assign(newspaper='repubblica')

# List the available columns
print(df.columns)

# Preview the first rows (rendered as the cell output)
df.head()

### 1. Text cleaning

In [None]:
import unicodedata

def apply_text_cleaning(text):
    """
    Clean a raw document and return its lowercase, stopword-free tokens.

    Steps: strip HTML, normalize curly quotes and dashes, drop URLs and
    e-mail addresses, lowercase, replace punctuation with spaces, drop
    digits, tokenize and filter out stopwords.

    Args:
        text (str): Raw text, possibly containing HTML. Non-string values
            (e.g. NaN for an empty CSV cell) are treated as empty text.

    Returns:
        str: Remaining tokens joined by single spaces ('' if nothing is left).
    """
    # Guard against non-string cells, which BeautifulSoup cannot parse
    if not isinstance(text, str):
        return ''

    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Normalize curly quotes and dashes to their ASCII equivalents
    text = text.replace("’", "'").replace("‘", "'") \
               .replace("“", '"').replace("”", '"') \
               .replace("–", "-").replace("—", "-")

    # Optional: normalize Unicode characters to ASCII (e.g., é → e)
    # text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

    # 3. Remove URLs and email addresses
    #    ("http\S+" already covers "https...", so no separate alternative is needed)
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Lowercase
    text = text.lower()

    # 5. Replace punctuation with a space instead of deleting it, so that
    #    elided forms split ("l'italia" -> "l italia") rather than fusing into
    #    a spurious token ("litalia"), and "parola,parola" stays two words.
    #    NOTE(review): leftover fragments such as "l" or "c" are dropped only
    #    if they appear in stop_words - confirm against stopwords_it.txt.
    text = re.sub(r"[^\w\s]", ' ', text)

    # 6. Remove digits
    text = re.sub(r'\d+', '', text)

    # 7. Tokenize and remove stopwords
    #    NOTE(review): punctuation is already gone at this point, so the
    #    tokenizer has no sentence boundaries left to detect; the whitespace
    #    split below does the actual word tokenization.
    sentences = italian_tokenizer.tokenize(text)
    tokens = [word for sent in sentences for word in sent.split()]
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)


### 2. Stemming

In [None]:
# Snowball stemmer for Italian, built once and reused by every call
stemmer = SnowballStemmer("italian")

def apply_stemming(text):
    """Stem each whitespace-separated token of `text` and rejoin them with single spaces."""
    return ' '.join(map(stemmer.stem, text.split()))


### 3. Spell checking

In [None]:
# Initialize the spell checker once for efficiency
spell = Speller(lang='it')

# Define vowels including accented ones
vowels = set('aeiouAEIOUàèéìòùÀÈÉÌÒÙ')

def apply_spell_check(text):
    """
    Collapse exaggerated letter repetitions, then apply spell correction.

    Runs of the same vowel (accented ones included) are reduced to a single
    character and runs of the same consonant to at most two. Runs of
    non-letter characters (digits, punctuation, whitespace) are left
    untouched, so a number such as "1000" is not truncated to "100".

    NOTE(review): legitimate double vowels (e.g. "idee", "zoo") are also
    reduced to one; this relies on the speller to restore them.

    Args:
        text (str): Input text to correct

    Returns:
        str: Corrected text
    """
    pieces = []
    # Walk the text as runs of consecutive identical characters
    for char, group in itertools.groupby(text):
        run = ''.join(group)
        # Only letters are squeezed; every other character keeps its full run
        if char.isalpha():
            run = run[:1] if char in vowels else run[:2]
        pieces.append(run)
    cleaned_text = ''.join(pieces)

    # Apply spell correction
    return spell(cleaned_text)


### Try methods on a sample sentence (playground)

In [None]:
sample_text = "I giornalisti stavano scrivendo articoli mooooooltttttto interessanti sull'inteligensa artificiale."
print(f"Sample text: {sample_text}")

# Run the three stages in sequence; each one consumes the previous output

# Stage 1 - spell checking on the raw sentence
corrected = apply_spell_check(sample_text)
print(f"After spell checking: {corrected}")

# Stage 2 - cleaning of the corrected sentence
cleaned = apply_text_cleaning(corrected)
print(f"After cleaning text: {cleaned}")

# Stage 3 - stemming of the cleaned tokens
stemmed = apply_stemming(cleaned)
print(f"After stemming: {stemmed}")



### Apply text cleaning and stemming to the dataset

In [None]:
# Clean every article once, then stem the cleaned version
cleaned_series = df['full_text'].apply(apply_text_cleaning)
df['cleaned_text'] = cleaned_series
df['stemmed_text'] = cleaned_series.apply(apply_stemming)


### Compute word frequency (using **Counter** method)

In [None]:
# Split each cleaned document into words and flatten them into one list
all_words = list(
    itertools.chain.from_iterable(doc.split() for doc in df['cleaned_text'])
)

# Tally how many times each word occurs across the whole corpus
word_freq = Counter(all_words)

In [None]:
# Rank every word once, from most to least frequent
ranked_words = word_freq.most_common()

# Head of the ranking: the 10 most frequent words
print("Most frequent words:")
print(ranked_words[:10])

# Tail of the ranking: the 10 least frequent words
print("\nLeast frequent words:")
print(ranked_words[-10:])

### 🛠️ Define a function to plot word frequencies

In [None]:
def plot_top_words(word_freq, title, top_n=10, color='#4C72B0'):
    """
    Draw a bar chart of the `top_n` most frequent words.

    Args:
        word_freq (collections.Counter): Word -> count mapping.
        title (str): Figure title.
        top_n (int): Number of words to show.
        color (str): Bar color.

    Raises:
        ValueError: If there is nothing to plot (empty counter or top_n <= 0).
    """
    top_items = word_freq.most_common(top_n)
    # zip(*[]) cannot be unpacked into two names; fail with an explicit message instead
    if not top_items:
        raise ValueError("plot_top_words: no words to plot (empty counter or top_n <= 0)")

    # Get the top N words and their counts
    words, counts = zip(*top_items)

    # Create figure
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x=list(words), y=list(counts), color=color)

    # Annotate bars with counts, lifted by 1% of the tallest bar
    # (offset computed once instead of on every iteration)
    label_offset = max(counts) * 0.01
    for i, count in enumerate(counts):
        ax.text(i, count + label_offset, str(count),
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    # Improve aesthetics
    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel("Words", fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.xticks(rotation=45, fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()
    plt.show()


### Apply plot_top_words to the before/after word frequencies

In [None]:
from collections import Counter

# Word counts on the raw articles and on the cleaned ones
freq_before = Counter(itertools.chain.from_iterable(map(str.split, df['full_text'])))
freq_after = Counter(itertools.chain.from_iterable(map(str.split, df['cleaned_text'])))

# Draw both charts with the same helper: raw text first, cleaned text second
for panel_freq, panel_title, panel_color in (
    (freq_before, "Top 10 Most Frequent Words (Before Filtering)", '#1f77b4'),
    (freq_after, "Top 10 Most Frequent Words (After Filtering)", '#ff7f0e'),
):
    plot_top_words(panel_freq, panel_title, color=panel_color)


### Define frequent and rare word sets

In [None]:
# Create sets of the most and least frequent words.
# Counter.most_common() yields (word, count) pairs; keep only the word so that
# later membership tests against plain string tokens can actually match.
most_common = {word for word, _ in word_freq.most_common(10)}
least_common = {word for word, _ in word_freq.most_common()[-10:]}

print("Words to remove (most frequent):", most_common)
print("Words to remove (least frequent):", least_common)


### 🛠️ Define the filtering functions (remove **least_common** and **unwanted** words)

In [None]:
# Function to remove rare (least frequent) words from a text
def remove_rare_words(text, rare_words=None):
    """
    Drop every token of `text` that belongs to the rare-word collection.

    Args:
        text (str): Whitespace-separated tokens.
        rare_words (collection of str, optional): Words to drop. When omitted,
            the module-level `least_common` set is used.

    Returns:
        str: Remaining tokens joined by single spaces.
    """
    if rare_words is None:
        # Default to the notebook-level set, as the original one-argument call did
        rare_words = least_common
    return ' '.join(word for word in text.split() if word not in rare_words)


In [None]:
# Function to remove a list of unwanted words from a text
def remove_custom_words(text, words_to_remove):
    """
    Drop every token of `text` that appears in `words_to_remove`.

    Args:
        text (str): Whitespace-separated tokens.
        words_to_remove (iterable of str): Words to filter out.

    Returns:
        str: Remaining tokens joined by single spaces.
    """
    # Build the lookup set once so each token check is O(1) instead of a list scan
    blocked = frozenset(words_to_remove)
    return ' '.join(word for word in text.split() if word not in blocked)


### 🧪 Apply filtering and compare results

In [None]:
words_to_remove = ['cè', 'litalia', 'alcune', 'né']

# First pass: drop the rare words from the cleaned texts
no_rare_series = df['cleaned_text'].apply(remove_rare_words)
df['cleaned_text_no_rare'] = no_rare_series

# Second pass: drop the hand-picked unwanted words (list passed as an extra positional argument)
df['final_text'] = no_rare_series.apply(remove_custom_words, args=(words_to_remove,))

# Side-by-side preview of the original, cleaned, and final versions
comparison_columns = ['full_text', 'cleaned_text', 'final_text']
df[comparison_columns].head()

### ☁️ Generate word clouds (before and after)

In [None]:
# Rendering options shared by both clouds
cloud_options = dict(width=800, height=400, background_color='white')

# Corpus text before filtering (raw full_text) and after filtering (final_text)
text_before = ' '.join(df['full_text'])
text_after = ' '.join(df['final_text'])

# Build one word cloud per corpus version
wordcloud_before = WordCloud(**cloud_options).generate(text_before)
wordcloud_after = WordCloud(**cloud_options).generate(text_after)


In [None]:
# One wide figure holding the two clouds side by side
plt.figure(figsize=(16, 6))

# (cloud, title) for each panel, left to right
cloud_panels = (
    (wordcloud_before, "Word Cloud – Before Filtering"),
    (wordcloud_after, "Word Cloud – After Filtering"),
)

for panel_index, (panel_cloud, panel_title) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.imshow(panel_cloud, interpolation='bilinear')
    plt.title(panel_title, fontsize=14)
    plt.axis('off')  # the clouds are images, so hide the axes

plt.tight_layout()
plt.show()


### Save cleaned dataset in **data** folder (_repubblica_cleaned.csv_)

In [None]:
# Keep only the columns to export
# NOTE(review): assumes the loaded CSV provides a 'headline' column - confirm against the dataset
export_columns = ['headline', 'full_text', 'final_text']
df_cleaned = df[export_columns]

# Write the result next to the input sample, without the index column
df_cleaned.to_csv("data/repubblica_cleaned.csv", index=False)