Toxic Comment Classification

Import modules

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist, bigrams, word_tokenize, classify, NaiveBayesClassifier, ConfusionMatrix
from sklearn.model_selection import train_test_split
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.probability import ConditionalFreqDist
import nltk
from nltk.corpus import stopwords
import re
from collections import Counter, defaultdict

In [None]:
# Fetch every NLTK resource the notebook relies on. Legacy and current resource
# names are both requested so the notebook runs across NLTK releases:
#   - stopwords: English stop-word list
#   - punkt / punkt_tab: sentence/word tokenizer models used by word_tokenize
#     (newer NLTK releases look for 'punkt_tab' instead of 'punkt')
#   - averaged_perceptron_tagger(_eng): model used by nltk.pos_tag
for resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

1. Data Download and Loading

1.1 Loading Data with pandas

In [None]:
def load_data(train_path, test_path):
    """Read the train and test CSV files and flag unlabeled train comments.

    Args:
        train_path: Path of the training CSV; it must contain the six
            toxicity label columns.
        test_path: Path of the test CSV.

    Returns:
        A ``(df_train, df_test)`` tuple. ``df_train`` gains a ``non_toxic``
        column that is 1 when all six toxicity labels are 0, else 0.
    """
    toxicity_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    train_frame, test_frame = (pd.read_csv(path) for path in (train_path, test_path))

    # A row whose six labels sum to zero carries no toxicity label at all.
    labels_per_row = train_frame[toxicity_columns].sum(axis=1)
    train_frame['non_toxic'] = labels_per_row.eq(0).astype(int)

    return train_frame, test_frame

# Load both splits from the local Dataset/ folder (paths are relative to the notebook).
df_train, df_test = load_data(train_path='Dataset/train.csv', test_path='Dataset/test.csv')

1.2 Initial Inspection

Display dimensions:

In [None]:
# Report (rows, columns) for each split.
for split_name, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} shape:", split_frame.shape)

Show first rows:

In [None]:
# Rich display of the first five training rows.
display(df_train.head(n=5))

Null values by column in train:

In [None]:
# Count missing values per column (isna is the same check as isnull).
print(df_train.isna().sum())

Label distribution in train:

In [None]:
# Six original labels plus the derived 'non_toxic' flag.
label_cols = [
    'toxic', 'severe_toxic', 'obscene', 'threat',
    'insult', 'identity_hate', 'non_toxic',
]

# Labels are 0/1, so a column sum is the number of comments carrying that label.
comments_per_label = df_train[label_cols].sum()
print(comments_per_label.sort_values(ascending=False))

In [None]:
# Bar chart of how many comments carry each label.
# The column sums are computed once and reused for both axes.
label_totals = df_train[label_cols].sum()

plt.figure(figsize=(10, 6))
sns.barplot(x=label_totals.index, y=label_totals.values)
# Single title: the earlier 'Label distribution' title was immediately
# overwritten by this one and never displayed.
plt.title('Comment Distribution by Label')
plt.ylabel('Number of Comments')
plt.xlabel('Labels')
plt.show()

2. First Analysis

2.1 Missing Values and Null Checks

In [None]:
# Missing values per column; left as the last expression so the cell renders it.
df_train.isna().sum()

2.2 Class Distribution

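The dataset has six target columns: toxic, severe_toxic, obscene, threat, insult, identity_hate. A seventh column, non_toxic, is derived at load time and marks comments that carry none of the six labels.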

Display the count and proportion of each class:

In [None]:
label_cols = [
    'toxic', 'severe_toxic', 'obscene', 'threat',
    'insult', 'identity_hate', 'non_toxic',
]

# Absolute number of comments per label, most frequent first.
class_counts = df_train[label_cols].sum().sort_values(ascending=False)
# Share of all training comments carrying each label.
class_props = class_counts.div(len(df_train))

# Side-by-side table; both series share the same index, so rows line up.
print(pd.DataFrame({'count': class_counts, 'proportion': class_props}))

2.3 Comment Length Analysis

Compute length of each comment (in characters and words):

In [None]:
# Length of every comment in characters and in whitespace-separated words.
comment_series = df_train['comment_text']
df_train['char_count'] = comment_series.map(len)
df_train['word_count'] = comment_series.map(lambda comment: len(comment.split()))

length_columns = ['char_count', 'word_count']
display(df_train[length_columns].describe())

Plot histograms of comment lengths:

In [None]:
# Histogram of words per comment, labelled through the Axes returned by pandas.
plt.figure(figsize=(8, 4))
word_count_ax = df_train['word_count'].hist(bins=50)
word_count_ax.set_title('Distribution of Comment Word Counts')
word_count_ax.set_xlabel('Word Count')
word_count_ax.set_ylabel('Frequency')
plt.show()

2.4 Sample Comments by Category

Show example comments for each label where the label is 1:

In [None]:
# Print two reproducibly sampled comments for every label.
for col in label_cols:
    print(f"\nExamples of {col} comments:")
    positive_comments = df_train.loc[df_train[col] == 1, 'comment_text']
    # Fixed seed so the same examples appear on every run.
    for ex in positive_comments.sample(2, random_state=42):
        print("- ", ex)

3. Create corpus

In [None]:
# Root directory that holds the corpus, one sub-folder per label.
corpus_dir = 'corpus_by_label'
os.makedirs(corpus_dir, exist_ok=True)

# Write the comments of each label to <corpus_dir>/<label>/<label>.txt,
# one comment per line.
for label in label_cols:
    comments = df_train[df_train[label] == 1]['comment_text'].dropna()

    # Folder named after the label (the corpus reader derives the category from it).
    label_dir = os.path.join(corpus_dir, label)
    os.makedirs(label_dir, exist_ok=True)

    # Single text file inside that folder.
    file_path = os.path.join(label_dir, f"{label}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        for comment in comments:
            # Flatten the comment on every line boundary str.splitlines()
            # recognises (\n, \r, \x0b, \x0c, \u2028, ...), not only '\n':
            # the tagging step later re-splits these files with splitlines(),
            # so any leftover separator would cut one comment into several.
            f.write(' '.join(comment.splitlines()) + '\n')

# The files were written, not loaded; report that accurately.
print(f"Corpus successfully saved in: {corpus_dir}")

In [None]:
# Open the on-disk corpus: every .txt file is a document and the name of its
# parent folder is used as the document's category.
text_file_pattern = r'.*\.txt'
folder_category_pattern = r"([^/.]+)/.*"

corpus = CategorizedPlaintextCorpusReader(
    corpus_dir,
    text_file_pattern,
    cat_pattern=folder_category_pattern,
)

In [None]:
# Sanity check: list the files found and the categories derived from them.
corpus_file_ids = corpus.fileids()
print("File IDs:", corpus_file_ids)
corpus_category_names = corpus.categories()
print("Categories:", corpus_category_names)

In [None]:
# Peek at the first ten tokens of the 'toxic' category.
print("Primeras 10 palabras de 'toxic':")
toxic_words = corpus.words(categories='toxic')
print(toxic_words[:10])

In [None]:
# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lower-case every token of the whole corpus ...
words = list(map(str.lower, corpus.words()))
# ... then keep purely alphabetic tokens that are not stop words.
corpus_norm = [token for token in words if token.isalpha() and token not in stop_words]

def normalizer(category):
    """Return the normalized tokens of one corpus category.

    Tokens are lower-cased; only purely alphabetic tokens that are not in
    ``stop_words`` are kept, in corpus order.

    Args:
        category: Category name passed to ``corpus.words(categories=...)``.

    Returns:
        List of normalized token strings.
    """
    kept_tokens = []
    for raw_token in corpus.words(categories=category):
        lowered = raw_token.lower()
        if lowered.isalpha() and lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

# Normalized token list for every category, keyed by category name.
categories_norm = {
    category_name: normalizer(category_name)
    for category_name in (
        "identity_hate",
        "insult",
        "obscene",
        "severe_toxic",
        "threat",
        "toxic",
        "non_toxic",
    )
}


4. Corpus Analysis

Number of words and vocabulary

In [None]:
# Corpus-wide totals: running tokens and distinct word forms.
all_corpus_words = corpus.words()
print(f"Word tokens in the corpus: {len(all_corpus_words)}")
print(f"Wordforms in the corpus: {len(set(all_corpus_words))}\n")

# Same two figures for each category.
for category in corpus.categories():
    category_words = corpus.words(categories=category)
    print(f"{category}:")
    print(f"Word tokens in {category}: {len(category_words)}")
    print(f"Wordforms in {category}: {len(set(category_words))}\n")

Lexical diversity

In [None]:
def lexical_diversity(words):
    """Return the type/token ratio of a token sequence.

    Args:
        words: A sized sequence of hashable tokens.

    Returns:
        Number of distinct tokens divided by the total number of tokens, as a
        float in [0, 1]. An empty sequence yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total_tokens = len(words)
    if total_tokens == 0:
        # Nothing to measure; avoid dividing by zero for an empty category.
        return 0.0
    return len(set(words)) / total_tokens

# Lexical diversity of the whole corpus, as a percentage rounded to 2 decimals.
overall_diversity_pct = round(100 * lexical_diversity(corpus.words()), 2)
print(f"Total corpus lexical diversity: {overall_diversity_pct}%\n")

# Same measure per category.
for category in corpus.categories():
    category_diversity_pct = round(100 * lexical_diversity(corpus.words(categories=category)), 2)
    print(f"{category} lexical diversity: {category_diversity_pct}%\n")

Word lengths

In [None]:
def describe_word_lengths(tokens, long_threshold):
    """Print word-length statistics for a list of normalized tokens.

    Reports the ten most common lengths, the most frequent length with its
    count and proportion, sample tokens shorter than / equal to / longer than
    that length, the longest token (truncated to 25 characters), and how many
    tokens exceed ``long_threshold`` characters.

    Args:
        tokens: List of token strings.
        long_threshold: Tokens strictly longer than this many characters are
            counted as "long" words.
    """
    if not tokens:
        # FreqDist.max() and max() both fail on empty input.
        print("No tokens to analyse.\n")
        return

    # Frequency distribution of token lengths
    fd_len = FreqDist(len(w) for w in tokens)

    # 10 most common lengths
    print(fd_len.most_common(10))

    # Most frequent length and its frequency
    freq_len = fd_len.max()
    print(f"Most common lenght: {freq_len} ({fd_len[freq_len]} words)")
    print(f"Lenght proportion {freq_len}: {round(100 * fd_len.freq(freq_len), 2)}%")

    short_tokens = [w for w in tokens if len(w) < freq_len]
    print(f"Tokens with length < {freq_len}: {short_tokens[:10]}")

    common_tokens = [w for w in tokens if len(w) == freq_len]
    print(f"Tokens with length == {freq_len}: {common_tokens[:10]}")

    long_tokens = [w for w in tokens if len(w) > freq_len]
    print(f"Tokens with length > {freq_len}: {long_tokens[:5]}")

    longest_word = max(tokens, key=len)
    print(f"The longest word is: '{longest_word[:25]}...' with {len(longest_word)} characters.")

    long_words = [w for w in tokens if len(w) > long_threshold]
    print(f"\nTotal words longer than {long_threshold} characters: {len(long_words)}")
    print(f"Unique words longer than {long_threshold} characters: {len(set(long_words))}\n")


# Whole corpus: "long" means more than 15 characters.
print("Total corpus length analysis:\n")
describe_word_lengths(corpus_norm, long_threshold=15)

# Per category: "long" means more than 20 characters.
for key, category in categories_norm.items():
    print(f"{key} length analysis :\n")
    describe_word_lengths(category, long_threshold=20)


In [None]:
# Longest word length shown on the x axis
max_len = 20
# x values are the same for every category, so build them once
lengths = list(range(1, max_len + 1))

plt.figure(figsize=(12, 6))

# One line per toxic category; 'non_toxic' is left out of the plot
for key, category in categories_norm.items():
    if key != 'non_toxic':
        fd_len = FreqDist(len(w) for w in category if len(w) <= max_len)
        plt.plot(lengths, [fd_len[l] for l in lengths], label=key)

# Titles, legend and layout
plt.title("Word Length Frequency by Category (excluding 'non_toxic')")
plt.xlabel("Word Length (characters)")
plt.ylabel("Frequency")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Most Common Words

In [None]:
# Ten most frequent tokens in the raw corpus (punctuation and stop words included).
corpus_fdist = FreqDist(corpus.words())
print(corpus_fdist.most_common(10))

# Ten most frequent tokens after normalization.
corpus_norm_fdist = FreqDist(corpus_norm)
print(corpus_norm_fdist.most_common(10))

def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    Args:
        count: Size of the part.
        total: Size of the whole; must be non-zero.

    Returns:
        ``100 * count / total`` as a float.
    """
    scaled_count = 100 * count
    return scaled_count / total

total_words = len(corpus.words())

# Top five raw tokens of each category, with their share of that category.
for category in corpus.categories():
    words = corpus.words(categories=category)
    total = len(words)
    fdist = FreqDist(words)
    print(f"\nTop 5 words in {category}:")
    for word, freq in fdist.most_common(5):
        share = percentage(freq, total)
        print(f"{word}: {freq} ({share:.2f}%)")

# Same report on the normalized token lists.
for key, category in categories_norm.items():
    total = len(category)
    fdist = FreqDist(category)
    print(f"\nTop 5 words in {key} (normalized):")
    for word, freq in fdist.most_common(5):
        share = percentage(freq, total)
        print(f"{word}: {freq} ({share:.2f}%)")

Most common bigrams

In [None]:
# Whole raw corpus: ten most frequent adjacent token pairs.
# Pairs are taken from the flat token stream returned by corpus.words().
corpus_bigrams = list(bigrams(corpus.words()))
corpus_bigram_fdist = FreqDist(corpus_bigrams)
print(corpus_bigram_fdist.most_common(10))

# Whole normalized corpus.
corpus_norm_bigrams = list(bigrams(corpus_norm))
corpus_norm_bigram_fdist = FreqDist(corpus_norm_bigrams)
print(corpus_norm_bigram_fdist.most_common(10))

# Raw corpus, one report per category.
for category in corpus.categories():
    bigrams_cat = list(bigrams(corpus.words(categories=category)))
    fdist = FreqDist(bigrams_cat)
    total = len(bigrams_cat)
    print(f"\nTop 5 bigrams in {category}:")
    for pair, pair_count in fdist.most_common(5):
        share = percentage(pair_count, total)
        print(f"{pair}: {pair_count} ({share:.2f}%)")

# Normalized corpus, one report per category.
for key, category_words in categories_norm.items():
    bigrams_cat = list(bigrams(category_words))
    fdist = FreqDist(bigrams_cat)
    total = len(bigrams_cat)
    print(f"\nTop 5 bigrams in {key} (normalized):")
    for pair, pair_count in fdist.most_common(5):
        share = percentage(pair_count, total)
        print(f"{pair}: {pair_count} ({share:.2f}%)")

Most common words dispersion

In [None]:
# Ten most frequent normalized words of the whole corpus.
corpus_fdist = nltk.FreqDist(corpus_norm)
top_words = [word for word, freq in corpus_fdist.most_common(10)]

# top_words come from the lower-cased token list, so the plotted text is
# lower-cased as well; otherwise capitalised or ALL-CAPS occurrences of the
# same word would not be matched (the plot compares tokens as given).
nltk.Text([w.lower() for w in corpus.words()]).dispersion_plot(top_words)
plt.title("Lexical Dispersion Plot for the whole corpus")
plt.show()

# One dispersion plot per category, using that category's own top ten words.
for key, category in categories_norm.items():
    corpus_fdist = nltk.FreqDist(category)
    top_words = [word for word, freq in corpus_fdist.most_common(10)]

    ax = nltk.Text([w.lower() for w in corpus.words(categories=key)]).dispersion_plot(top_words)
    plt.title(f"Lexical Dispersion Plot for {key}")
    plt.show()


Uppercase

In [None]:
# Percentage of alphabetic characters that are upper-case, per category.
cfd_upper = ConditionalFreqDist()

for category in corpus.categories():
    total_letters = 0
    uppercase_letters = 0
    # Single pass over the category, counting both quantities character by character.
    for token in corpus.words(categories=category):
        for char in token:
            total_letters += char.isalpha()
            uppercase_letters += char.isupper()
    if total_letters > 0:
        cfd_upper[category]['uppercase_percentage'] = (uppercase_letters / total_letters) * 100

for category in corpus.categories():
    upper_pct = cfd_upper[category]['uppercase_percentage']
    print(f"{category}: {upper_pct:.2f}%")

Exclamation and question marks

In [None]:
# Share of '!' and '?' among all characters of each category's space-joined text.
cfd_punct = ConditionalFreqDist()

for category in corpus.categories():
    text = ' '.join(corpus.words(categories=category))
    total_chars = len(text)
    if not total_chars:
        continue
    for feature_name, mark in (('excl_pct', '!'), ('ques_pct', '?')):
        cfd_punct[category][feature_name] = text.count(mark) / total_chars * 100

for cat in corpus.categories():
    punct_stats = cfd_punct[cat]
    print(
        f"{cat}:\n"
        f"! : {punct_stats['excl_pct']:.2f}%\n"
        f"? : {punct_stats['ques_pct']:.2f}% \n"
    )

Repetitions

In [None]:
# Character and word repetition rates per category.
cfd_reps = ConditionalFreqDist()

for category in corpus.categories():
    words = corpus.words(categories=category)
    text = ' '.join(words)
    # Guard BOTH denominators: an empty category must not raise
    # ZeroDivisionError (previously only total_words was protected).
    total_chars = len(text) or 1
    total_words = len(words) or 1

    # Runs of the same character repeated three or more times (e.g. "!!!", "sooo").
    char_reps = len(re.findall(r'(.)\1{2,}', text))
    # A word immediately followed by one or more copies of itself, case-insensitively.
    word_reps = len(re.findall(r'\b(\w+)( \1\b)+', text.lower()))

    cfd_reps[category]['char_reps_pct'] = char_reps / total_chars * 100
    cfd_reps[category]['word_reps_pct'] = word_reps / total_words * 100

for cat in corpus.categories():
    print(
        f"{cat}:\n"
        f"char repeats: {cfd_reps[cat]['char_reps_pct']:.2f}% \n"
        f"word repeats: {cfd_reps[cat]['word_reps_pct']:.2f}% \n"
    )

5. Tagging corpus and analysis

In [None]:
# Tokenization and POS tagging.
# Each non-blank line of a corpus file is one comment and is treated as one
# "sentence". total_sents is a flat list filled in corpus.fileids() order;
# file_to_sent_count records how many entries each file contributed.
total_sents = []
file_to_sent_count = {}

for fid in corpus.fileids():
    file_sents = [
        word_tokenize(line)
        for line in corpus.raw(fid).splitlines()
        if line.strip()
    ]
    total_sents.extend(file_sents)
    file_to_sent_count[fid] = len(file_sents)

# Tag everything in one batch call rather than calling nltk.pos_tag once per
# comment; the resulting list is aligned index-by-index with total_sents.
tagged_sents = nltk.pos_tag_sents(total_sents)

In [None]:
# Save tagged corpus in subfolders by category
output_dir = 'tagged_corpus'
os.makedirs(output_dir, exist_ok=True)

offset = 0
for category in corpus.categories():
    category_dir = os.path.join(output_dir, category)
    os.makedirs(category_dir, exist_ok=True)

    # Get all files for the current category
    fileids = corpus.fileids(categories=[category])
    all_comments = []

    for fid in fileids:
        comments = [c for c in corpus.raw(fid).splitlines() if c.strip()]
        all_comments.extend(comments)

    # Save combined tagged output for this category
    output_file = os.path.join(category_dir, f"{category}_tagged.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        for sent in tagged_sents[offset:offset + len(all_comments)]:
            f.write(' '.join(f"{w}/{t}" for w, t in sent) + '\n')
    offset += len(all_comments)

In [None]:
# Load corpus
tagged_corpus = CategorizedPlaintextCorpusReader(
    output_dir,
    r'.*\.txt',
    cat_pattern=r"([^/.]+)/.*"
)

Analysis

In [None]:

# Small hand-made lexicon of adjectives treated as negative.
negative_adjectives = {"bad", "satanistic", "antisemmitian", "racist", "evil", "nasty", "ugly", "fat", "nazi", "nigger", "jew", "gay"}
# Heuristic: an adverb ending in "ly" is counted as a manner adverb.
mode_adverb_check = lambda w: w.endswith("ly")

# category -> Counter of coarse POS classes / of the specific features below
pos_summary = defaultdict(Counter)
specific_summary = defaultdict(Counter)

for category in tagged_corpus.categories():
    for fileid in tagged_corpus.fileids(categories=[category]):
        for token in tagged_corpus.raw(fileid).split():
            # Tokens are stored as "word/TAG". Split on the LAST slash because
            # the word itself may contain one, and skip anything without a
            # slash instead of failing on the unpacking.
            word, slash, tag = token.rpartition("/")
            if not slash:
                continue

            word_lower = word.lower()

            if tag.startswith("NN"):
                pos_summary[category]["Noun"] += 1
            elif tag.startswith("VB"):
                pos_summary[category]["Verb"] += 1
                # Base-form verbs (tag exactly "VB") are used as a proxy for imperatives.
                if tag == "VB":
                    specific_summary[category]["Imperative Verbs"] += 1
            elif tag.startswith("JJ"):
                pos_summary[category]["Adjective"] += 1
                if word_lower in negative_adjectives:
                    specific_summary[category]["Negative Adjectives"] += 1
            elif tag.startswith("RB"):
                pos_summary[category]["Adverb"] += 1
                if mode_adverb_check(word_lower):
                    specific_summary[category]["Manner Adverbs"] += 1
            elif tag in {"PRP", "PRP$", "WP", "WP$"}:
                pos_summary[category]["Pronoun"] += 1

# Rows = categories, columns = POS classes; percentages are row-wise.
df_pos = pd.DataFrame(pos_summary).T.fillna(0)
df_pos_percent = df_pos.div(df_pos.sum(axis=1), axis=0) * 100

# Each specific feature is reported as a share of ITS OWN part of speech
# (imperatives / verbs, negative adjectives / adjectives, manner adverbs /
# adverbs), matching the "% of related POS" label below and the per-comment
# features computed by extract_pos_features. Previously all three were divided
# by the combined Verb + Adjective + Adverb total.
df_specific = pd.DataFrame(specific_summary).T.fillna(0)
related_pos = {
    "Imperative Verbs": "Verb",
    "Negative Adjectives": "Adjective",
    "Manner Adverbs": "Adverb",
}
df_specific_percent = pd.DataFrame({
    feature: df_specific[feature] / df_pos[pos] * 100
    for feature, pos in related_pos.items()
    if feature in df_specific.columns and pos in df_pos.columns
})

print("POS distribution (%) by category:")
display(df_pos_percent.round(2))

print("\nSpecific features (% of related POS) by category:")
display(df_specific_percent.round(2))

Classifier

In [None]:
def load_data(train_path):
    """Read the training CSV and return it together with the label names.

    Note: this redefines the two-argument ``load_data`` declared earlier in
    the notebook; from this cell on the name refers to this version.

    Args:
        train_path: Path of the training CSV; it must contain the six
            toxicity label columns.

    Returns:
        A ``(df_train, labels)`` tuple. ``df_train`` gains a ``non_toxic``
        column (1 when all six toxicity labels are 0) and ``labels`` is the
        list of the six label names followed by ``'non_toxic'``.
    """
    base_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    train_frame = pd.read_csv(train_path)
    # Rows whose six labels sum to zero carry no toxicity label.
    without_any_label = train_frame[base_labels].sum(axis=1).eq(0)
    train_frame['non_toxic'] = without_any_label.astype(int)

    return train_frame, [*base_labels, 'non_toxic']

# Fresh copy of the training data plus the seven label names used by the classifier.
df, all_labels = load_data(train_path='Dataset/train.csv')

In [None]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

def tokenize_and_clean(text):
    """Tokenize a comment and keep only its content words.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in ``stop_words`` are dropped.

    Args:
        text: Raw comment string.

    Returns:
        List of the remaining lower-cased tokens, in order.
    """
    lowered_tokens = word_tokenize(text.lower())
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in stop_words
    ]

# Corpus-wide vocabulary used by the word and bigram feature extractors.
global_tokens = []
global_bigrams = []
for text in df['comment_text'].dropna():
    comment_tokens = tokenize_and_clean(text)
    global_tokens.extend(comment_tokens)
    # Pair tokens within one comment only. Building bigrams over the
    # concatenated token stream would also count the pair formed by the last
    # word of one comment and the first word of the next, which
    # extract_bigram_features (working per comment) can never observe.
    global_bigrams.extend(bigrams(comment_tokens))

# Ten most frequent cleaned words.
global_fdist = FreqDist(global_tokens)
global_top_words = [w for w, _ in global_fdist.most_common(10)]

# Ten most frequent within-comment bigrams.
global_bigram_fdist = FreqDist(global_bigrams)
global_top_bigrams = [bg for bg, _ in global_bigram_fdist.most_common(10)]

In [None]:
def extract_length_features(text):
    """Describe the word-length profile of one comment.

    Args:
        text: Raw comment string; it is cleaned with ``tokenize_and_clean``.

    Returns:
        Dict with the most common word length (``most_common_len``), how many
        words have it (``most_common_freq``), that count as a percentage of
        all words (``most_common_prop``) and the number of words shorter
        (``short_count``) and longer (``long_count``) than it. All values are
        zero when no token survives cleaning.
    """
    word_lengths = [len(token) for token in tokenize_and_clean(text)]

    if not word_lengths:
        return {
            'most_common_len': 0,
            'most_common_freq': 0,
            'most_common_prop': 0.0,
            'short_count': 0,
            'long_count': 0,
        }

    length_dist = FreqDist(word_lengths)
    modal_len = length_dist.max()
    modal_freq = length_dist[modal_len]

    shorter = len([n for n in word_lengths if n < modal_len])
    longer = len([n for n in word_lengths if n > modal_len])

    return {
        'most_common_len': modal_len,
        'most_common_freq': modal_freq,
        'most_common_prop': modal_freq / len(word_lengths) * 100,
        'short_count': shorter,
        'long_count': longer,
    }

def extract_common_word_features(text):
    """Count how often each globally frequent word occurs in one comment.

    Args:
        text: Raw comment string; it is cleaned with ``tokenize_and_clean``.

    Returns:
        Dict with, for every word in ``global_top_words``, ``count_<word>``
        (occurrences) and ``prop_<word>`` (occurrences as a percentage of the
        comment's cleaned tokens; the denominator is 1 for an empty comment).
    """
    tokens = tokenize_and_clean(text)
    occurrences = Counter(tokens)
    denominator = len(tokens) or 1

    feats = {}
    for word in global_top_words:
        hits = occurrences[word]  # Counter returns 0 for absent words
        feats[f'count_{word}'] = hits
        feats[f'prop_{word}'] = hits / denominator * 100
    return feats

def extract_bigram_features(text):
    """Count how often each globally frequent bigram occurs in one comment.

    Args:
        text: Raw comment string; it is cleaned with ``tokenize_and_clean``.

    Returns:
        Dict with, for every pair in ``global_top_bigrams``,
        ``bigram_<w1>_<w2>_count`` and ``bigram_<w1>_<w2>_prop`` (count as a
        percentage of the comment's bigrams; the denominator is 1 when the
        comment has none).
    """
    pair_counts = Counter(bigrams(tokenize_and_clean(text)))
    denominator = sum(pair_counts.values()) or 1

    feats = {}
    for first, second in global_top_bigrams:
        hits = pair_counts[(first, second)]  # 0 when the pair is absent
        key = f"bigram_{first}_{second}"
        feats[f'{key}_count'] = hits
        feats[f'{key}_prop'] = hits / denominator * 100
    return feats

def extract_uppercase_features(text):
    """Measure how much of a comment is written in capitals.

    Args:
        text: Raw comment string.

    Returns:
        Dict with ``uppercase_pct``: upper-case letters as a percentage of
        all alphabetic characters, or 0.0 when the text has no letters.
    """
    alpha_total = 0
    upper_total = 0
    for char in text:
        if char.isalpha():
            alpha_total += 1
            if char.isupper():
                upper_total += 1

    if alpha_total == 0:
        return {'uppercase_pct': 0.0}
    return {'uppercase_pct': upper_total / alpha_total * 100}

def extract_punctuation_features(text):
    """Measure how dense exclamation and question marks are in a comment.

    Args:
        text: Raw comment string.

    Returns:
        Dict with ``excl_pct`` and ``ques_pct``: occurrences of '!' and '?'
        as a percentage of the text's characters (denominator 1 for an empty
        string).
    """
    denominator = len(text) or 1
    shares = {}
    for feature_name, mark in (('excl_pct', '!'), ('ques_pct', '?')):
        shares[feature_name] = text.count(mark) / denominator * 100
    return shares

def extract_repetition_features(text):
    """Measure character and word repetition in one comment.

    The comment is cleaned with ``tokenize_and_clean`` and re-joined with
    single spaces before matching.

    Args:
        text: Raw comment string.

    Returns:
        Dict with ``char_reps_pct`` (runs of one character repeated three or
        more times, as a percentage of the joined text length) and
        ``word_reps_pct`` (words immediately repeated, as a percentage of the
        token count). Both denominators fall back to 1 when empty.
    """
    tokens = tokenize_and_clean(text)
    joined = ' '.join(tokens)

    char_runs = re.findall(r'(.)\1{2,}', joined)
    word_runs = re.findall(r'\b(\w+)( \1\b)+', joined.lower())

    char_denominator = len(joined) or 1
    word_denominator = len(tokens) or 1

    return {
        'char_reps_pct': len(char_runs) / char_denominator * 100,
        'word_reps_pct': len(word_runs) / word_denominator * 100,
    }

# Hand-made lexicon of adjectives counted as negative by extract_pos_features.
negative_adjectives = {
    "bad", "satanistic", "antisemmitian", "racist", "evil", "nasty",
    "ugly", "fat", "nazi", "nigger", "jew", "gay",
}

def mode_adverb_check(w):
    """Return True when the word ends in "ly" (manner-adverb heuristic)."""
    return w.endswith("ly")

def extract_pos_features(text):
    """Compute part-of-speech features for one comment.

    The comment is tagged on its plain ``word_tokenize`` tokens, the same way
    the corpus is tagged in section 5. Tagging the output of
    ``tokenize_and_clean`` instead would strip stop words first, and the
    English stop list contains the personal and wh- pronouns, so
    ``pronoun_count`` would stay at (almost) zero.

    Args:
        text: Raw comment string.

    Returns:
        Dict with the counts ``noun_count``, ``verb_count``, ``adj_count``,
        ``adv_count`` and ``pronoun_count`` plus three percentages, each
        relative to its own part of speech and 0.0 when that part of speech
        is absent: ``imperative_pct`` (base-form verbs / verbs),
        ``neg_adj_pct`` (adjectives found in ``negative_adjectives`` /
        adjectives) and ``manner_adv_pct`` (adverbs ending in "ly" / adverbs).
    """
    tags = nltk.pos_tag(word_tokenize(text))

    counts = Counter()
    for word, tag in tags:
        if tag.startswith("NN"):
            counts['noun_count'] += 1
        elif tag.startswith("VB"):
            counts['verb_count'] += 1
            # Base-form verbs (tag exactly "VB") are used as a proxy for imperatives.
            if tag == 'VB':
                counts['imperative_count'] += 1
        elif tag.startswith("JJ"):
            counts['adj_count'] += 1
            if word.lower() in negative_adjectives:
                counts['neg_adj_count'] += 1
        elif tag.startswith("RB"):
            counts['adv_count'] += 1
            if mode_adverb_check(word.lower()):
                counts['manner_adv_count'] += 1
        elif tag in {"PRP", "PRP$", "WP", "WP$"}:
            counts['pronoun_count'] += 1

    feats = {
        'noun_count': counts['noun_count'],
        'verb_count': counts['verb_count'],
        'adj_count': counts['adj_count'],
        'adv_count': counts['adv_count'],
        'pronoun_count': counts['pronoun_count'],
        'imperative_pct': counts['imperative_count'] / counts['verb_count'] * 100 if counts['verb_count'] else 0.0,
        'neg_adj_pct': counts['neg_adj_count'] / counts['adj_count'] * 100 if counts['adj_count'] else 0.0,
        'manner_adv_pct': counts['manner_adv_count'] / counts['adv_count'] * 100 if counts['adv_count'] else 0.0
    }
    return feats


def extract_features(text):
    """Build the full feature dict of one comment.

    Runs every feature extractor on ``text`` and merges the results; the
    extractors run in a fixed order and later ones would overwrite an earlier
    key of the same name.

    Args:
        text: Raw comment string.

    Returns:
        Dict mapping feature names to values.
    """
    extractors = (
        extract_length_features,
        extract_common_word_features,
        extract_bigram_features,
        extract_uppercase_features,
        extract_punctuation_features,
        extract_repetition_features,
        extract_pos_features,
    )
    feats = {}
    for extractor in extractors:
        feats.update(extractor(text))
    return feats


In [None]:
# Build (features, label) pairs: a comment contributes one pair per label it carries.
labeled = []
for _, row in df[['comment_text'] + all_labels].dropna().iterrows():
    text = row['comment_text']
    active_labels = [lbl for lbl in all_labels if row[lbl] == 1]
    if not active_labels:
        continue
    # Feature extraction (tokenizing and POS tagging) is the expensive step,
    # so run it once per comment rather than once per label. The same dict is
    # shared by all pairs of this comment.
    feats = extract_features(text)
    labeled.extend((feats, lbl) for lbl in active_labels)


In [None]:
# Label of every example, used to keep class proportions equal in both splits.
labels = [example[1] for example in labeled]

# 80/20 stratified split with a fixed seed for reproducibility.
train_set, test_set = train_test_split(
    labeled,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Fit NLTK's Naive Bayes classifier on the training pairs.
clf = NaiveBayesClassifier.train(train_set)

# Accuracy on the held-out pairs.
test_accuracy = classify.accuracy(clf, test_set)
print("Accuracy:", test_accuracy)

print("Most informative features:")
clf.show_most_informative_features(n=10)

In [None]:
# Gold labels and predictions for the test pairs, in the same order.
ref = [gold for _features, gold in test_set]
pred = clf.classify_many([features for features, _gold in test_set])

print(ConfusionMatrix(ref, pred))

In [None]:
# Collect (true label, predicted label) for every misclassified test example.
# The loop variables deliberately avoid the name `pred`: rebinding it here
# would replace the list of predictions with a single label string and break
# this cell (and the confusion-matrix cell's output) on a re-run.
errors = [
    (true_label, predicted_label)
    for (_features, true_label), predicted_label in zip(test_set, pred)
    if true_label != predicted_label
]
print(f"Total errores: {len(errors)}")
print("5 primeros errores:")
for true_label, predicted_label in errors[:5]:
    print(f"True: {true_label}, Pred: {predicted_label}")