Chapter 6:

In [8]:
# 1) Instalación de librerías y descargas de NLTK
import nltk
import pandas as pd
import random
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy

nltk.download('punkt')
nltk.download('punkt_tab')

# 2) Carga del dataset
def load_data(path='Dataset/train.csv', sample_size=20000):
    df = pd.read_csv(path)
    df = df[['comment_text', 'toxic']].dropna().reset_index(drop=True)
    df['label'] = df['toxic'].map({1: 'toxic', 0: 'non_toxic'})
    return df.sample(sample_size, random_state=42).reset_index(drop=True)

# Cargar datos
df = load_data()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hugom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [11]:
# 3) Preprocesado y tokenización de los comentarios
import string

def preprocess_and_tokenize(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    # Tokenizar por palabras
    return nltk.word_tokenize(text)

# Aplicar preprocesado y tokenización
df['tokens'] = df['comment_text'].apply(preprocess_and_tokenize)

# Visualizamos un ejemplo
print(df[['comment_text','tokens']].head(1).to_dict(orient='records')[0])

{'comment_text': "Geez, are you forgetful!  We've already discussed why Marx  was  not an anarchist, i.e. he wanted to use a State to mold his 'socialist man.'  Ergo, he is a statist - the opposite of an  anarchist.  I know a guy who says that, when he gets old and his teeth fall out, he'll quit eating meat.  Would you call him a vegetarian?", 'tokens': ['geez', 'are', 'you', 'forgetful', 'weve', 'already', 'discussed', 'why', 'marx', 'was', 'not', 'an', 'anarchist', 'ie', 'he', 'wanted', 'to', 'use', 'a', 'state', 'to', 'mold', 'his', 'socialist', 'man', 'ergo', 'he', 'is', 'a', 'statist', 'the', 'opposite', 'of', 'an', 'anarchist', 'i', 'know', 'a', 'guy', 'who', 'says', 'that', 'when', 'he', 'gets', 'old', 'and', 'his', 'teeth', 'fall', 'out', 'hell', 'quit', 'eating', 'meat', 'would', 'you', 'call', 'him', 'a', 'vegetarian']}


In [12]:
# 4) Construcción del vocabulario (2000 palabras más frecuentes)
all_words = FreqDist(w for tokens in df['tokens'] for w in tokens)
word_features = list(all_words)[:5000]

print("Tamaño de vocabulario:", len(word_features))
print("Primeras 10 features:", word_features[:10])

Tamaño de vocabulario: 2000
Primeras 10 features: ['the', 'to', 'of', 'a', 'and', 'you', 'i', 'is', 'that', 'in']


In [13]:
# 5) Definición del extractor de características
def document_features(tokens):
    token_set = set(tokens)
    return {f'contains({w})': (w in token_set) for w in word_features}

# Probamos el extractor con un ejemplo
test_feats = document_features(df.loc[0,'tokens'])
print(list(test_feats.items())[:10])

[('contains(the)', True), ('contains(to)', True), ('contains(of)', True), ('contains(a)', True), ('contains(and)', True), ('contains(you)', True), ('contains(i)', True), ('contains(is)', True), ('contains(that)', True), ('contains(in)', False)]


In [14]:
# 6) Preparar feature‐sets y división en train / dev‐test / test
featuresets = [(document_features(toks), lbl)
               for toks, lbl in zip(df['tokens'], df['label'])]
random.shuffle(featuresets)

n = len(featuresets)
n_train = int(0.7 * n)
n_dev = int(0.15 * n)

train_set = featuresets[:n_train]
dev_set   = featuresets[n_train:n_train+n_dev]
test_set  = featuresets[n_train+n_dev:]

print("Entrenamiento:", len(train_set), "Dev-test:", len(dev_set), "Test:", len(test_set))

Entrenamiento: 14000 Dev-test: 3000 Test: 3000


In [15]:
# 7) Entrenamiento del clasificador Naive Bayes
classifier = NaiveBayesClassifier.train(train_set)

In [16]:
# 8) Evaluación en dev-test
dev_acc = accuracy(classifier, dev_set)
print(f"Dev-Test Accuracy: {dev_acc:.4f}")

Dev-Test Accuracy: 0.6557


In [17]:
# 9) Features más informativos
classifier.show_most_informative_features(10)

Most Informative Features
          contains(fuck) = True            toxic : non_to =    259.9 : 1.0
         contains(bitch) = True            toxic : non_to =    243.0 : 1.0
       contains(fucking) = True            toxic : non_to =    151.5 : 1.0
          contains(suck) = True            toxic : non_to =    124.2 : 1.0
          contains(cock) = True            toxic : non_to =    118.6 : 1.0
       contains(asshole) = True            toxic : non_to =    115.6 : 1.0
        contains(faggot) = True            toxic : non_to =    114.7 : 1.0
          contains(cunt) = True            toxic : non_to =     87.5 : 1.0
      contains(retarded) = True            toxic : non_to =     87.5 : 1.0
           contains(ass) = True            toxic : non_to =     55.4 : 1.0


In [18]:
# 10) Análisis de errores
errors = []
for feats, label in dev_set:
    guess = classifier.classify(feats)
    if guess != label:
        errors.append((label, guess, feats))

print(f"Número de errores en dev-test: {len(errors)}")
# Mostrar algunos ejemplos
errors[:5]

Número de errores en dev-test: 1033


[('non_toxic',
  'toxic',
  {'contains(the)': True,
   'contains(to)': False,
   'contains(of)': False,
   'contains(a)': False,
   'contains(and)': False,
   'contains(you)': False,
   'contains(i)': False,
   'contains(is)': True,
   'contains(that)': False,
   'contains(in)': False,
   'contains(it)': False,
   'contains(for)': False,
   'contains(not)': False,
   'contains(this)': False,
   'contains(on)': False,
   'contains(be)': False,
   'contains(as)': False,
   'contains(have)': False,
   'contains(are)': False,
   'contains(your)': False,
   'contains(if)': False,
   'contains(with)': False,
   'contains(article)': False,
   'contains(was)': False,
   'contains(or)': False,
   'contains(but)': False,
   'contains(page)': False,
   'contains(my)': False,
   'contains(an)': False,
   'contains(do)': False,
   'contains(by)': False,
   'contains(from)': False,
   'contains(at)': False,
   'contains(me)': False,
   'contains(about)': False,
   'contains(can)': False,
   'contain

In [19]:
#Esto no aparece en el capitulo
# 11) Experimento: distintos tamaños de vocabulario
for size in [1000, 2000, 5000]:
    word_feats = list(FreqDist(w for tokens in df['tokens'] for w in tokens))[:size]
    def feats(tokens): return {f'contains({w})': (w in set(tokens)) for w in word_feats}
    fs = [(feats(t), l) for t,l in zip(df['tokens'], df['label'])]
    random.shuffle(fs)
    n = len(fs)
    train, dev = fs[:int(0.7*n)], fs[int(0.7*n):int(0.85*n)]
    acc = accuracy(NaiveBayesClassifier.train(train), dev)
    print(f"Vocab size {size}: Dev accuracy = {acc:.4f}")

Vocab size 1000: Dev accuracy = 0.6463
Vocab size 2000: Dev accuracy = 0.6387
Vocab size 5000: Dev accuracy = 0.6400


In [22]:
# Función mejorada con sufijos y bigramas
from nltk import bigrams

# Función mejorada con sufijos y bigramas
def rich_features(tokens):
    tokens_set = set(tokens)
    feats = {f'contains({w})': (w in tokens_set) for w in word_features}
    # sufijos
    feats['suffix1'] = tokens[-1] if tokens else ''
    # suffix2 como concatenación de los últimos dos caracteres
    feats['suffix2'] = ''.join(tokens[-2:]) if len(tokens) >= 2 else ''
    # bigramas frecuentes
    from nltk import bigrams
    for bg in bigrams(tokens):
        feats[f'bigram({bg[0]}_{bg[1]})'] = True
    return feats

# Entrenar y evaluar con rich_features
fs_rich = [(rich_features(t), l) for t,l in zip(df['tokens'], df['label'])]
random.shuffle(fs_rich)
n = len(fs_rich)
train_r, dev_r = fs_rich[:int(0.7*n)], fs_rich[int(0.7*n):int(0.85*n)]
clf_rich = NaiveBayesClassifier.train(train_r)
print("Dev accuracy con rich features:", accuracy(clf_rich, dev_r))

Dev accuracy con rich features: 0.6773333333333333


Section 3

In [25]:
# 1) Imports y configuración
import nltk
import pandas as pd
import random
import string
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Asegurar recursos NLTK
nltk.download('punkt')
nltk.download('punkt_tab')

# Cargar y tokenizar (usando preprocesado del capítulo 6)
def load_and_tokenize(path='Dataset/train.csv', sample_size=20000):
    df = pd.read_csv(path)[['comment_text','toxic']].dropna()
    df['label'] = df['toxic'].map({1:'toxic',0:'non_toxic'})
    df = df.sample(sample_size, random_state=42)

    # Preprocesado y tokenización
    def preprocess_and_tokenize(text):
        text = text.lower()
        text = ''.join(ch for ch in text if ch not in string.punctuation)
        return nltk.word_tokenize(text)
    df['tokens'] = df['comment_text'].apply(preprocess_and_tokenize)
    return df.reset_index(drop=True)

# Cargar datos
df = load_and_tokenize()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hugom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [26]:
# 2) Construir featureset y división Training/Dev/Test
all_words = FreqDist(w for tokens in df['tokens'] for w in tokens)
word_features = list(all_words)[:2000]

def doc_feats(tokens):
    s = set(tokens)
    return {f'contains({w})': (w in s) for w in word_features}

# Crear featuresets
features = [(doc_feats(t), l) for t,l in zip(df['tokens'], df['label'])]
random.shuffle(features)

# Divisiones
n = len(features)
train = features[:int(0.7*n)]
dev   = features[int(0.7*n):int(0.85*n)]
test  = features[int(0.85*n):]

print("Tamaños: Train=", len(train), "Dev=", len(dev), "Test=", len(test))

Tamaños: Train= 14000 Dev= 3000 Test= 3000


In [27]:
# 3) Accuracy en Dev
clf = NaiveBayesClassifier.train(train)
accuracy_dev = nltk.classify.accuracy(clf, dev)
print(f"Dev Accuracy: {accuracy_dev:.4f}")

Dev Accuracy: 0.6447


In [28]:
# 4) Precision, Recall y F1 en Test
y_true = [label for (_,label) in test]
y_pred = [clf.classify(feats) for (feats,_) in test]

prfs = precision_recall_fscore_support(y_true, y_pred, labels=['toxic','non_toxic'], average=None)
print("Precision, Recall, F1 por clase:\n", prfs)
print("\nReporte completo:\n", classification_report(y_true, y_pred))

Precision, Recall, F1 por clase:
 (array([0.1822542 , 0.97026872]), array([0.81428571, 0.62389706]), array([0.29784455, 0.75945402]), array([ 280, 2720]))

Reporte completo:
               precision    recall  f1-score   support

   non_toxic       0.97      0.62      0.76      2720
       toxic       0.18      0.81      0.30       280

    accuracy                           0.64      3000
   macro avg       0.58      0.72      0.53      3000
weighted avg       0.90      0.64      0.72      3000



In [32]:
# 6) Cross-Validation con Naive Bayes (manual usando StratifiedKFold)
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Preparar etiquetas y folds
labels = np.array([l for _,l in features])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = []
for train_idx, test_idx in skf.split(features, labels):
    train_fs = [features[i] for i in train_idx]
    test_fs  = [features[i] for i in test_idx]
    clf_cv = NaiveBayesClassifier.train(train_fs)
    acc = nltk.classify.accuracy(clf_cv, test_fs)
    cv_scores.append(acc)

print("Cross-Val Accuracies:", cv_scores)
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Cross-Val Accuracies: [0.6455, 0.6445, 0.64875, 0.65, 0.65525]
Mean CV Accuracy: 0.6488 ± 0.0038
