<a href="https://colab.research.google.com/github/hashirmohammad/JigsawCommentClassification/blob/main/NeuralNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score
# Colab-only: mount Google Drive at /content/drive. The project folder
# (/content/drive/MyDrive/MLProject) that holds the CSV data and the `src`
# package is read from this mount point by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import sys

# Make the project's `src` package importable from Drive. Guarded so that
# re-running this cell does not keep prepending duplicate sys.path entries.
PROJECT_ROOT = "/content/drive/MyDrive/MLProject"
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
from src.preprocess import clean_text

# All data files live under one folder; derive the paths from a single root
# so relocating the project only requires changing PROJECT_ROOT.
DATA_DIR = f"{PROJECT_ROOT}/data"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH = f"{DATA_DIR}/test.csv"
TEST_LABELS_PATH = f"{DATA_DIR}/test_labels.csv"

df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# The six binary target columns; their order fixes the column order of every
# label / prediction matrix built from them.
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']


# Clean train and test text identically. Missing comments are replaced by an
# empty string first so clean_text() and the vectorizer never receive NaN
# (previously only the test split was protected this way).
df["comment_text"] = df["comment_text"].fillna("").apply(clean_text)
test_df["comment_text"] = test_df["comment_text"].fillna("").apply(clean_text)


# Hold out 10% of the training data for validation (fixed seed for a
# repeatable split). y_* are (n_samples, len(labels)) arrays.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["comment_text"],
    df[labels].values,
    test_size=0.1,
    random_state=42,
    shuffle=True
)

# =========================
# TF-IDF Vectorization
# =========================
# Unigrams + bigrams, English stop words removed, vocabulary capped at the
# 50k highest-frequency terms.
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),
    stop_words="english"
)

# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for validation so no validation text leaks into the features.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf   = tfidf.transform(X_val_text)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Project the sparse TF-IDF matrices onto 256 dense components so they can be
# fed directly to a Dense network. The projection is learned from the
# training split and then applied unchanged to the validation split.
svd = TruncatedSVD(random_state=42, n_components=256)

X_train_svd = svd.fit_transform(X_train_tfidf)
X_val_svd = svd.transform(X_val_tfidf)

print(f"Dense shapes: {X_train_svd.shape} {X_val_svd.shape}")

Dense shapes: (143613, 256) (15958, 256)


In [None]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Input width follows the SVD output; output width follows the label matrix,
# so neither has to be edited by hand if n_components or `labels` change.
n_features = X_train_svd.shape[1]
n_labels = y_train.shape[1]

input_layer = tf.keras.Input(shape=(n_features,))

# Two fully-connected ReLU layers, each followed by 30% dropout.
x = tf.keras.layers.Dense(256, activation='relu')(input_layer)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)

# One independent sigmoid per label (multi-label, not softmax): a comment
# can carry several labels at once.
output_layer = tf.keras.layers.Dense(n_labels, activation='sigmoid')(x)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # The bare string 'accuracy' lets Keras infer the metric from tensor
    # shapes; for a multi-column output that can resolve to argmax-style
    # categorical accuracy, which is meaningless for multi-label targets
    # (it produced a val_accuracy frozen at 0.9940). BinaryAccuracy
    # thresholds every label at 0.5 and averages over all label slots.
    # It keeps the name 'accuracy' so history keys ('accuracy',
    # 'val_accuracy') and the two-value result of model.evaluate() are
    # unchanged for downstream cells.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)]
)

model.summary()

In [None]:
# Early stopping: give up after 3 epochs without a better validation loss and
# roll the model back to the weights of its best epoch.
es = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# Training configuration, collected in one place for readability.
fit_kwargs = dict(
    validation_data=(X_val_svd, y_val),
    batch_size=256,
    epochs=10,
    callbacks=[es],
    verbose=1,
)

history = model.fit(x=X_train_svd, y=y_train, **fit_kwargs)


Epoch 1/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.6188 - loss: 0.2061 - val_accuracy: 0.9940 - val_loss: 0.0640
Epoch 2/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9537 - loss: 0.0661 - val_accuracy: 0.9940 - val_loss: 0.0625
Epoch 3/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9835 - loss: 0.0620 - val_accuracy: 0.9940 - val_loss: 0.0609
Epoch 4/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9917 - loss: 0.0607 - val_accuracy: 0.9940 - val_loss: 0.0601
Epoch 5/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9927 - loss: 0.0592 - val_accuracy: 0.9940 - val_loss: 0.0600
Epoch 6/10
[1m561/561[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9935 - loss: 0.0578 - val_accuracy: 0.9940 - val_loss: 0.0596
Epoch 7/10
[1m561/561[0m 

In [None]:
# Per-label sigmoid probabilities for every validation comment.
y_pred_proba = model.predict(X_val_svd)

# Hard 0/1 decisions: a label is predicted when its probability exceeds 0.5.
above_threshold = np.greater(y_pred_proba, 0.5)
y_pred = above_threshold.astype(int)


[1m499/499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
# Per-label precision / recall / F1 on the validation split.
# zero_division=0 makes the handling of labels with no predicted positives
# explicit: their precision/F1 are reported as 0.0 (the same value sklearn
# falls back to) without emitting an UndefinedMetricWarning for each average.
print(classification_report(y_val, y_pred, target_names=labels, zero_division=0))

# Threshold-free ranking quality: ROC-AUC is computed per label from the raw
# probabilities and then averaged with equal weight per label (macro).
roc_auc = roc_auc_score(y_val, y_pred_proba, average='macro')
print("ROC-AUC:", roc_auc)


               precision    recall  f1-score   support

        toxic       0.84      0.61      0.71      1480
 severe_toxic       0.58      0.18      0.27       148
      obscene       0.86      0.67      0.75       836
       threat       0.00      0.00      0.00        37
       insult       0.75      0.59      0.66       791
identity_hate       0.56      0.07      0.12       147

    micro avg       0.81      0.57      0.67      3439
    macro avg       0.60      0.35      0.42      3439
 weighted avg       0.79      0.57      0.66      3439
  samples avg       0.05      0.05      0.05      3439

ROC-AUC: 0.9644776404297865


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("\n\n=== Running Test Set Evaluation (using test_labels.csv) ===")

# 1. Load and clean test comments exactly like the training text. Missing
#    comments become "" first so clean_text() and the vectorizer never see NaN.
test_df = pd.read_csv(TEST_PATH)
test_df["comment_text"] = test_df["comment_text"].fillna("").apply(clean_text)

# 2. Transform with the SAME fitted TF-IDF and SVD as training (transform
#    only, never re-fit on test data). The SVD output is already dense.
X_test_tfidf = tfidf.transform(test_df["comment_text"])
X_test_svd = svd.transform(X_test_tfidf)

# 3. Load test labels (Kaggle released AFTER competition)
test_labels = pd.read_csv(TEST_LABELS_PATH)

# Features and labels are matched purely by row position below, so fail
# loudly if the two files cannot be lined up instead of silently scoring
# predictions against the wrong rows.
if len(test_labels) != len(test_df):
    raise ValueError(
        f"test.csv has {len(test_df)} rows but test_labels.csv has "
        f"{len(test_labels)}; cannot align predictions with labels."
    )
if "id" in test_df.columns and "id" in test_labels.columns:
    if not np.array_equal(test_df["id"].to_numpy(), test_labels["id"].to_numpy()):
        raise ValueError("test.csv and test_labels.csv list their ids in a different order.")

# 4. Only keep rows where labels are NOT -1 (rows containing a -1 in any
#    label column are excluded from scoring).
valid_idx = (test_labels[labels] != -1).all(axis=1)
valid_mask = valid_idx.to_numpy()  # plain boolean array for numpy indexing

y_test = test_labels.loc[valid_idx, labels].values
X_test_valid = X_test_svd[valid_mask]

# 5. Predict with the trained neural network; threshold probabilities at 0.5.
y_test_pred_proba = model.predict(X_test_valid)
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

# 6. Print classification report. zero_division=0 reports undefined
#    precision/F1 as 0.0 explicitly instead of emitting warnings.
print("\n=== Neural Network Test Set Report ===")
print(classification_report(y_test, y_test_pred, target_names=labels, zero_division=0))

# 7. Macro ROC-AUC from the raw probabilities (equal weight per label).
test_auc = roc_auc_score(y_test, y_test_pred_proba, average="macro")
print("Test ROC-AUC:", test_auc)



=== Running Test Set Evaluation (using test_labels.csv) ===
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step

=== Neural Network Test Set Report ===
               precision    recall  f1-score   support

        toxic       0.58      0.69      0.63      6090
 severe_toxic       0.42      0.23      0.30       367
      obscene       0.63      0.68      0.65      3691
       threat       0.50      0.00      0.01       211
       insult       0.56      0.62      0.59      3427
identity_hate       0.64      0.21      0.31       712

    micro avg       0.59      0.63      0.61     14498
    macro avg       0.56      0.41      0.42     14498
 weighted avg       0.59      0.63      0.59     14498
  samples avg       0.06      0.06      0.05     14498

Test ROC-AUC: 0.9571878079791687


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Inspect the raw validation probabilities returned by model.predict():
# one row per validation comment, one column per entry of `labels`
# (same column order as the label matrix the model was trained on).
y_pred_proba

array([[3.70445162e-01, 9.22971498e-03, 6.68706223e-02, 1.99421067e-02,
        1.14185125e-01, 4.64770570e-02],
       [5.20100701e-04, 7.67859518e-08, 2.61410860e-05, 1.38432497e-07,
        3.79707817e-05, 5.81004258e-07],
       [6.84661925e-01, 4.38422002e-02, 3.74981791e-01, 1.35037117e-02,
        4.26305085e-01, 5.96715435e-02],
       ...,
       [8.28540511e-03, 1.23787295e-05, 1.06226176e-03, 1.34217535e-05,
        1.29464059e-03, 7.11028842e-05],
       [7.65368203e-03, 1.56490823e-05, 8.32154357e-04, 3.24463617e-05,
        1.07869599e-03, 1.19676610e-04],
       [7.97748387e-01, 9.59398225e-02, 4.02463347e-01, 9.79286581e-02,
        5.15177727e-01, 2.24828660e-01]], dtype=float32)

In [None]:
# Loss and the compiled 'accuracy' metric as Keras reports them. What that
# metric measures depends on how it was specified in model.compile(), so it
# is printed under a neutral label rather than being called per-label.
val_loss, val_accuracy = model.evaluate(X_val_svd, y_val, verbose=0)

# Per-label accuracy computed explicitly: threshold every probability at 0.5
# and take the fraction of (comment, label) slots predicted correctly. This
# is the quantity the "per-label average" line claims to report.
val_hard_preds = (model.predict(X_val_svd, verbose=0) > 0.5).astype(int)
val_label_accuracy = float(np.mean(val_hard_preds == y_val))

print("Validation Loss:", val_loss)
print("Validation Accuracy (Keras 'accuracy' metric):", val_accuracy)
print("Validation Accuracy (per-label average):", val_label_accuracy)


Validation Loss: 0.0586216039955616
Validation Accuracy (per-label average): 0.9939842224121094
