In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for entry in file_names:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv


In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
import re
from wordcloud import WordCloud, STOPWORDS

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

In [37]:
# Root folder of the competition data; later cells read train.csv from here.
kaggle_input_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge'

files_in_directory = os.listdir(kaggle_input_path)
print(files_in_directory)

# Unpack any zip archives into the working directory; other files are used in place.
for file_name in files_in_directory:
    if file_name.endswith('.zip'):
        zip_file_path = os.path.join(kaggle_input_path, file_name)

        # Extraction target (the writable Kaggle working directory)
        output_dir = '/kaggle/working/'

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
            # Report where each archive member ended up
            for extracted_file in zip_ref.namelist():
                print("Extracted:", os.path.join(output_dir, extracted_file))

['sample_submission.csv', 'test_labels.csv', 'train.csv', 'test.csv']


In [38]:
# Load the labelled training split and preview five randomly chosen rows.
train = pd.read_csv(f"{kaggle_input_path}/train.csv")
train.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
148578,4ef2da8ee996b7d9,"See man you really dont get it, why do you thi...",0,0,0,0,0,0
145885,23bf8eb990db3a85,"""\n\n RADIUS Question \n\nI'm taking a Securit...",0,0,0,0,0,0
7967,153990b9dd2f7ef4,"""\n\nIslamic terrorism\nHi again Sir Nicho. I ...",0,0,0,0,0,0
84563,e23a26e0ca5ba8db,"Rose in Richmond, Indiana\nYour last edit was ...",0,0,0,0,0,0
50431,86d9802508ce35f2,Adil Asrar \nHello and welcome to Wikipedia. ...,0,0,0,0,0,0


In [39]:
# Show the column names of the loaded training frame.
train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [40]:
def clean_text(text):
    """Normalize a raw comment into lowercase, letters-only, single-spaced text.

    Processing order: drop HTML-like tags, drop web links, replace every
    character that is not an ASCII letter or whitespace with a space,
    collapse runs of whitespace, and lowercase the result.

    Args:
        text: The raw comment. Any value that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for missing cells) is
            treated as an empty comment instead of raising ``TypeError``.

    Returns:
        The cleaned string, or ``""`` when the input is not a string or
        nothing survives the cleaning.
    """
    # Missing values arrive as None/NaN when this is mapped over a column;
    # re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    # Remove HTML tags. A space is substituted so that words separated only
    # by a tag (e.g. "line<br>break") are not fused into one token.
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove web links ("https..." is already matched by the "http" branch)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Replace special characters, punctuation marks, and digits with spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Collapse all whitespace runs (including newlines) to single spaces.
    # This also makes a separate "re-space ie/eg" pass unnecessary: such a
    # pass only rewrites whitespace, which is normalized here anyway.
    text = " ".join(text.split())

    return text.lower()

# Sanity-check clean_text on one sample comment that contains newlines,
# apostrophes, hyphens and a "Wikipedia:..." reference with '_' and '#'.
texts = [
    "\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of 'types of accidents'  -I think the references may need tidying so that they are all in the exact same format ie date format etc I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport"
]

# Clean every sample and print the resulting list for visual inspection.
cleaned_texts = [clean_text(text) for text in texts]
print(cleaned_texts)


['more i can t make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no one else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it s listed in the relevant form eg wikipedia good article nominations transport']


In [41]:
# The label columns are the int64-typed columns of the frame, in column order.
target_labels = train.select_dtypes(include='int64').columns.tolist()
target_labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [42]:
# Function to generate word cloud
def generate_wordcloud(text, Title):
    """Draw a word cloud for `text` and show it under the heading `Title`.

    The cloud is rendered at 800x400 on a black background with the
    wordcloud package's default stop words removed, then displayed in a
    10x5 inch figure without axes.
    """
    cloud = WordCloud(
        width=800,
        height=400,
        stopwords=set(STOPWORDS),
        background_color='black',
    ).generate(text)

    _, axis = plt.subplots(figsize=(10, 5))
    axis.imshow(cloud, interpolation='bilinear')
    axis.axis("off")
    axis.set_title(Title)
    plt.show()

In [43]:
# Store a normalized copy of every comment next to the raw text, then preview it.
train['Cleaned_Comments'] = train['comment_text'].map(clean_text)
train['Cleaned_Comments'].head()

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i m se...
2    hey man i m really not trying to edit war it s...
3    more i can t make any real suggestions on impr...
4    you sir are my hero any chance you remember wh...
Name: Cleaned_Comments, dtype: object

In [44]:
# Take the first 15,000 toxic rows and the first 20,000 non-toxic rows, in file order.
Toxic_comment_balanced_1 = train.loc[train['toxic'] == 1].head(15000)
Toxic_comment_balanced_0 = train.loc[train['toxic'] == 0].head(20000)

In [45]:
# Stack the toxic subset on top of the non-toxic subset (rows are not shuffled yet).
Toxic_comment_balanced=pd.concat([Toxic_comment_balanced_1,Toxic_comment_balanced_0])

In [46]:
# Confirm the size of the combined subset as (rows, columns).
Toxic_comment_balanced.shape

(35000, 9)

In [47]:
# Shuffle all rows (frac=1) with a fixed seed for reproducibility, then
# renumber the index from zero so it no longer reflects the original order.
Toxic_comment_balanced_shuffled = (
    Toxic_comment_balanced
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [48]:
# Pull the cleaned text out as a plain Python list and peek at the first five entries.
comments = Toxic_comment_balanced_shuffled['Cleaned_Comments'].tolist()
comments[:5]

['other naturally occurring opioid antagonists from wikipedia the compound akuammine has also been shown in vitro to act as an opioid antagonist',
 'women bus drivers thoughts i don t like them what about you do you think it s right they should drive buses or do you think they should stick to washing up',
 'stop changon it or i am twist off youre littal pee wees fuckan morons',
 'you re a smug disruptive asshole i can t believe you re monitoring my talk and userpage so closely as to be replying minutes after i do',
 'still not blocked hey you american asshole i am stil not blocked please do something and block me and all my socks']

In [49]:
# Hold out 30% of the rows first; the remaining 70% is the training set.
label_matrix = Toxic_comment_balanced_shuffled[target_labels].values
Train_texts, Test_texts, Train_labels, Test_labels = train_test_split(
    comments, label_matrix, test_size=0.3, random_state=50)

# Carve the hold-out into a test part and a validation part (32% of the hold-out).
test_texts, val_texts, test_labels, val_labels = train_test_split(
    Test_texts, Test_labels, test_size=0.32, random_state=23)

# Report the number of texts and the label-matrix shape of each split.
for split_name, split_texts, split_labels in (
    ('Training Dataset -->', Train_texts, Train_labels),
    ('Testing Dataset -->', test_texts, test_labels),
    ('validation Dataset -->', val_texts, val_labels),
):
    print(split_name, len(split_texts), split_labels.shape)

Training Dataset --> 24500 (24500, 6)
Testing Dataset --> 7140 (7140, 6)
validation Dataset --> 3360 (3360, 6)


In [50]:
def tokenize_and_encode(tokenizer, comments, labels, max_length=128):
    """Encode comments as fixed-length token tensors and labels as float32.

    Args:
        tokenizer: A Hugging Face tokenizer; it is called once on the whole
            list of comments.
        comments: Sequence of comment strings.
        labels: Array-like holding one row of labels per comment.
        max_length: Token length of every encoded row. Shorter comments are
            padded up to it and longer ones are truncated down to it.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)``. The first two are
        the tokenizer's ``input_ids`` and ``attention_mask`` tensors (one row
        per comment); ``labels`` is a ``torch.float32`` tensor.

    Raises:
        ValueError: If ``comments`` and ``labels`` differ in length.
    """
    comments = list(comments)
    if len(comments) != len(labels):
        raise ValueError(
            f"comments and labels must have the same length, "
            f"got {len(comments)} and {len(labels)}"
        )

    # One batched call replaces the per-comment encode + torch.cat loop.
    # `padding='max_length'` is the supported spelling of the deprecated
    # `pad_to_max_length=True`, and `truncation=True` makes the cut-off at
    # `max_length` explicit instead of relying on a warned-about default.
    encoded = tokenizer(
        comments,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # as_tensor accepts arrays, lists and tensors alike, so calling this
    # function again on labels that are already tensors does not warn.
    labels = torch.as_tensor(labels, dtype=torch.float32)

    return input_ids, attention_masks, labels
        

In [51]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
checkpoint_name = 'bert-base-uncased'

# Tokenizer initialization (inputs are lower-cased to match the uncased checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint_name, do_lower_case=True)

# Model initialization with a 6-output classification head
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [53]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Place the model's parameters on the chosen device.
model = model.to(device)

cuda


In [54]:
# Encode all three splits with the same tokenizer and default max_length.
# Note: test_labels / val_labels are rebound here from the label arrays
# produced by the split to the tensors returned by tokenize_and_encode.

# Training split
input_ids, attention_masks, labels = tokenize_and_encode(
    tokenizer, comments=Train_texts, labels=Train_labels)

# Test split
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
    tokenizer, comments=test_texts, labels=test_labels)

# Validation split
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(
    tokenizer, comments=val_texts, labels=val_labels)

# Summarize the encoded training split
print('Training Comments :',len(Train_texts))
print('Input Ids         :',input_ids.shape)
print('Attention Mask    :',attention_masks.shape)
print('Labels            :',labels.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Comments : 24500
Input Ids         : torch.Size([24500, 128])
Attention Mask    : torch.Size([24500, 128])
Labels            : torch.Size([24500, 6])


In [55]:
# Inspect one training example end to end: text, token ids, decoded ids,
# attention mask and labels.
k = 523
for caption, value in (
    ('Training Comments -->>', Train_texts[k]),
    ('\nInput Ids -->>\n', input_ids[k]),
    ('\nDecoded Ids -->>\n', tokenizer.decode(input_ids[k])),
    ('\nAttention Mask -->>\n', attention_masks[k]),
    ('\nLabels -->>', labels[k]),
):
    print(caption, value)

Training Comments -->> please do not create articles on your talk page it creates a massive problem with your talk page history and the article s history and creates incorrect links because of the submission templates you have a sub page already made

Input Ids -->>
 tensor([  101,  3531,  2079,  2025,  3443,  4790,  2006,  2115,  2831,  3931,
         2009,  9005,  1037,  5294,  3291,  2007,  2115,  2831,  3931,  2381,
         1998,  1996,  3720,  1055,  2381,  1998,  9005, 16542,  6971,  2138,
         1997,  1996, 12339, 23561,  2015,  2017,  2031,  1037,  4942,  3931,
         2525,  2081,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [56]:
# Number of examples per batch, shared by all three loaders
batch_size = 32

# Bundle each split's ids, masks and labels into an indexable dataset
train_dataset = TensorDataset(input_ids, attention_masks, labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Only the training loader shuffles; test and validation keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [57]:
# Draw one batch from the training loader and show its first example.
print('Batch Size :', train_loader.batch_size)
Batch = next(iter(train_loader))
batch_ids, batch_masks, batch_labels = Batch
print('Each Input ids shape :', batch_ids.shape)
print('Input ids :\n', batch_ids[0])
print('Corresponding Decoded text:\n', tokenizer.decode(batch_ids[0]))
print('Corresponding Attention Mask :\n', batch_masks[0])
print('Corresponding Label:', batch_labels[0])

Batch Size : 32
Each Input ids shape : torch.Size([32, 128])
Input ids :
 tensor([  101,  2009,  2987,  1056,  3043,  2129,  2116,  1997,  2068,  2203,
         1999, 21318,  2015,  2672,  2009,  1055,  2025,  2130, 10333,  1045,
         2123,  1056,  2113,  3198,  6316,  1055,  3611,  2021, 21766,  2094,
         4371,  3726, 10841,  2483,  2003,  1037,  2171,  2009,  4152,  4978,
         1998,  2045,  2024, 15665,  1997,  2111,  2041,  2045,  2007,  2008,
         2171,  3398, 21766,  2094,  4371,  7903,  4173,  2003,  2036,  1037,
         2171,  1998,  2009,  1055,  2055,  2062,  2691,  2084, 21766,  2094,
         4371,  3726, 10841,  2483,  2021,  6316, 21766,  2094,  4371,  7903,
         4173,  1998, 10852, 21766,  2094,  4371,  7903,  4173,  2131,  5717,
         4978,  2008,  1055,  1996,  2391,  6316, 21766,  2094,  4371,  3726,
        25393,  4152,  2718,  1998, 10852, 21766,  2094,  4371,  3726, 25393,
         4152,  5717,  6316, 21766,  2094,  4371,  3726, 10841,  248

In [58]:
# Optimizer setup
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of it. eps and weight_decay are set explicitly to the defaults of
# the transformers class (1e-6 and 0.0), because torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [59]:
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs):
    """Train `model` for `num_epochs` epochs and print mean losses per epoch.

    Every epoch performs one optimisation pass over `train_loader`, followed
    by a gradient-free pass over `val_loader`, and then prints the mean
    per-batch training and validation loss. Each batch must unpack into
    ``(input_ids, attention_mask, labels)``; the model is expected to return
    an object exposing ``.loss`` when called with labels.

    The model is updated in place and nothing is returned.
    """
    def _on_device(batch):
        # Move every tensor of the batch to the target device.
        return [tensor.to(device) for tensor in batch]

    for epoch_number in range(1, num_epochs + 1):
        # --- optimisation pass ---
        model.train()
        running_train_loss = 0
        for raw_batch in train_loader:
            ids, mask, targets = _on_device(raw_batch)

            optimizer.zero_grad()
            batch_loss = model(ids, attention_mask=mask, labels=targets).loss
            batch_loss.backward()
            optimizer.step()

            running_train_loss += batch_loss.item()

        # --- validation pass (no gradients, evaluation mode) ---
        model.eval()
        running_val_loss = 0
        with torch.no_grad():
            for raw_batch in val_loader:
                ids, mask, targets = _on_device(raw_batch)
                running_val_loss += model(ids, attention_mask=mask, labels=targets).loss.item()

        # Report the mean per-batch losses of this epoch
        print(f'Epoch {epoch_number}, Training Loss: {running_train_loss/len(train_loader)}, Validation Loss: {running_val_loss/len(val_loader)}')

# Fine-tune for two epochs with the model, loaders, optimizer and device
# created in the cells above.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=2,
)

Epoch 1, Training Loss: 0.1754996707473271, Validation Loss: 0.1381606556829952
Epoch 2, Training Loss: 0.11272986035784294, Validation Loss: 0.1300277543209848


In [60]:
#Evaluate the Model
def evaluate_model(model, test_loader, device, threshold=0.5):
    """Print accuracy, micro-precision and micro-recall of `model` on `test_loader`.

    Args:
        model: Classifier whose output exposes ``.logits``; it is switched to
            evaluation mode.
        test_loader: Iterable of batches that unpack into
            ``(input_ids, attention_mask, labels)``.
        device: Device the inputs are moved to before the forward pass.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        None. The three metrics are printed with four decimals.
    """
    model.eval()  # Set the model to evaluation mode

    true_labels = []
    predicted_probs = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            # Only the model inputs need to live on the device; the labels
            # are consumed as NumPy arrays on the host.
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Independent per-label probabilities (sigmoid, not softmax)
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_probs.append(torch.sigmoid(outputs.logits).cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    # Combine the per-batch arrays into one array per quantity
    true_labels = np.concatenate(true_labels, axis=0)
    predicted_probs = np.concatenate(predicted_probs, axis=0)
    predicted_labels = (predicted_probs > threshold).astype(int)

    # For multilabel input, accuracy_score counts a sample as correct only
    # when every one of its labels matches; precision and recall are
    # micro-averaged over all label decisions.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='micro')
    recall = recall_score(true_labels, predicted_labels, average='micro')

    # Print the evaluation metrics
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')

# Score the fine-tuned model on the held-out test split.
evaluate_model(model=model, test_loader=test_loader, device=device)

Accuracy: 0.7430
Precision: 0.8013
Recall: 0.8869


In [61]:
# Save the tokenizer and model in the same directory so both can be reloaded
# from a single path. The last call's return value (the tokenizer file paths)
# is echoed as the cell output.
output_dir = "/kaggle/working/Saved_model"
model.save_pretrained(output_dir)  # Save model's state dictionary and configuration
tokenizer.save_pretrained(output_dir)  # Save tokenizer's configuration and vocabulary

('/kaggle/working/Saved_model/tokenizer_config.json',
 '/kaggle/working/Saved_model/special_tokens_map.json',
 '/kaggle/working/Saved_model/vocab.txt',
 '/kaggle/working/Saved_model/added_tokens.json')