In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import string
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler # Import TensorDataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import emoji
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK resources this notebook relies on: the stop-word list,
# the Punkt tokenizer models and WordNet.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)


class Preprocess:
    """Collection of small text-cleaning helpers.

    Every public method takes one string and returns a new string (except
    ``get_wordnet_pos``, which maps a POS tag to a WordNet constant).  The
    instance holds no per-object state; lookup tables and compiled patterns
    live on the class so they are built once instead of on every call.
    """

    # Stop words that are deliberately kept in the text.
    _KEPT_STOP_WORDS = frozenset({"never", "not", "nor"})
    # Filled lazily on first use so that defining the class does not touch NLTK.
    _stop_words = None

    # Keys are lower-case because matches are lower-cased before the lookup.
    _CONTRACTIONS = {
        "didn't": "did not", "doesn't": "does not", "don't": "do not", "aren't": "are not",
        "can't": "cannot", "couldn't": "could not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he's": "he is", "i'm": "I am", "it's": "it is", "let's": "let us",
        "mightn't": "might not", "mustn't": "must not", "shan't": "shall not", "she's": "she is",
        "shouldn't": "should not", "that's": "that is", "there's": "there is", "they're": "they are",
        "we're": "we are", "weren't": "were not", "who's": "who is", "won't": "will not",
        "wouldn't": "would not", "you're": "you are", "you've": "you have", "iam": "i am",
    }
    _ABBREVIATIONS = {
        "btw": "by the way", "lol": "laughing out loud", "idk": "I don't know", "omg": "oh my god",
        "brb": "be right back", "imo": "in my opinion", "smh": "shaking my head", "tbh": "to be honest",
    }

    # \b on both sides: only whole words are rewritten, so "iam" no longer
    # fires inside "William" and "lol" no longer fires inside "lollipop".
    _CONTRACTIONS_RE = re.compile(
        r'\b(?:{})\b'.format('|'.join(map(re.escape, _CONTRACTIONS))),
        flags=re.IGNORECASE,
    )
    _ABBREVIATIONS_RE = re.compile(
        r'\b(?:{})\b'.format('|'.join(map(re.escape, _ABBREVIATIONS))),
        flags=re.IGNORECASE,
    )

    def remove_non_letters(self, text: str) -> str:
        """Replace every character that is not an ASCII letter with a space."""
        return re.sub(r'[^a-zA-Z]', ' ', text)

    def remove_stop_words(self, text: str) -> str:
        """Drop English stop words (case-insensitive), keeping "never", "not" and "nor".

        The text is split with ``word_tokenize`` and the surviving tokens are
        re-joined with single spaces.
        """
        if Preprocess._stop_words is None:
            Preprocess._stop_words = (
                frozenset(stopwords.words('english')) - Preprocess._KEPT_STOP_WORDS
            )
        kept = [
            token for token in word_tokenize(text)
            if token.lower() not in Preprocess._stop_words
        ]
        return ' '.join(kept)

    def normalize_words(self, text: str) -> str:
        """Return ``text`` in lower case."""
        return text.lower()

    def remove_short_words(self, text: str, min_length: int = 2) -> str:
        """Keep only whitespace-separated words with at least ``min_length`` characters."""
        return ' '.join(word for word in text.split() if len(word) >= min_length)

    def remove_long_words(self, text: str, max_length: int = 15) -> str:
        """Keep only whitespace-separated words with at most ``max_length`` characters."""
        return ' '.join(word for word in text.split() if len(word) <= max_length)

    def get_wordnet_pos(self, tag: str):
        """Map a POS tag to a WordNet constant by its first letter (J/V/N/R).

        Any other tag, including an empty one, falls back to ``wordnet.NOUN``.
        """
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return by_initial.get(tag[:1], wordnet.NOUN)

    def clean_text(self, text: str) -> str:
        """Lower-case the text, then strip URLs, ``<...>`` tags and punctuation.

        After this call only ASCII letters, digits and whitespace remain.
        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)
        return text

    def remove_twitter_handles(self, text: str) -> str:
        """Delete every ``@`` followed by word characters."""
        return re.sub(r'@\w+', '', text)

    def expand_contractions(self, text: str) -> str:
        """Expand known contractions ("don't" -> "do not"), ignoring case.

        Only whole words are replaced.  The replacement is always the
        dictionary value, so the casing of the match is not preserved.
        """
        def replace(match):
            # Lower-case the match: the table keys are lower-case, which is
            # what makes "I'm" / "i'm" / "I'M" all resolve.
            return self._CONTRACTIONS[match.group(0).lower()]

        return self._CONTRACTIONS_RE.sub(replace, text)

    def remove_numbers(self, text: str) -> str:
        """Delete every run of digits."""
        return re.sub(r'\d+', '', text)

    def remove_emoticons(self, text: str) -> str:
        """Delete emoji via ``emoji.replace_emoji``."""
        return emoji.replace_emoji(text, replace='')

    def expand_abbreviations(self, text: str) -> str:
        """Expand known chat abbreviations ("lol" -> "laughing out loud"), ignoring case.

        Only whole words are replaced.
        """
        def replace_abbreviation(match):
            return self._ABBREVIATIONS[match.group(0).lower()]

        return self._ABBREVIATIONS_RE.sub(replace_abbreviation, text)

    def remove_extra_spaces(self, text: str) -> str:
        """Collapse all whitespace runs to single spaces and trim both ends."""
        return ' '.join(text.split())



# Load the training data; the 'text' column is cleaned into 'cleaned_text'.
data = pd.read_csv('train.csv')

PP = Preprocess()

# Handles and URLs have to be stripped from the RAW text.  Once
# remove_non_letters has run, "@user" is just "user" and "http://t.co/x" is
# "http   t co x", so neither the handle pattern nor clean_text's URL pattern
# can match any more and their fragments would stay in the text.
raw_text = data['text'].apply(PP.remove_twitter_handles)
raw_text = raw_text.str.replace(r'http\S+|www\S+|https\S+', '', case=False, regex=True)

# NOTE(review): expand_contractions, remove_numbers and remove_emoticons still
# run after remove_non_letters has replaced apostrophes, digits and emoji with
# spaces, so they have almost nothing left to match.  Running them before
# remove_non_letters looks like the intent -- confirm and reorder.
_CLEANING_STEPS = (
    PP.remove_non_letters,
    PP.remove_stop_words,
    PP.normalize_words,
    PP.remove_short_words,
    lambda text: PP.remove_long_words(text, 15),
    PP.clean_text,
    PP.expand_contractions,
    PP.remove_numbers,
    PP.remove_stop_words,
    PP.remove_short_words,
    PP.remove_emoticons,
    PP.expand_abbreviations,
    PP.remove_extra_spaces,
)


def _run_cleaning_steps(text):
    """Apply every step of _CLEANING_STEPS to ``text``, in order."""
    for step in _CLEANING_STEPS:
        text = step(text)
    return text


# One pass over the column instead of one pass per step.
data['cleaned_text'] = raw_text.apply(_run_cleaning_steps)


# Tokenizer matching the 'bert-base-uncased' checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LEN = 128  # every sequence is padded or truncated to this many tokens

# Encode the whole column in one batched call.  padding='max_length' replaces
# the deprecated pad_to_max_length flag, and truncation=True makes the cut-off
# at MAX_LEN explicit instead of relying on the tokenizer's fallback.
encoded = tokenizer(
    data['cleaned_text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# One row per example, MAX_LEN columns.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['target'].values)


# Stratified K-fold fine-tuning of BERT.  A fresh pretrained model is loaded
# for every fold, trained for `epochs` epochs and scored on the held-out fold.
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Loop invariants, set once instead of once per fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3
fold_accuracies = []

for fold, (train_idx, test_idx) in enumerate(skf.split(input_ids, labels), start=1):
    print(f"\n===== Fold {fold}/{K} =====")

    train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
    val_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])

    # Training batches are drawn in random order, validation batches in order.
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=16)
    val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=16)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The learning rate decays linearly to zero over all optimizer steps.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss

            total_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()

        # Previously computed and thrown away; report it so training can be followed.
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.4f}")

    # Score the held-out fold.
    model.eval()

    predictions, true_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)

    accuracy = accuracy_score(true_labels, predictions)
    fold_accuracies.append(accuracy)
    print(f'Bert Fold Validation Accuracy: {accuracy:.4f}')

    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))

    conf_matrix = confusion_matrix(true_labels, predictions)
    print("\nConfusion Matrix:")
    print(conf_matrix)

# Cross-validation summary over all folds.
print(f"\nBert mean validation accuracy over {K} folds: "
      f"{np.mean(fold_accuracies):.4f} (std {np.std(fold_accuracies):.4f})")

In [None]:
# Re-score whatever `model` / `val_dataloader` the previous cell left behind.
model.eval()

batch_predictions, batch_labels = [], []

with torch.no_grad():
    for ids, mask, gold in val_dataloader:
        ids, mask, gold = ids.to(device), mask.to(device), gold.to(device)

        logits = model(ids, attention_mask=mask).logits

        # Predicted class = index of the largest logit in each row.
        batch_predictions.append(logits.argmax(dim=1).cpu().numpy())
        batch_labels.append(gold.cpu().numpy())

predictions = np.concatenate(batch_predictions)
true_labels = np.concatenate(batch_labels)

accuracy = accuracy_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print(f'Validation Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(true_labels, predictions))

print("\nConfusion Matrix:")
print(conf_matrix)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score



from transformers import DebertaTokenizer, DebertaForSequenceClassification

# Tokenizer matching the 'microsoft/deberta-base' checkpoint fine-tuned below.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

MAX_LEN = 128  # every sequence is padded or truncated to this many tokens

# Encode the whole column in one batched call.  padding='max_length' replaces
# the deprecated pad_to_max_length flag; truncation=True makes the cut-off at
# MAX_LEN explicit.
encoded = tokenizer(
    data['cleaned_text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['target'].values)

K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# GPU or CPU; loop invariants are set once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3

for train_idx, test_idx in skf.split(input_ids, labels):
    train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
    val_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])

    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=16)
    val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=16)

    # Reload the pretrained weights for EVERY fold.  With a single model
    # shared across folds, fold k's validation rows were training rows in the
    # earlier folds, so the reported accuracy would be inflated by leakage.
    model = DebertaForSequenceClassification.from_pretrained(
        "microsoft/deberta-base",
        num_labels=2
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}")

    # Score the held-out fold.
    model.eval()
    predictions, true_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
        predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

    predictions = np.concatenate(predictions)
    true_labels = np.concatenate(true_labels)
    accuracy = accuracy_score(true_labels, predictions)
    print(f"DeBert Fold Validation Accuracy: {accuracy:.4f}")


In [None]:
# Detailed report for the model / validation loader left over from the
# cross-validation cell above.
model.eval()
predictions, true_labels = [], []

# After the fold loop has finished, `val_dataloader` is the one built in its
# final iteration, so this report describes the LAST fold (index K - 1), not
# the first one.
fold = K - 1

for batch in val_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

predictions = np.concatenate(predictions)
true_labels = np.concatenate(true_labels)
accuracy = accuracy_score(true_labels, predictions)

print(f"DeBERT Fold {fold+1} Validation Accuracy: {accuracy:.4f}")


print(f"Classification Report for Fold {fold+1}:\n")
report = classification_report(true_labels, predictions, target_names=['Class 0', 'Class 1'])
print(report)


cm = confusion_matrix(true_labels, predictions)
print(f"Confusion Matrix for Fold {fold+1}:\n", cm)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup


# Reload the data; in this cell 'cleaned_text' is simply the raw 'text'
# column, i.e. the model is fed uncleaned text.
data = pd.read_csv('train.csv')
data['cleaned_text'] = data['text']

# Tokenizer matching the 'distilbert-base-uncased' checkpoint fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

MAX_LEN = 128  # every sequence is padded or truncated to this many tokens

# Encode the whole column in one batched call.  padding='max_length' replaces
# the deprecated pad_to_max_length flag; truncation=True makes the cut-off at
# MAX_LEN explicit.
encoded = tokenizer(
    data['cleaned_text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['target'].values)

K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Loop invariants, set once instead of once per fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3

for train_idx, test_idx in skf.split(input_ids, labels):
    train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
    val_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])

    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=16)
    val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=16)

    # Reload the pretrained weights for EVERY fold.  With a single model
    # shared across folds, fold k's validation rows were training rows in the
    # earlier folds, so the reported accuracy would be inflated by leakage.
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The learning rate decays linearly to zero over all optimizer steps.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_predictions, train_true_labels = [], []

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Running training accuracy, taken from the logits of the forward
            # pass made in train() mode before this step's weight update.
            train_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
            train_true_labels.append(b_labels.cpu().numpy())

        avg_train_loss = total_loss / len(train_dataloader)

        train_predictions = np.concatenate(train_predictions)
        train_true_labels = np.concatenate(train_true_labels)
        train_accuracy = accuracy_score(train_true_labels, train_predictions)

        print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Score the held-out fold.
    model.eval()
    val_predictions, val_true_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs.logits
        val_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        val_true_labels.append(b_labels.cpu().numpy())

    val_predictions = np.concatenate(val_predictions)
    val_true_labels = np.concatenate(val_true_labels)

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f"Distilbert Fold Validation Accuracy: {val_accuracy:.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from transformers import DebertaTokenizer, DebertaForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup



# Tokenizer matching the 'microsoft/deberta-base' checkpoint fine-tuned below.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

MAX_LEN = 128  # every sequence is padded or truncated to this many tokens

# Encode the whole column in one batched call.  padding='max_length' replaces
# the deprecated pad_to_max_length flag; truncation=True makes the cut-off at
# MAX_LEN explicit.
encoded = tokenizer(
    data['cleaned_text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['target'].values)

K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Loop invariants, set once instead of once per fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3

for train_idx, test_idx in skf.split(input_ids, labels):
    train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
    val_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])

    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=16)
    val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=16)

    # Reload the pretrained weights for EVERY fold.  With a single model
    # shared across folds, fold k's validation rows were training rows in the
    # earlier folds, so the reported accuracy would be inflated by leakage.
    model = DebertaForSequenceClassification.from_pretrained(
        "microsoft/deberta-base",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The learning rate decays linearly to zero over all optimizer steps.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_predictions, train_true_labels = [], []

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Running training accuracy, taken from the logits of the forward
            # pass made in train() mode before this step's weight update.
            train_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
            train_true_labels.append(b_labels.cpu().numpy())

        avg_train_loss = total_loss / len(train_dataloader)

        train_predictions = np.concatenate(train_predictions)
        train_true_labels = np.concatenate(train_true_labels)
        train_accuracy = accuracy_score(train_true_labels, train_predictions)

        print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Score the held-out fold.
    model.eval()
    val_predictions, val_true_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs.logits
        val_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        val_true_labels.append(b_labels.cpu().numpy())

    val_predictions = np.concatenate(val_predictions)
    val_true_labels = np.concatenate(val_true_labels)

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f"Deberta Fold Validation Accuracy: {val_accuracy:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Score the current `model` on the current `val_dataloader`, print the usual
# metrics and draw the confusion matrix as a heatmap.
model.eval()

pred_chunks, label_chunks = [], []

with torch.no_grad():
    for ids, mask, gold in val_dataloader:
        ids, mask, gold = ids.to(device), mask.to(device), gold.to(device)

        logits = model(ids, attention_mask=mask).logits

        # Predicted class = index of the largest logit in each row.
        pred_chunks.append(logits.argmax(dim=1).cpu().numpy())
        label_chunks.append(gold.cpu().numpy())

val_predictions = np.concatenate(pred_chunks)
val_true_labels = np.concatenate(label_chunks)

val_accuracy = accuracy_score(val_true_labels, val_predictions)
print(f"Deberta Fold Validation Accuracy: {val_accuracy:.4f}")

class_names = ['Class 0', 'Class 1']

report = classification_report(val_true_labels, val_predictions, target_names=class_names)
print("Classification Report:\n", report)

conf_matrix = confusion_matrix(val_true_labels, val_predictions)
print("Confusion Matrix:\n", conf_matrix)

# Rows are true labels, columns are predicted labels.
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup



# Tokenizer matching the 'distilbert-base-uncased' checkpoint fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

MAX_LEN = 128  # every sequence is padded or truncated to this many tokens

# Encode the whole column in one batched call.  padding='max_length' replaces
# the deprecated pad_to_max_length flag; truncation=True makes the cut-off at
# MAX_LEN explicit.
encoded = tokenizer(
    data['cleaned_text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['target'].values)

K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Loop invariants, set once instead of once per fold.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 3

for train_idx, test_idx in skf.split(input_ids, labels):
    train_data = TensorDataset(input_ids[train_idx], attention_masks[train_idx], labels[train_idx])
    val_data = TensorDataset(input_ids[test_idx], attention_masks[test_idx], labels[test_idx])

    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=16)
    val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=16)

    # Reload the pretrained weights for EVERY fold.  With a single model
    # shared across folds, fold k's validation rows were training rows in the
    # earlier folds, so the reported accuracy would be inflated by leakage.
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The learning rate decays linearly to zero over all optimizer steps.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_predictions, train_true_labels = [], []

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Running training accuracy, taken from the logits of the forward
            # pass made in train() mode before this step's weight update.
            train_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
            train_true_labels.append(b_labels.cpu().numpy())

        avg_train_loss = total_loss / len(train_dataloader)

        train_predictions = np.concatenate(train_predictions)
        train_true_labels = np.concatenate(train_true_labels)
        train_accuracy = accuracy_score(train_true_labels, train_predictions)

        print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Score the held-out fold.
    model.eval()
    val_predictions, val_true_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs.logits
        val_predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
        val_true_labels.append(b_labels.cpu().numpy())

    val_predictions = np.concatenate(val_predictions)
    val_true_labels = np.concatenate(val_true_labels)

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f"Fold Validation Accuracy: {val_accuracy:.4f}")



# Evaluate the current `model` on the current `val_dataloader` once more,
# this time passing the labels so the model also returns a loss.
model.eval()
val_loss = 0
pred_chunks, label_chunks = [], []

with torch.no_grad():
    for ids, mask, gold in val_dataloader:
        ids, mask, gold = ids.to(device), mask.to(device), gold.to(device)

        result = model(ids, attention_mask=mask, labels=gold)
        val_loss += result.loss.item()

        # Predicted class = index of the largest logit in each row.
        pred_chunks.append(result.logits.argmax(dim=1).cpu().numpy())
        label_chunks.append(gold.cpu().numpy())

# Mean of the per-batch losses.
avg_val_loss = val_loss / len(val_dataloader)
predictions = np.concatenate(pred_chunks)
true_labels = np.concatenate(label_chunks)

accuracy = accuracy_score(true_labels, predictions)
print(f"Fold Validation Accuracy: {accuracy:.4f}")
print(f"Validation Loss: {avg_val_loss:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


# Evaluate the current `model` on the current `val_dataloader`, print the
# metrics, then compare against the last recorded training accuracy.
model.eval()
val_loss = 0
predictions, true_labels = [], []

for batch in val_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

    logits = outputs.logits
    loss = outputs.loss
    val_loss += loss.item()

    predictions.append(torch.argmax(logits, dim=1).cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

# Mean of the per-batch losses.
avg_val_loss = val_loss / len(val_dataloader)
predictions = np.concatenate(predictions)
true_labels = np.concatenate(true_labels)


accuracy = accuracy_score(true_labels, predictions)
print(f"Fold Validation Accuracy: {accuracy:.4f}")
print(f"Validation Loss: {avg_val_loss:.4f}")


print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Class 0', 'Class 1']))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))


# Signed gap on purpose: overfitting means training accuracy is HIGHER than
# validation accuracy.  With abs() a model that scores better on validation
# than on training would also be reported as overfitting.
# `train_accuracy` is whatever the training cell last assigned to it.
train_val_diff = train_accuracy - accuracy
if train_val_diff > 0.1:
    print(f"Potential Overfitting Detected! Train-Validation Accuracy Difference: {train_val_diff:.4f}")
else:
    print("No significant overfitting detected.")
