## Importing Libraries and Loading Data

In [None]:
from urllib import request
import os
import csv

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
from datasets import load_dataset

# Load the dataset and convert the 'train' and 'valid' splits to DataFrames.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed_augmented")
train_df, validation_df = (
    dataset[split].to_pandas() for split in ('train', 'valid')
)

In [None]:
# Show the training DataFrame as the cell output.
train_df

In [None]:
# Keep only the text and label columns, then show the trimmed frame.
train_df = train_df.loc[:, ['text', 'label']]
train_df

In [None]:
# Keep only the text and label columns of the validation split.
validation_df = validation_df.loc[:, ['text', 'label']]

## Using TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only; the validation texts
# are transformed with that same fitted vectorizer further down.
vectorizer = TfidfVectorizer()

# toarray() yields the dense ndarray directly, avoiding the extra full-size
# copy that converting through np.matrix (todense() + np.array) would make.
vec = vectorizer.fit_transform(train_df['text']).toarray()

X_train = vec
y_train = train_df['label']

vec = vectorizer.transform(validation_df['text']).toarray()

X_test = vec
y_test = validation_df['label']

In [None]:
# Sanity-check the feature/label dimensions, one shape per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

## Using Different Scikit-Learn Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Fit multinomial naive Bayes on the current training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_preds))
print("F1 score:", f1_score(y_true=y_test, y_pred=y_preds, average="binary"))
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier (fit() returns the estimator).
classifier = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")
print("F1 score:", f1)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest (fit() returns the estimator).
classifier = RandomForestClassifier().fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")
print("F1 score:", f1)
print(classification_report(y_true=y_test, y_pred=y_pred))

## Using Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Fit a default CountVectorizer on the training texts only; the validation
# texts are transformed with that same fitted vectorizer further down.
vectorizer = CountVectorizer()

# toarray() yields the dense ndarray directly, avoiding the extra full-size
# copy that converting through np.matrix (todense() + np.array) would make.
vec = vectorizer.fit_transform(train_df['text']).toarray()

X_train = vec
y_train = train_df['label']

vec = vectorizer.transform(validation_df['text']).toarray()

X_test = vec
y_test = validation_df['label']

In [None]:
# Sanity-check the feature/label dimensions, one shape per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ngram_range=(2, 2) restricts the features to bigrams. The vectorizer is fit
# on the training texts only and reused for the validation texts.
vectorizer = CountVectorizer(ngram_range=(2, 2))

# toarray() yields the dense ndarray directly, avoiding the extra full-size
# copy that converting through np.matrix (todense() + np.array) would make.
vec = vectorizer.fit_transform(train_df['text']).toarray()

X_train = vec
y_train = train_df['label']

vec = vectorizer.transform(validation_df['text']).toarray()

X_test = vec
y_test = validation_df['label']

In [None]:
# Sanity-check the feature/label dimensions, one shape per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Print the first 100 feature names of the current vectorizer, one per line.
lis = list(vectorizer.get_feature_names_out())
for feature_name in lis[:100]:
    print(feature_name)

In [None]:
# Print the first 100 feature names of the current vectorizer, one per line.
lis = list(vectorizer.get_feature_names_out())
for feature_name in lis[:100]:
    print(feature_name)

## Using Different Scikit-Learn Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Fit a default logistic-regression classifier (fit() returns the estimator).
classifier = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")
print("F1 score:", f1)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Positions where the true label is 0 but the classifier predicted 1.
false_positives = np.where((y_test == 0) & (y_pred == 1))[0]

In [None]:
# `false_positives` holds positions within the validation predictions
# (y_pred / y_test), so the matching texts must come from validation_df,
# not train_df. `.iloc` makes the lookup explicitly positional.
false_positive_texts = validation_df['text'].iloc[false_positives]

In [None]:
# Show the first ten false-positive texts, one per line.
for sample_text in false_positive_texts.iloc[:10]:
    print(sample_text)

In [None]:
# Count how often each vectorizer feature occurs across the false positives.
# NOTE(review): the names say "unigrams", but the features are whatever the
# current `vectorizer` emits — confirm which vectorizer is active here.
fp_vectorized = vectorizer.transform(false_positive_texts)
fp_sum = np.asarray(fp_vectorized.sum(axis=0)).ravel()

feature_names = vectorizer.get_feature_names_out()
unigrams_scores = dict(zip(feature_names, fp_sum))

# Most frequent features first.
sorted_unigrams = sorted(
    unigrams_scores.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_n = 50
print(sorted_unigrams[:top_n])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Fit a default logistic-regression classifier (fit() returns the estimator).
classifier = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")
print("F1 score:", f1)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest (fit() returns the estimator).
classifier = RandomForestClassifier().fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="binary")
print("F1 score:", f1)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Fit multinomial naive Bayes on the current training features.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_preds))
print("F1 score:", f1_score(y_true=y_test, y_pred=y_preds, average="binary"))
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Fit a support-vector classifier with default settings.
model = SVC()
model.fit(X_train, y_train)

# Predict on the held-out features and report the metrics.
y_preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_preds))
print("F1 score:", f1_score(y_true=y_test, y_pred=y_preds, average="binary"))
print(classification_report(y_true=y_test, y_pred=y_preds))

## Using RoBERTa embeddings to train SVM

In [None]:
from datasets import load_dataset

# Reload the dataset so both frames carry their full set of columns again.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed_augmented")
train_df, validation_df = (
    dataset[split].to_pandas() for split in ('train', 'valid')
)

In [None]:
# torch must be imported here: this cell runs before the cell that imports it,
# so a top-to-bottom run would otherwise fail with NameError.
import torch

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
from transformers import RobertaTokenizer
import torch
from transformers import RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenise every training text in a single call. The tokenizer is called
# directly because `batch_encode_plus` is deprecated in favour of `__call__`;
# for a list of strings the two produce the same encoding.
tokenized_data = tokenizer(
    train_df['text'].tolist(),
    max_length=512,  # Max length for RoBERTa base
    padding='max_length',  # every row is padded out to max_length
    truncation=True,  # longer texts are cut down to max_length
    return_tensors="pt",
)

# Separate inputs for the model
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

In [None]:
# Display the shape of the attention-mask tensor as the cell output.
attention_mask.shape

In [None]:
# Display the shape of the token-id tensor as the cell output.
input_ids.shape

In [None]:
# Load the pretrained encoder and switch it to evaluation mode straight away
# (eval() returns the module itself).
model = RobertaModel.from_pretrained('roberta-base').eval()

# Move the weights to the selected device; the returned module is the
# cell output.
model.to(device)

In [None]:
# Display the model's `device` attribute as the cell output.
model.device

In [None]:
# Move tensors to the same device as the model
# Put both input tensors on the device the model reports.
input_ids, attention_mask = (
    tensor.to(model.device) for tensor in (input_ids, attention_mask)
)

In [None]:
def get_embeddings(model, input_ids, attention_mask, batch_size=16):
    """Return one embedding vector per input row, computed in mini-batches.

    Each batch is run through ``model`` with gradient tracking disabled, and
    the last hidden state at sequence position 0 is kept for every row. For
    RoBERTa that position holds the ``<s>`` start token, the counterpart of
    BERT's ``[CLS]``.

    Args:
        model: Encoder exposing ``eval()``, ``device`` and
            ``config.hidden_size``; calling
            ``model(input_ids, attention_mask=...)`` must return an object
            with a ``last_hidden_state`` tensor.
        input_ids: Tensor of token ids, one row per sequence.
        attention_mask: Tensor with one mask row per row of ``input_ids``.
        batch_size: Number of rows per forward pass; must be positive.

    Returns:
        A NumPy array with ``len(input_ids)`` rows; an empty input yields an
        array of shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    model.eval()  # Ensure the model is in evaluation mode

    # Use the model's own device rather than a module-level global, so the
    # function works no matter which cells have been executed.
    target_device = model.device

    # Collect per-batch results on the CPU and concatenate once at the end;
    # this avoids re-copying the growing result on every iteration and keeps
    # accelerator memory free for the forward passes.
    batch_chunks = []
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            stop = start + batch_size
            batch_outputs = model(
                input_ids[start:stop].to(target_device),
                attention_mask=attention_mask[start:stop].to(target_device),
            )
            # Position 0 of the last hidden state is the sequence-level vector.
            batch_chunks.append(batch_outputs.last_hidden_state[:, 0, :].cpu())

    if not batch_chunks:
        return torch.empty((0, model.config.hidden_size)).numpy()
    return torch.cat(batch_chunks, dim=0).numpy()

In [None]:
# Embed the tensors currently bound to input_ids / attention_mask, 16 rows per
# forward pass.
embeddings = get_embeddings(model, input_ids, attention_mask, batch_size=16)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train a linear-kernel SVM on the embeddings; fit() returns the estimator,
# so `clf` is the fitted model and is shown as the cell output.
clf = SVC(kernel='linear').fit(embeddings, train_df['label'])
clf

In [None]:
# Tokenise the validation texts and embed them with the same encoder.
from transformers import RobertaTokenizer
import torch
from transformers import RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# The tokenizer is called directly because `batch_encode_plus` is deprecated
# in favour of `__call__`; for a list of strings the two produce the same
# encoding.
tokenized_data = tokenizer(
    validation_df['text'].tolist(),
    max_length=512,  # Max length for RoBERTa base
    padding='max_length',  # every row is padded out to max_length
    truncation=True,  # longer texts are cut down to max_length
    return_tensors="pt",
)

# Separate inputs for the model and move them to the model's device
input_ids_test = tokenized_data['input_ids']
attention_mask_test = tokenized_data['attention_mask']
input_ids_test = input_ids_test.to(model.device)
attention_mask_test = attention_mask_test.to(model.device)

# NOTE: this rebinds `embeddings` to the validation embeddings, replacing
# whatever the name held before.
embeddings = get_embeddings(model, input_ids_test, attention_mask_test, batch_size=16)

In [None]:
# Classify whatever `embeddings` currently holds with the fitted SVM.
from sklearn.metrics import accuracy_score, f1_score
y_pred = clf.predict(embeddings)

# Compare the predictions with the validation labels.
print("Accuracy:", accuracy_score(y_true=validation_df['label'], y_pred=y_pred))
print("F1 score:", f1_score(y_true=validation_df['label'], y_pred=y_pred))