In [10]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import re
import nltk
import time
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv("/content/dp_best.csv",header=None)

In [4]:
df.head()

Unnamed: 0,0,1
0,No thanks! I don't like deals,Shaming
1,"No, I'll rather pay full price.",Shaming
2,I don't like discounts,Shaming
3,"No, thanks. I don't like great deals.",Shaming
4,"No Thanks, I rather pay full price",Shaming


In [5]:
df[1].unique()

array(['Shaming', 'False Urgency', 'Nagging', 'Subscription Trap',
       'Basket Sneaking', 'Not Dark Pattern'], dtype=object)

In [6]:
# changing to lowercase
df[0] = df[0].str.lower()

# removing urls
df[0] = df[0].str.replace('http\S+|www.\S+', '', case=False)

# removing new lines "\n"
df[0] = df[0].str.replace('\n',' ', regex=True)

# removing all the punctuations
df[0] = df[0].str.replace('[^\w\s]',' ')

# removing integers
df[0] = df[0].str.replace('\d','', regex=True)

# removing emojis
df[0] = df[0].str.replace('[^\w\s#@/:%.,_-]', ' ', flags=re.UNICODE)

In [8]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

df[0] = df[0].apply(lambda text: cleaning_stopwords(text))

df[0].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0               thanks like deals
1           rather pay full price
2                  like discounts
3         thanks like great deals
4    thanks rather pay full price
Name: 0, dtype: object

In [9]:
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return lemmatizer.lemmatize(text)

df[0] = df[0].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
sentiment_mapping = {
    'Shaming': 0,
    'False Urgency': 1,
    'Nagging': 2,
    'Subscription Trap': 3,
    'Basket Sneaking': 4,
    'Not Dark Pattern':5
}
df[1] = df[1].map(sentiment_mapping)

In [12]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)


# Custom BERT model without additional embeddings
class CustomBERT(nn.Module):
    def __init__(self, bert_model):
        super(CustomBERT, self).__init__()
        self.bert = bert_model

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Process further or return the outputs

        return outputs

custom_bert= CustomBERT(bert_model)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df[0], df[1], test_size=0.2, random_state=42)

In [14]:
X_train = X_train.astype(str).tolist()
X_test = X_test.astype(str).tolist()

In [15]:
y_train=torch.tensor(y_train.values)
y_test=torch.tensor(y_test.values)

In [16]:
tokenized_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
tokenized_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

# Get the token IDs and attention masks
X_train_ids = tokenized_train['input_ids']
X_train_masks = tokenized_train['attention_mask']
X_test_ids = tokenized_test['input_ids']
X_test_masks = tokenized_test['attention_mask']


# Define the datasets
train_dataset = torch.utils.data.TensorDataset(X_train_ids, X_train_masks, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test_ids, X_test_masks, y_test)

In [17]:
# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=2, shuffle=False)

In [18]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(custom_bert.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

custom_bert.train()
for epoch in range(5):
    running_loss = 0.0
    total_batches = len(train_loader)
    for batch_idx, batch in enumerate(train_loader):
        inputs, masks, labels = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = custom_bert(inputs, attention_mask=masks)
        logits = outputs.last_hidden_state[:, 0, :]

        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    average_loss = running_loss / total_batches
    print(f"Epoch {epoch + 1}, Loss: {average_loss}")

# Evaluation on test set
custom_bert.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        inputs, masks, labels = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        labels = labels.to(device)

        outputs = custom_bert(inputs, attention_mask=masks)
        logits = outputs.last_hidden_state[:, 0, :]
        _, predicted = torch.max(logits, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {accuracy}")


Epoch 1, Loss: 0.8009604238580184
Epoch 2, Loss: 0.2933027506338333
Epoch 3, Loss: 0.21514653667162498
Epoch 4, Loss: 0.15539643944976658
Epoch 5, Loss: 0.13537060162640585
Accuracy on test set: 0.930841121495327
