In [15]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import re
import nltk
import time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
df=pd.read_csv("/content/dp_best.csv",header=None)

In [6]:
df.head()

Unnamed: 0,0,1
0,No thanks! I don't like deals,Shaming
1,"No, I'll rather pay full price.",Shaming
2,I don't like discounts,Shaming
3,"No, thanks. I don't like great deals.",Shaming
4,"No Thanks, I rather pay full price",Shaming


In [7]:
df[1].unique()

array(['Shaming', 'False Urgency', 'Nagging', 'Subscription Trap',
       'Basket Sneaking', 'Not Dark Pattern'], dtype=object)

In [8]:
# changing to lowercase
df[0] = df[0].str.lower()

# removing urls
df[0] = df[0].str.replace('http\S+|www.\S+', '', case=False)

# removing new lines "\n"
df[0] = df[0].str.replace('\n',' ', regex=True)

# removing all the punctuations
df[0] = df[0].str.replace('[^\w\s]',' ')

# removing integers
df[0] = df[0].str.replace('\d','', regex=True)

# removing emojis
df[0] = df[0].str.replace('[^\w\s#@/:%.,_-]', ' ', flags=re.UNICODE)

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

df[0] = df[0].apply(lambda text: cleaning_stopwords(text))

df[0].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0               thanks like deals
1           rather pay full price
2                  like discounts
3         thanks like great deals
4    thanks rather pay full price
Name: 0, dtype: object

In [10]:
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return lemmatizer.lemmatize(text)

df[0] = df[0].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
sentiment_mapping = {
    'Shaming': 0,
    'False Urgency': 1,
    'Nagging': 2,
    'Subscription Trap': 3,
    'Basket Sneaking': 4,
    'Not Dark Pattern':5
}
df[1] = df[1].map(sentiment_mapping)

In [12]:
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'RandomForest': RandomForestClassifier(),
    'LinearSVC': LinearSVC(),
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier()
}
def create_pipeline(classifier):
    return Pipeline([('Normalizing', MinMaxScaler()), (classifier.__class__.__name__, classifier)])


In [13]:
X = df[0]  # Assuming the comments are in the first column
y = df[1]  # Assuming the sentiments are in the second column

# Text preprocessing using Bag of Words (BoW) representation
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(X.values.astype('U'))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Train the classifier

    y_pred = clf.predict(X_test)  # Predict using the test set

    accuracy = accuracy_score(y_test, y_pred)  # Evaluate accuracy
    print(f"{clf_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("-------------------------")

MultinomialNB:
Accuracy: 0.8131
-------------------------
RandomForest:
Accuracy: 0.9308
-------------------------
LinearSVC:
Accuracy: 0.9346
-------------------------
LogisticRegression:
Accuracy: 0.9252
-------------------------
DecisionTree:
Accuracy: 0.9084
-------------------------


In [14]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Train the classifier

    y_pred = clf.predict(X_test)  # Predict using the test set

    accuracy = accuracy_score(y_test, y_pred)  # Evaluate accuracy
    print(f"{clf_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("-------------------------")


MultinomialNB:
Accuracy: 0.8224
-------------------------
RandomForest:
Accuracy: 0.9402
-------------------------
LinearSVC:
Accuracy: 0.9364
-------------------------
LogisticRegression:
Accuracy: 0.9178
-------------------------
DecisionTree:
Accuracy: 0.9271
-------------------------


In [16]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Instantiate the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.4s
[CV] END max_dep

In [17]:
import pickle
with open('rf_classifier_bog.pkl', 'wb') as f:
    pickle.dump((best_model,count_vectorizer), f)