In [1]:
!pip install pandas



In [2]:
!pip install openpyxl



In [3]:
!pip install transformers



In [4]:
!pip install torch



In [5]:
!pip install -U accelerate



In [6]:
import pandas as pd

In [7]:
# Load the labelled reviews from the Excel workbook in the working directory.
df = pd.read_excel("review.xlsx")

In [8]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,id,review,class
0,39,Make it like better with a giant pig bigger th...,feature request
1,42,These screens are small enough without crowdin...,information giving
2,49,Dear Rovio; If you absolutely must continue tr...,information giving
3,56,App crashes when new power up notice pops up.,problem discovery
4,62,It would be nice to have an update that didn ...,information giving


In [9]:
# Count the reviews per label to see how balanced the classes are.
df['class'].value_counts()

class
information giving     603
problem discovery      494
feature request        192
information seeking    101
Name: count, dtype: int64

In [10]:
# The `id` column is not used for modelling, so work on a copy without it
# (the loaded `df` itself is left unchanged).
df_id_removed = df.drop('id', axis='columns')
df_id_removed.head()

Unnamed: 0,review,class
0,Make it like better with a giant pig bigger th...,feature request
1,These screens are small enough without crowdin...,information giving
2,Dear Rovio; If you absolutely must continue tr...,information giving
3,App crashes when new power up notice pops up.,problem discovery
4,It would be nice to have an update that didn ...,information giving


In [11]:
# Encode the target labels as integer class ids.
# Reference: https://pbpython.com/categorical-encoding.html
find_replace = {"class": {"feature request": 0, "problem discovery": 1, "information giving": 2, "information seeking": 3}}

# Series.map is used instead of DataFrame.replace: replacing strings with ints
# relies on implicit downcasting of the object column, which pandas deprecates
# with a FutureWarning. Only the `class` column is touched; `review` is kept.
target_encoded_df = df_id_removed.assign(
    **{"class": df_id_removed["class"].map(find_replace["class"])}
)

# map() yields NaN for any label missing from the mapping; fail here rather
# than passing an unencoded label on to the train/test split.
assert target_encoded_df["class"].notna().all(), "found a class label with no integer id in find_replace"

target_encoded_df.head()

  target_encoded_df = df_id_removed.replace(find_replace)


Unnamed: 0,review,class
0,Make it like better with a giant pig bigger th...,0
1,These screens are small enough without crowdin...,2
2,Dear Rovio; If you absolutely must continue tr...,2
3,App crashes when new power up notice pops up.,1
4,It would be nice to have an update that didn ...,2


In [12]:
# Confirm the per-class counts are unchanged after encoding the labels.
target_encoded_df['class'].value_counts()

class
2    603
1    494
0    192
3    101
Name: count, dtype: int64

In [13]:
#train test split

In [14]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; the target is the integer class id.
X, y = target_encoded_df['review'], target_encoded_df['class']

# Hold out 20% for testing. Stratifying on y keeps the class proportions the
# same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Sanity-check the sizes of the train/test features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1112,), (278,), (1112,), (278,))

In [16]:
# Inspect the training reviews (the original row index is preserved by the split).
X_train

821     \nBut you should be able to rearrange your boa...
1285    I  m planning my wedding and getting ideas fro...
859     I recommend this app to all my friends and wil...
430     boards/subjects should have some type of restr...
1102    I have never had an in issue with this App it ...
                              ...                        
785     s problems with this app , 5,  ptcgkm ,  This ...
909     Pintrest provides an easy way to find and stor...
622     I love this site, but it has been crashing sin...
1312    Wish the app was available on my Philips smart...
79                                         Please fix it!
Name: review, Length: 1112, dtype: object

In [17]:
# Inspect the held-out test reviews.
X_test

627     I  ve found so many good ideas and recipes and...
595     No matter what you  re into and what you  re l...
1065                               I can  t stop pinning!
725     I have been a Dropbox user for years and, unfo...
1031    It  s awesome that you can find almost anythin...
                              ...                        
550     I liked the previous version of the app, but n...
860            Please add a back to top of page" button."
97      I even have the Premium version and bought sev...
159          They updated the app and it is a lot better.
982     Some day my things will be on this app this ap...
Name: review, Length: 278, dtype: object

In [18]:
#TF-IDF Feature Extraction
#TF - Term Frequency
#IDF - Inverse Document Frequency

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer(
#     lowercase=True,
#     stop_words='english',
#     max_df=0.9,
#     min_df=5,
#     ngram_range=(1, 2)
# ) 
# Accuracy: 0.6510791366906474

# tfidf = TfidfVectorizer(
#     analyzer="char_wb",
#     ngram_range=(3, 5),
#     min_df=3,
#     sublinear_tf=True
# )Accuracy: 0.6330935251798561

# tfidf = TfidfVectorizer(
#     lowercase=True,
#     stop_words='english',
#     max_df=0.9,
#     min_df=5,
#     ngram_range=(1, 3)
# ) Accuracy: 0.6510791366906474

# Current vectorizer settings: lowercased word 1- to 3-grams with English stop
# words removed, keeping terms that occur in at least 5 documents and in at
# most 20% of them, with sublinear term-frequency scaling.
tfidf_params = dict(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.2,
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# the fitted vectorizer for the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [20]:
#naive bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a multinomial Naive Bayes baseline on the TF-IDF features
# (fit() returns the estimator itself, so the calls can be chained).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out reviews and report overall and per-class scores.
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)

# Rows are true classes, columns are predicted classes.
nb_confusion = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix:\n")
print(nb_confusion)


Naive Bayes Accuracy: 0.6510791366906474
              precision    recall  f1-score   support

           0       0.71      0.26      0.38        38
           1       0.74      0.64      0.68        99
           2       0.60      0.87      0.71       121
           3       1.00      0.15      0.26        20

    accuracy                           0.65       278
   macro avg       0.76      0.48      0.51       278
weighted avg       0.69      0.65      0.62       278

Confusion Matrix:

[[ 10   6  22   0]
 [  1  63  35   0]
 [  2  14 105   0]
 [  1   2  14   3]]


In [22]:
#logistic regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic regression on the same TF-IDF features. class_weight="balanced"
# re-weights the classes to offset the uneven label counts.
clf = LogisticRegression(
    class_weight="balanced",
    max_iter=3000,
    n_jobs=-1,
).fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)

# Overall accuracy on the held-out reviews.
lr_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

# Per-class precision / recall / F1.
lr_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n")
print(lr_report)

# Rows are true classes, columns are predicted classes.
lr_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n")
print(lr_confusion)


Logistic Regression Accuracy: 0.5719424460431655

Classification Report:

              precision    recall  f1-score   support

           0       0.34      0.53      0.42        38
           1       0.82      0.62      0.71        99
           2       0.63      0.59      0.61       121
           3       0.21      0.35      0.26        20

    accuracy                           0.57       278
   macro avg       0.50      0.52      0.50       278
weighted avg       0.63      0.57      0.59       278

Confusion Matrix:

[[20  3 13  2]
 [ 9 61 21  8]
 [24 10 71 16]
 [ 5  0  8  7]]




In [24]:
#bert

In [29]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Tokenize the train and test reviews with the pretrained BERT tokenizer.
# Both splits share the same settings: pad within each call and truncate
# anything longer than 128 tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encode_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(X_train), **encode_kwargs)
test_encodings = tokenizer(list(X_test), **encode_kwargs)

# Dataset wrapper
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; each field is indexed with the example position.
        labels: Sequence of class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored as an int64 (long) tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the encodings and plain-list labels for the Trainer.
train_dataset = ReviewDataset(encodings=train_encodings, labels=y_train.tolist())
test_dataset = ReviewDataset(encodings=test_encodings, labels=y_test.tolist())

# Pretrained BERT with a 4-way classification head (one output per class id).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

# Trainer
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=32,
#     eval_strategy='epoch',
#     save_strategy='epoch',
#     logging_dir='./logs',
#     logging_steps=10,
#     load_best_model_at_end=True
# )
# Epoch	Training Loss	Validation Loss	Accuracy	F1	Precision	Recall
# 1	0.559000	0.566445	0.802158	0.781403	0.798839	0.802158
# 2	0.457400	0.535569	0.798561	0.800712	0.808069	0.798561
# 3	0.198300	0.558971	0.805755	0.804878	0.804605	0.805755

# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training length and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)


def compute_metrics(eval_pred):
    """Compute evaluation metrics from the model's raw outputs.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            example is the argmax of ``logits`` along axis 1.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three are averaged over classes, weighted by class support.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # zero_division=0 scores a never-predicted class as 0, which is the value
    # the default setting also uses, but without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# Wire the model, data and metric function into a Trainer and fine-tune.
# `processing_class` replaces the deprecated `tokenizer` keyword, which is
# scheduled for removal in transformers v5.
# NOTE(review): eval_dataset is the held-out test split, so per-epoch
# evaluation (and any best-checkpoint selection based on it) sees the test
# data. Consider carving a validation split out of the training data instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.603,0.656889,0.758993,0.737536,0.755248,0.758993
2,0.3937,0.566452,0.794964,0.789734,0.78768,0.794964


TrainOutput(global_step=140, training_loss=0.6612173744610378, metrics={'train_runtime': 48.0729, 'train_samples_per_second': 46.263, 'train_steps_per_second': 2.912, 'total_flos': 138292009544832.0, 'train_loss': 0.6612173744610378, 'epoch': 2.0})

In [30]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Run the fine-tuned model over the held-out reviews; the predicted class is
# the argmax over each example's prediction scores.
pred_output = trainer.predict(test_dataset)
y_pred_bert = np.argmax(pred_output.predictions, axis=1)

# Overall accuracy.
bert_accuracy = accuracy_score(y_test, y_pred_bert)
print("BERT Accuracy:", bert_accuracy)

# Per-class precision / recall / F1.
bert_report = classification_report(y_test, y_pred_bert)
print("\nClassification Report:\n")
print(bert_report)

# Rows are true classes, columns are predicted classes.
bert_confusion = confusion_matrix(y_test, y_pred_bert)
print("Confusion Matrix:\n")
print(bert_confusion)


BERT Accuracy: 0.7949640287769785

Classification Report:

              precision    recall  f1-score   support

           0       0.59      0.45      0.51        38
           1       0.86      0.88      0.87        99
           2       0.80      0.84      0.82       121
           3       0.75      0.75      0.75        20

    accuracy                           0.79       278
   macro avg       0.75      0.73      0.74       278
weighted avg       0.79      0.79      0.79       278

Confusion Matrix:

[[ 17   7  11   3]
 [  1  87  11   0]
 [ 10   7 102   2]
 [  1   0   4  15]]
