In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("some_review.csv")

# Convert the label column to integers, handle invalid values with NaN
df['label'] = pd.to_numeric(df['label'], errors='coerce')
df = df.dropna(subset=['label']).reset_index(drop=True)

train_reviews = df['reviews'].tolist()
train_labels = df['label'].astype(int).tolist()  # Now convert the label column to integers

train_reviews, test_reviews, train_labels, test_labels = train_test_split(train_reviews, train_labels, test_size=0.2, random_state=42)

print(train_reviews[:5])  # Print the first 5 elements of the training reviews
print(test_reviews[:5])   # Print the first 5 elements of the testing reviews

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_reviews, truncation=True, padding=True)
test_encodings = tokenizer(test_reviews, truncation=True, padding=True)

class CustomDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length=256):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        review = str(self.reviews[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label)
        }

        return item

train_dataset = CustomDataset(reviews=train_reviews, labels=train_labels, tokenizer=tokenizer)
test_dataset = CustomDataset(reviews=test_reviews, labels=test_labels, tokenizer=tokenizer)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

model.to(device)

model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predicted_labels = torch.argmax(logits, dim=1)

        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

predictions = np.array(predictions)
true_labels = np.array(true_labels)

accuracy = accuracy_score(true_labels, predictions)
classification_rep = classification_report(true_labels, predictions)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(classification_rep)


['Broken', 'Motorola V860 Cell Phone', 'This one is my favorite. Came amazingly fast', 'OK Phone, Bluetooth stereo audio is awful, voice recognition is unusable', 'Phone is great, battery requires recharging several time a day.']
['Stupid phone', 'Good basic phone for the money', "didn't work", 'WHAT A LET DOWN', 'Good Buy!']


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.0330
Classification Report:
              precision    recall  f1-score   support

           1       0.03      1.00      0.06        33
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00       100
           5       0.00      0.00      0.00       162
           6       0.00      0.00      0.00       114
           7       0.00      0.00      0.00       110
           8       0.00      0.00      0.00        96
           9       0.00      0.00      0.00        96
          10       0.00      0.00      0.00       100
          11       0.00      0.00      0.00       100
          12       0.00      0.00      0.00        56

    accuracy                           0.03       999
   macro avg       0.00      0.08      0.01       999
weighted avg       0.00      0.03      0.00       999



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
df

Unnamed: 0,ID,name,label,date,verified,reviews,body,helpfulVotes
0,B0000SX2UC,Janet,3.000000,"October 11, 2005",FALSE,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1
1,B0000SX2UC,Luke Wyatt,1.000000,"January 7, 2004",FALSE,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17
2,B0000SX2UC,Brooke,5.000000,"December 30, 2003",FALSE,Love This Phone,"This is a great, reliable phone. I also purcha...",5
3,B0000SX2UC,amy m. teague,3.000000,"March 18, 2004",FALSE,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1
4,B0000SX2UC,tristazbimmer,4.000000,"August 28, 2005",FALSE,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1
...,...,...,...,...,...,...,...,...
4987,B002WTC1NG,DC Mitchell,12.558602,"June 26, 2023",TRUE,Good basic phone!,Bought this for my dad as a birthday present. ...,2.915628242
4988,B002WTC1NG,Amazon Customer,12.560531,"October 18, 2022",TRUE,"Still best ""rugged"" phone you can get but it w...","Was not ""new"" as advertised. Had been previous...",2.915540065
4989,B002WTC1NG,Robert J.,12.562460,"August 7, 2023",TRUE,I would like to have this one replaced,"After 3 weeks of use , three of the button cov...",2.915451887
4990,B002WTC1NG,R Mosley,12.564389,"December 3, 2019",TRUE,BARRAGE V867,9th PHONE EXACTLY LIKE THIS. PURCHASED BECAUSE...,2.91536371
