In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
# dfBert = pd.read_csv('./midleDatasets/dataset_With_TS_Clean_Text.csv')
dfBert = pd.read_csv('./midleDatasets/train_dataset.csv')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dfBert = pd.read_csv('./midleDatasets/dataset_With_True_Sentiments.csv')

In [4]:
dfBert.head()

Unnamed: 0,title,text,date,hotel_class,name,region,service,cleanliness,overall,value,location,sleep_quality,rooms,True_Sentiment
0,"“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,2012-12-17,5.0,Hotel Beacon,NY,5.0,5.0,5.0,5.0,5.0,5.0,5.0,positive
1,“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...",2012-12-17,5.0,Hotel Beacon,NY,5.0,5.0,5.0,5.0,5.0,5.0,5.0,positive
2,“Excellent location”,Loved the hotel. Great location - only 2 block...,2012-12-17,5.0,Hotel Beacon,NY,5.0,5.0,5.0,5.0,5.0,5.0,5.0,positive
3,“All-round fantastic NYC hotel”,Our first stay on the upper west side and can'...,2012-12-17,5.0,Hotel Beacon,NY,5.0,5.0,5.0,4.0,5.0,5.0,5.0,positive
4,“Great hotel in nice area”,"Great room, very big with huge bed! Great loca...",2012-12-17,5.0,Hotel Beacon,NY,5.0,5.0,5.0,4.0,5.0,5.0,5.0,positive


In [13]:

# Filtering the dataset for entries with the name 'Alice'
filtered_df = dfBert[dfBert['name'] == "Hotel Beacon"]
filtered_df.head()
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1193 entries, 0 to 1192
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1193 non-null   object 
 1   text            1193 non-null   object 
 2   date            1193 non-null   object 
 3   hotel_class     1193 non-null   float64
 4   name            1193 non-null   object 
 5   region          1193 non-null   object 
 6   service         1193 non-null   float64
 7   cleanliness     1193 non-null   float64
 8   overall         1193 non-null   float64
 9   value           1193 non-null   float64
 10  location        1193 non-null   float64
 11  sleep_quality   1193 non-null   float64
 12  rooms           1193 non-null   float64
 13  True_Sentiment  1193 non-null   object 
dtypes: float64(8), object(6)
memory usage: 139.8+ KB


In [7]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [8]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [9]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [10]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [11]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        
    return "positive" if preds.item() == 1 else "negative"

In [12]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [9]:
dfBert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770951 entries, 0 to 770950
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   title           770951 non-null  object 
 1   text            770951 non-null  object 
 2   date            770951 non-null  object 
 3   hotel_class     770951 non-null  float64
 4   name            770951 non-null  object 
 5   region          770951 non-null  object 
 6   service         770951 non-null  float64
 7   cleanliness     770951 non-null  float64
 8   overall         770951 non-null  float64
 9   value           770951 non-null  float64
 10  location        770951 non-null  float64
 11  sleep_quality   770951 non-null  float64
 12  rooms           770951 non-null  float64
 13  True_Sentiment  770951 non-null  object 
 14  clean_text      770951 non-null  object 
dtypes: float64(8), object(7)
memory usage: 88.2+ MB


In [15]:
filtered_df['Value_True_sentiment'].head()

0    1
1    1
2    1
3    1
4    1
Name: Value_True_sentiment, dtype: int64

In [16]:
filtered_df['True_Sentiment'].head()

0    positive
1    positive
2    positive
3    positive
4    positive
Name: True_Sentiment, dtype: object

In [14]:
# Map sentiment labels to numeric values
filtered_df['Value_True_sentiment'] = filtered_df['True_Sentiment'].map({'positive': 1, 'negative': 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Value_True_sentiment'] = filtered_df['True_Sentiment'].map({'positive': 1, 'negative': 0})


In [17]:
dfBert=filtered_df

In [18]:
train_texts, val_texts, train_labels, val_labels = train_test_split(dfBert["text"], dfBert["Value_True_sentiment"], test_size=0.3, random_state=42)

In [19]:
# Initialize the tokenizer
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)
print(device)

In [21]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [22]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

Epoch 1/4


KeyError: 23

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Define training and evaluation functions
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

def evaluate(model, dataloader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['negative', 'positive'])

    return accuracy, report

train_texts, val_texts, train_labels, val_labels = train_test_split(dfBert["text"], dfBert["Value_True_sentiment"], test_size=0.2, random_state=42)


# Parameters
bert_model_name = 'bert-base-uncased'
max_length = 128
batch_size = 16
num_epochs = 3
learning_rate = 2e-5

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Create datasets
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2).to(device)

# Initialize the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Training and evaluation loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


KeyError: 2266

In [8]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts.reset_index(drop=True)  # Ensure indices are reset
        self.labels = labels.reset_index(drop=True)  # Ensure indices are reset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Example data
# dfBert = pd.read_csv('your_dataset.csv') # Uncomment this line to load your dataset

# Assuming dfBert["text"] and dfBert["Value_True_sentiment"] contain the text and labels respectively
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dfBert["text"], dfBert["Value_True_sentiment"], test_size=0.2, random_state=42)

# Parameters
bert_model_name = 'bert-base-uncased'
max_length = 128
batch_size = 8  # Reduce batch size if running on a CPU
num_epochs = 3  # Start with 1 epoch to see how long it takes
learning_rate = 2e-5

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Define the dataset class
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model
device = torch.device("cpu")
print(device)  # Check if using GPU or CPU
model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2).to(device)

# Initialize the optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)  # Use PyTorch's AdamW
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define the train function
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

# Define the evaluate function
def evaluate(model, dataloader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
        
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['negative', 'positive'])

    return accuracy, report

# Training and evaluation loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)


WRONG MODEL THE OUTPuT IS 1star,2stars,3stats,4stars,5satars

In [23]:
# Install necessary libraries
%pip install transformers scikit-learn tqdm

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm import tqdm

# Load pre-trained model and tokenizer
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


texts = dfBert['text'].tolist()
true_labels = dfBert['Value_True_sentiment'].tolist()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize and make predictions in batches to avoid memory issues
batch_size = 32
predictions = []

for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        predictions.extend(batch_predictions)

# Convert predictions to a DataFrame
dfBert['predictions'] = predictions

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
report = classification_report(true_labels, predictions, target_names=['negative', 'positive'])
print("Classification Report:")
print(report)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
Processing batches: 100%|██████████| 38/38 [14:46<00:00, 23.32s/it]

Accuracy: 0.0277





ValueError: Number of classes, 5, does not match size of target_names, 2. Try specifying the labels parameter

THE CORRECT ONE IS THIS

In [25]:
# Install necessary libraries
%pip install transformers scikit-learn tqdm

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
import pandas as pd
import numpy as np
import random

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Load pre-trained model and tokenizer
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Assuming dfBert is your DataFrame and it is already loaded
# Example: dfBert = pd.read_csv('your_dataset.csv')
texts = dfBert['text'].tolist()
labels = dfBert['Value_True_sentiment'].tolist()

# Split dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create a custom dataset class
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Parameters
max_length = 128
batch_size = 16
num_epochs = 3
learning_rate = 2e-5

# Create datasets and dataloaders
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

# Initialize optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training batches"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating batches"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)

    accuracy = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=['negative', 'positive'])
    return accuracy, report

# Training and evaluation loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_distilbert')
tokenizer.save_pretrained('./fine_tuned_distilbert')



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip







cpu
Epoch 1/3


Training batches: 100%|██████████| 60/60 [03:28<00:00,  3.48s/it]


Training Loss: 0.2099


Evaluating batches: 100%|██████████| 15/15 [00:17<00:00,  1.19s/it]


Validation Accuracy: 0.9498
Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.27      0.40        15
    positive       0.95      1.00      0.97       224

    accuracy                           0.95       239
   macro avg       0.88      0.63      0.69       239
weighted avg       0.94      0.95      0.94       239

Epoch 2/3


Training batches: 100%|██████████| 60/60 [03:28<00:00,  3.48s/it]


Training Loss: 0.1169


Evaluating batches: 100%|██████████| 15/15 [00:17<00:00,  1.17s/it]


Validation Accuracy: 0.9582
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.40      0.55        15
    positive       0.96      1.00      0.98       224

    accuracy                           0.96       239
   macro avg       0.91      0.70      0.76       239
weighted avg       0.95      0.96      0.95       239

Epoch 3/3


Training batches: 100%|██████████| 60/60 [03:31<00:00,  3.52s/it]


Training Loss: 0.0869


Evaluating batches: 100%|██████████| 15/15 [00:18<00:00,  1.21s/it]

Validation Accuracy: 0.9582
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.40      0.55        15
    positive       0.96      1.00      0.98       224

    accuracy                           0.96       239
   macro avg       0.91      0.70      0.76       239
weighted avg       0.95      0.96      0.95       239






('./fine_tuned_distilbert\\tokenizer_config.json',
 './fine_tuned_distilbert\\special_tokens_map.json',
 './fine_tuned_distilbert\\vocab.txt',
 './fine_tuned_distilbert\\added_tokens.json')

-----------------------RESULTS:------------------------


cpu
Epoch 1/3
Training batches: 100%|██████████| 60/60 [03:28<00:00,  3.48s/it]
Training Loss: 0.2099
Evaluating batches: 100%|██████████| 15/15 [00:17<00:00,  1.19s/it]
Validation Accuracy: 0.9498
Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.27      0.40        15
    positive       0.95      1.00      0.97       224

    accuracy                           0.95       239
   macro avg       0.88      0.63      0.69       239
weighted avg       0.94      0.95      0.94       239

Epoch 2/3
Training batches: 100%|██████████| 60/60 [03:28<00:00,  3.48s/it]
Training Loss: 0.1169
Evaluating batches: 100%|██████████| 15/15 [00:17<00:00,  1.17s/it]
Validation Accuracy: 0.9582
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.40      0.55        15
    positive       0.96      1.00      0.98       224

    accuracy                           0.96       239
   macro avg       0.91      0.70      0.76       239
weighted avg       0.95      0.96      0.95       239

Epoch 3/3
Training batches: 100%|██████████| 60/60 [03:31<00:00,  3.52s/it]
Training Loss: 0.0869
Evaluating batches: 100%|██████████| 15/15 [00:18<00:00,  1.21s/it]
Validation Accuracy: 0.9582
Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.40      0.55        15
    positive       0.96      1.00      0.98       224

    accuracy                           0.96       239
   macro avg       0.91      0.70      0.76       239
weighted avg       0.95      0.96      0.95       239

In [26]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_distilbert'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [27]:
def preprocess_review(review, tokenizer, max_length=128):
    # Tokenize the review
    inputs = tokenizer(review, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    return inputs

review = "I really enjoyed hotel XS, the rooms were reaaly big, it had a huge tv and the bathrom was very confortable.It had a shower and a bathub. the people were also very nice and helpful"
inputs = preprocess_review(review, tokenizer)

In [28]:
def predict_sentiment(inputs, model, device):
    model.eval()
    with torch.no_grad():
        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return predictions.item()

prediction = predict_sentiment(inputs, model, device)
print

In [29]:
# Map predictions to sentiment labels
def map_prediction_to_label(prediction):
    labels = ['negative', 'positive']
    return labels[prediction]

sentiment_label = map_prediction_to_label(prediction)
print(f"The sentiment of the review is: {sentiment_label}")

The sentiment of the review is: positive


RUNNING THE FINE TUNNED MODEL: SAME CODE BUT ALL TOGETHER FOR ARRAY OF REVIEWS

In [31]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_distilbert'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def preprocess_reviews(reviews, tokenizer, max_length=128):
    # Tokenize the reviews
    inputs = tokenizer(reviews, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
    return inputs

def predict_sentiments(inputs, model, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        predictions.extend(batch_predictions)
    return predictions

def map_predictions_to_labels(predictions):
    labels = ['negative', 'positive']
    return [labels[prediction] for prediction in predictions]

# Example array of reviews
reviews = [
    "I really enjoyed hotel XS, the rooms were reaaly big, it had a huge tv and the bathrom was very confortable.It had a shower and a bathub. the people were also very nice and helpful",
    "The service was terrible, and I will never come back.",
    "It's okay, but I've had better experiences elsewhere."
]

# Preprocess and predict
inputs = preprocess_reviews(reviews, tokenizer)
predictions = predict_sentiments(inputs, model, device)
sentiment_labels = map_predictions_to_labels(predictions)

# Output the results
for review, sentiment in zip(reviews, sentiment_labels):
    print(f"Review: {review}\nSentiment: {sentiment}\n")


Review: I really enjoyed hotel XS, the rooms were reaaly big, it had a huge tv and the bathrom was very confortable.It had a shower and a bathub. the people were also very nice and helpful
Sentiment: positive

Review: The service was terrible, and I will never come back.
Sentiment: negative

Review: It's okay, but I've had better experiences elsewhere.
Sentiment: negative



---------------------------------------------------------------------

In [21]:
from transformers import pipeline
classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 629/629 [00:00<00:00, 628kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 268M/268M [00:04<00:00, 62.6MB/s] 
tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<?, ?B/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.08MB/s]


In [22]:
classifier('We are very happy to show you the 🤗 Transformers library.')

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [24]:
results = classifier(dfBert["text"].tolist())
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (549) must match the size of tensor b (512) at non-singleton dimension 1