In [3]:
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk

# Load your local dataset 
data = pd.read_csv('amazon_dataset.csv')  
texts = data['review'].tolist()
labels = data['label'].tolist()  # since we have numerical labels

# Function to remove stopwords from a given text
from nltk.corpus import stopwords
nltk.download('stopwords')


def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply the remove_stopwords function to the 'Text' column
df['text'] = df['text'].apply(remove_stopwords)

# Tokenization 
MODEL = "Davlan/naija-twitter-sentiment-afriberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Tokenize and convert to features
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists of tensors to a single tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Train-test split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.2
)
train_masks, test_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=42, test_size=0.2
)

# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Model setup
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,  # Change this based on your sentiment classes
    output_attentions=False,
    output_hidden_states=False
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()
        
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Evaluation
model.eval()
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size)

predictions = []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

accuracy = accuracy_score(test_labels.cpu().numpy(), predictions)
print(f"Test Accuracy: {accuracy}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isab7070/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'df' is not defined

In [None]:
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk

# Load your local dataset 
data = pd.read_csv('amazon_dataset.csv')  


texts = data['review'].tolist()
labels = data['label'].tolist()  # since we have numerical labels


# Tokenization 
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize and convert to features
input_ids = []
attention_masks = []

for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        #max_length=128,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists of tensors to a single tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Train-test split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.2
)
train_masks, test_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=42, test_size=0.2
)

# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Model setup
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=3,  # Change this based on your sentiment classes
    output_attentions=False,
    output_hidden_states=False
)

device = "mps" if torch.backends.mps.is_available () else "cpu"
model.to(device)

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model.to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()
        
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")


# Evaluation
model.eval()
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_loader = DataLoader(test_data, batch_size=batch_size)

predictions = []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())

# Calculate additional evaluation metrics
accuracy = accuracy_score(test_labels.cpu().numpy(), predictions)
precision = precision_score(test_labels.cpu().numpy(), predictions, average='weighted')
recall = recall_score(test_labels.cpu().numpy(), predictions, average='weighted')
f1 = f1_score(test_labels.cpu().numpy(), predictions, average='weighted')
confusion = confusion_matrix(test_labels.cpu().numpy(), predictions)

# Print the additional metrics
print(f"Test Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Confusion Matrix:")
print(confusion)

In [3]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [1]:
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os

# Define paths for saving and loading the model and tokenizer
model_save_path = 'xlm_roberta_sentiment_model'
tokenizer_save_path = 'xlm_roberta_sentiment_tokenizer'

# Load your local dataset
data = pd.read_csv('amazon_dataset.csv')

texts = data['review'].tolist()
labels = data['label'].tolist()  # since we have numerical labels

# Tokenization
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenization and preprocessing with smaller batches
input_ids = []
attention_masks = []
batch_size = 8  # Set an appropriate batch size

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    batch_encoded_dict = tokenizer.batch_encode_plus(
        batch_texts,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(batch_encoded_dict['input_ids'])
    attention_masks.append(batch_encoded_dict['attention_mask'])

# Convert the lists of tensors to a single tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)


# Train-test split
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.2
)
train_masks, test_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=42, test_size=0.2
)

# Create DataLoader
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Model setup
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=3,  # Change this based on your sentiment classes
    output_attentions=False,
    output_hidden_states=False
)

# Device selection
device = "mps" if torch.backends.mps.is_available () else "cpu"
#device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



# Training loop
num_epochs = 3  # Adjust as needed

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()

        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()  # Update learning rate

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}")

# Save the trained model and tokenizer
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Load the saved model and tokenizer
loaded_model = XLMRobertaForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = XLMRobertaTokenizer.from_pretrained(model_save_path)

# Load your new dataset in a CSV file
new_data = pd.read_csv('kaggle_amazon2.csv')
new_texts = new_data['review'].tolist()

# Tokenization and preprocessing
input_ids = []
attention_masks = []

for text in new_texts:
    encoded_dict = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists of tensors to a single tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create DataLoader
batch_size = 16
new_data = TensorDataset(input_ids, attention_masks)
new_loader = DataLoader(new_data, batch_size=batch_size, shuffle=False)  # No need to shuffle for inference

# Prediction loop
loaded_model.eval()
predicted_labels = []

for batch in new_loader:
    batch = tuple(t.to(device) for t in batch)
    inputs, masks = batch

    with torch.no_grad():
        outputs = loaded_model(inputs, attention_mask=masks)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).tolist()
    predicted_labels.extend(predicted_label)

# Add the predicted labels to the new dataset
new_data['predicted_label'] = predicted_labels

# Save the new dataset with predicted labels to a CSV file
new_data.to_csv('new_dataset_with_labels.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: MPS backend out of memory (MPS allocated: 15.42 GB, other allocations: 2.88 GB, max allowed: 18.13 GB). Tried to allocate 24.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import os


# Load the saved model and tokenizer
loaded_model = XLMRobertaForSequenceClassification.from_pretrained(xlm_roberta_sentiment_model)
loaded_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Load your new dataset in a CSV file
new_data = pd.read_csv('kaggle_amazon2.csv')
new_texts = new_data['review'].tolist()

# Tokenization and preprocessing
input_ids = []
attention_masks = []

for text in new_texts:
    encoded_dict = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists of tensors to a single tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Create DataLoader
batch_size = 16
new_data = TensorDataset(input_ids, attention_masks)
new_loader = DataLoader(new_data, batch_size=batch_size, shuffle=False)  # No need to shuffle for inference

# Prediction loop
loaded_model.eval()
predicted_labels = []

for batch in new_loader:
    batch = tuple(t.to(device) for t in batch)
    inputs, masks = batch

    with torch.no_grad():
        outputs = loaded_model(inputs, attention_mask=masks)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).tolist()
    predicted_labels.extend(predicted_label)

# Add the predicted labels to the new dataset
new_data['predicted_label'] = predicted_labels

# Save the new dataset with predicted labels to a CSV file
new_data.to_csv('new_dataset_with_labels.csv', index=False)
