In [1]:

# this is an older code without reporting cuda and batch number

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base").to("cuda" if torch.cuda.is_available() else "cpu")

# Create a list of 17,000 sample texts (replace this with your actual data)
texts = ["Sample text {}".format(i) for i in range(17000)]

# Set the batch size (you may need to adjust this based on your GPU memory)
batch_size = 32

# Function to classify sentiment in a batch of texts
def classify_sentiment(texts_batch):
    encoded_input = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {key: value.to("cuda" if torch.cuda.is_available() else "cpu") for key, value in encoded_input.items()}
    with torch.no_grad():
        logits = model(**encoded_input).logits
    probabilities = torch.softmax(logits, dim=1)
    labels = ["NEGATIVE" if p < 0.5 else "POSITIVE" for p in probabilities[:, 1]]
    return labels

# Process the texts in batches
sentiments = []
for i in range(0, len(texts), batch_size):
    texts_batch = texts[i:i + batch_size]
    sentiments_batch = classify_sentiment(texts_batch)
    sentiments.extend(sentiments_batch)

# Print the first 10 sentiment labels
print(sentiments[:10])


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly i

['NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE']


In [4]:
#2023/5/02 03:58 PM WORKING WITH Python 3.9.6 (WINDOWS non conda)


# code wtih cuda reporting.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print("CUDA available:" if cuda_available else "CUDA not available")

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base").to("cuda" if cuda_available else "cpu")

# Create a list of 17,000 sample texts (replace this with your actual data)
texts = ["Sample text {}".format(i) for i in range(17000)]

# Set the batch size (you may need to adjust this based on your GPU memory)
batch_size = 32

# Function to classify sentiment in a batch of texts
def classify_sentiment(texts_batch):
    encoded_input = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {key: value.to("cuda" if cuda_available else "cpu") for key, value in encoded_input.items()}
    with torch.no_grad():
        logits = model(**encoded_input).logits
    probabilities = torch.softmax(logits, dim=1)
    labels = ["NEGATIVE" if p < 0.5 else "POSITIVE" for p in probabilities[:, 1]]
    return labels

# Process the texts in batches
sentiments = []
num_batches = len(texts) // batch_size + int(len(texts) % batch_size > 0)

for i in range(0, len(texts), batch_size):
    texts_batch = texts[i:i + batch_size]
    sentiments_batch = classify_sentiment(texts_batch)
    sentiments.extend(sentiments_batch)
    
    # Print progress
    batches_completed = i // batch_size + 1
    batches_left = num_batches - batches_completed
    print(f"Batch {batches_completed}/{num_batches} completed. Batches left: {batches_left}")

# Print the first 10 sentiment labels
print(sentiments[:50])



CUDA available:


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Batch 1/532 completed. Batches left: 531
Batch 2/532 completed. Batches left: 530
Batch 3/532 completed. Batches left: 529
Batch 4/532 completed. Batches left: 528
Batch 5/532 completed. Batches left: 527
Batch 6/532 completed. Batches left: 526
Batch 7/532 completed. Batches left: 525
Batch 8/532 completed. Batches left: 524
Batch 9/532 completed. Batches left: 523
Batch 10/532 completed. Batches left: 522
Batch 11/532 completed. Batches left: 521
Batch 12/532 completed. Batches left: 520
Batch 13/532 completed. Batches left: 519
Batch 14/532 completed. Batches left: 518
Batch 15/532 completed. Batches left: 517
Batch 16/532 completed. Batches left: 516
Batch 17/532 completed. Batches left: 515
Batch 18/532 completed. Batches left: 514
Batch 19/532 completed. Batches left: 513
Batch 20/532 completed. Batches left: 512
Batch 21/532 completed. Batches left: 511
Batch 22/532 completed. Batches left: 510
Batch 23/532 completed. Batches left: 509
Batch 24/532 completed. Batches left: 508
B

In [25]:
texts_raw = '''
this is good
real bad
I like what happned
i hated that person
cats have 4 legs
I have shorted the stock, cuz i see a slowdown in business
'''

texts = texts_raw.split('\n')
texts = [i for i in texts if i != '']
print(texts)

['this is good', 'real bad', 'I like what happned', 'i hated that person', 'cats have 4 legs', 'I have shorted the stock, cuz i see a slowdown in business']


In [28]:
# 2023/5/02 03:59 PM --- trying to use roberta updated  (model_name = f'cardiffnlp/twitter-roberta-base-sentiment-latest')
# adding some simple sentences to test roberta with CUDA

# code wtih cuda reporting.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print("CUDA available:" if cuda_available else "CUDA not available")

# Load the RoBERTa tokenizer and model
model_name = f'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda" if cuda_available else "cpu")

# Create a list of 17,000 sample texts (replace this with your actual data)
# texts = ["Sample text {}".format(i) for i in range(17000)] #commented out to use my own text

# Set the batch size (you may need to adjust this based on your GPU memory)
batch_size = 32

# Function to classify sentiment in a batch of texts
def classify_sentiment(texts_batch):
    print(texts_batch)
    encoded_input = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {key: value.to("cuda" if cuda_available else "cpu") for key, value in encoded_input.items()}
    with torch.no_grad():
        logits = model(**encoded_input).logits
    probabilities = torch.softmax(logits, dim=1)
    print(probabilities)
    # labels = ["NEGATIVE" if p < 0.5 else "POSITIVE" for p in probabilities[:, 1]]
    max_indices = torch.argmax(probabilities, dim=1)
    # Map indices to labels
    labels_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    labels = [labels_map[index.item()] for index in max_indices]
    return labels
    # return probabilities

# Process the texts in batches
sentiments = []
num_batches = len(texts) // batch_size + int(len(texts) % batch_size > 0)

for i in range(0, len(texts), batch_size):
    texts_batch = texts[i:i + batch_size]
    sentiments_batch = classify_sentiment(texts_batch)
    sentiments.extend(sentiments_batch)
    
    # Print progress
    batches_completed = i // batch_size + 1
    batches_left = num_batches - batches_completed
    print(f"Batch {batches_completed}/{num_batches} completed. Batches left: {batches_left}")

# Print the first 10 sentiment labels
print(sentiments[:50])



CUDA available:


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['this is good', 'real bad', 'I like what happned', 'i hated that person', 'cats have 4 legs', 'I have shorted the stock, cuz i see a slowdown in business']
tensor([[0.0086, 0.0611, 0.9303],
        [0.6685, 0.2612, 0.0703],
        [0.0097, 0.1706, 0.8197],
        [0.9062, 0.0784, 0.0155],
        [0.0499, 0.7933, 0.1569],
        [0.6559, 0.3242, 0.0200]], device='cuda:0')
Batch 1/1 completed. Batches left: 0
['positive', 'negative', 'positive', 'negative', 'neutral', 'negative']


# Loading 300k tweets and getting sentiment on each one then saving that to CSV.


In [29]:
# Load 300K Tweets

In [None]:
# WORKING WITH CUDA and adding 300k tweets to test

# code wtih cuda reporting.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print("CUDA available:" if cuda_available else "CUDA not available")

# Load the RoBERTa tokenizer and model
model_name = f'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda" if cuda_available else "cpu")

# Create a list of 17,000 sample texts (replace this with your actual data)
# texts = ["Sample text {}".format(i) for i in range(17000)] #commented out to use my own text

# Set the batch size (you may need to adjust this based on your GPU memory)
batch_size = 32

# Function to classify sentiment in a batch of texts
def classify_sentiment(texts_batch):
    print(texts_batch)
    encoded_input = tokenizer(texts_batch, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {key: value.to("cuda" if cuda_available else "cpu") for key, value in encoded_input.items()}
    with torch.no_grad():
        logits = model(**encoded_input).logits
    probabilities = torch.softmax(logits, dim=1)
    print(probabilities)
    # labels = ["NEGATIVE" if p < 0.5 else "POSITIVE" for p in probabilities[:, 1]]
    max_indices = torch.argmax(probabilities, dim=1)
    # Map indices to labels
    labels_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    labels = [labels_map[index.item()] for index in max_indices]
    return labels
    # return probabilities

# Process the texts in batches
sentiments = []
num_batches = len(texts) // batch_size + int(len(texts) % batch_size > 0)

for i in range(0, len(texts), batch_size):
    texts_batch = texts[i:i + batch_size]
    sentiments_batch = classify_sentiment(texts_batch)
    sentiments.extend(sentiments_batch)
    
    # Print progress
    batches_completed = i // batch_size + 1
    batches_left = num_batches - batches_completed
    print(f"Batch {batches_completed}/{num_batches} completed. Batches left: {batches_left}")

# Print the first 10 sentiment labels
print(sentiments[:50])



CUDA available:


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['this is good', 'real bad', 'I like what happned', 'i hated that person', 'cats have 4 legs', 'I have shorted the stock, cuz i see a slowdown in business']
tensor([[0.0086, 0.0611, 0.9303],
        [0.6685, 0.2612, 0.0703],
        [0.0097, 0.1706, 0.8197],
        [0.9062, 0.0784, 0.0155],
        [0.0499, 0.7933, 0.1569],
        [0.6559, 0.3242, 0.0200]], device='cuda:0')
Batch 1/1 completed. Batches left: 0
['positive', 'negative', 'positive', 'negative', 'neutral', 'negative']


In [None]:
# Add sentiment back into main df


# save main 300ktweetdf to csv.

In [15]:
import torch
print(torch.version.cuda)

11.3
