In [1]:
import torch
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
import numpy as np

In [2]:
# Load the pretrained tokenizer that belongs to the 'roberta-base' checkpoint.
# Every later cell that turns text into token IDs goes through this object.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [3]:
# Confirm the tokenizer loaded by reporting its vocabulary size and the
# maximum sequence length it is configured for.
print("\n".join([
    "✅ Tokenizer loaded!",
    f"Vocabulary size: {tokenizer.vocab_size}",
    f"Maximum sequence length: {tokenizer.model_max_length}",
]))

✅ Tokenizer loaded!
Vocabulary size: 50265
Maximum sequence length: 512


In [4]:
# Toy dataset: five customer messages plus a parallel list of binary labels
# (1 = complaint, 0 = not a complaint). The two lists are index-aligned.
complaint_texts = [
    "I am very disappointed with the service quality",
    "The product arrived damaged and I want a refund",
    "Excellent service, very satisfied with the purchase",
    "Delivery was late and customer support was rude",
    "Amazing product, exactly what I ordered"
]
labels = [1, 1, 0, 1, 0]

print("Sample complaint texts:")
# Walk both lists together and number the rows from 1 for display.
for position, (message, label) in enumerate(zip(complaint_texts, labels), start=1):
    tag = "Complaint" if label == 1 else "Not Complaint"
    print(f"{position}. [{tag}] {message}")

Sample complaint texts:
1. [Complaint] I am very disappointed with the service quality
2. [Complaint] The product arrived damaged and I want a refund
3. [Not Complaint] Excellent service, very satisfied with the purchase
4. [Complaint] Delivery was late and customer support was rude
5. [Not Complaint] Amazing product, exactly what I ordered


In [5]:
# Section banner for the tokenization walkthrough (rule, title, rule).
print("\n" + "="*50, "STEP 4: Understanding Tokenization", "="*50, sep="\n")


STEP 4: Understanding Tokenization


In [6]:
# Use the first sample sentence as the running example for the
# tokenize -> IDs -> decode steps in the following cells.
sample_text = complaint_texts[0]
print(f"Original text: '{sample_text}'")

Original text: 'I am very disappointed with the service quality'


In [7]:
# Split the example sentence into token strings (no ID lookup yet).
# In the printed result every token except the first carries a leading 'Ġ',
# presumably marking a preceding space — TODO confirm against the tokenizer docs.
tokens = tokenizer.tokenize(sample_text)
print(f"Broken into tokens: {tokens}")

Broken into tokens: ['I', 'Ġam', 'Ġvery', 'Ġdisappointed', 'Ġwith', 'Ġthe', 'Ġservice', 'Ġquality']


In [8]:
# Look up the integer vocabulary ID for each token string produced above.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Token numbers: {token_ids}")

Token numbers: [100, 524, 182, 5779, 19, 5, 544, 1318]


In [9]:
# Map the IDs back to text to check that tokenization round-trips to the
# original sentence.
decoded_text = tokenizer.decode(token_ids)
print(f"Decoded back: '{decoded_text}'")

Decoded back: 'I am very disappointed with the service quality'


In [10]:
# Section banner for the single-text encoding example (rule, title, rule).
print("\n" + "="*50, "STEP 5: Convert Single Text to Vectors", "="*50, sep="\n")


STEP 5: Convert Single Text to Vectors


In [11]:
single_text = "The service was terrible and I want my money back"
print(f"Converting text: '{single_text}'")

# Encode the sentence into fixed-length model inputs:
#   return_tensors='pt'  -> PyTorch tensors
#   truncation=True      -> cut inputs longer than max_length
#   padding='max_length' -> pad shorter inputs up to max_length
#   max_length=50        -> target length (tunable)
encoded = tokenizer(
    single_text,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
    max_length=50,
)

Converting text: 'The service was terrible and I want my money back'


In [12]:
print(f"Input IDs shape: {encoded['input_ids'].shape}")
print(f"Input IDs: {encoded['input_ids']}")
print(f"Attention mask: {encoded['attention_mask']}")

Input IDs shape: torch.Size([1, 50])
Input IDs: tensor([[   0,  133,  544,   21, 6587,    8,   38,  236,  127,  418,  124,    2,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])


In [13]:
# -------------------------------------------------------------------
# Step 6: load the RoBERTa encoder that produces the actual vectors
# -------------------------------------------------------------------
print("\n" + "="*50, "STEP 6: Load RoBERTa Model", "="*50, sep="\n")

# Fetch the pretrained 'roberta-base' encoder.
model = RobertaModel.from_pretrained('roberta-base')

# Put the module in evaluation mode. eval() returns the model itself, so as
# the last expression of the cell it also displays the architecture.
model.eval()


STEP 6: Load RoBERTa Model


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [14]:
# Confirm the model loaded and report the width of its hidden states.
print("\n".join([
    "✅ RoBERTa model loaded!",
    f"Model hidden size: {model.config.hidden_size}",
]))

✅ RoBERTa model loaded!
Model hidden size: 768


In [15]:
# ===================================================================
# STEP 7: GET ACTUAL VECTORS FROM TEXT
# ===================================================================

In [16]:
print("\n" + "="*50, "STEP 7: Convert Text to Actual Vectors", "="*50, sep="\n")

# Re-use the padded encoding produced in step 5.
input_ids, attention_mask = encoded['input_ids'], encoded['attention_mask']
print(f"Input shape: {input_ids.shape}")

# Forward pass only: gradient tracking is switched off since nothing is trained.
with torch.no_grad():
    outputs = model(attention_mask=attention_mask, input_ids=input_ids)

# One hidden-state vector per token position of the input.
vectors = outputs.last_hidden_state
print(f"Output vectors shape: {vectors.shape}")
print(f"Each word is now a {vectors.shape[-1]}-dimensional vector!")

# Peek at the first 10 components of the vector at position 0.
# NOTE(review): position 0 is the first *token position*; if the tokenizer
# prepends a special start token, this is that token rather than the first
# word of the sentence — confirm.
print(f"First word vector (first 10 dimensions): {vectors[0, 0, :10]}")


STEP 7: Convert Text to Actual Vectors
Input shape: torch.Size([1, 50])
Output vectors shape: torch.Size([1, 50, 768])
Each word is now a 768-dimensional vector!
First word vector (first 10 dimensions): tensor([-0.0439,  0.1084, -0.0172, -0.1317,  0.0851, -0.0702, -0.0444,  0.0221,
         0.0571, -0.0701])


In [17]:
def convert_texts_to_vectors(texts, tokenizer, model, max_length=128, batch_size=32):
    """
    Convert a collection of texts to per-token vectors.

    Texts are tokenized and run through the model in mini-batches instead of
    one forward pass per text. Every text is padded/truncated to exactly
    ``max_length`` positions, so the batched results have the same layout as
    processing the texts one at a time.

    Args:
        texts: Iterable of strings (list, tuple, pandas Series, generator...).
        tokenizer: Callable invoked as ``tokenizer(list_of_str, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``; it
            must return a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        model: Callable invoked with ``input_ids=`` and ``attention_mask=``;
            its result must expose ``last_hidden_state``.
        max_length: Number of token positions every text is padded/truncated to.
        batch_size: How many texts go through the model per forward pass.

    Returns:
        Tuple ``(all_vectors, all_input_ids, all_attention_masks)`` with the
        per-batch results concatenated along dimension 0. For empty ``texts``
        three empty tensors with a leading dimension of 0 are returned; the
        last dimension of the vectors is taken from
        ``model.config.hidden_size`` when available, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once so generators / Series work with len() and slicing.
    texts = list(texts)
    print(f"Processing {len(texts)} texts...")

    if not texts:
        # torch.cat() rejects an empty list; return well-formed empty tensors
        # so callers can still inspect shapes or concatenate results.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return (
            torch.empty((0, max_length, hidden_size)),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    vector_batches = []
    input_id_batches = []
    attention_mask_batches = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        for number, text in enumerate(batch, start=start + 1):
            print(f"Processing text {number}: '{text[:50]}...'")

        # Tokenize the whole mini-batch at once
        encoded = tokenizer(
            batch,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Forward pass without gradient tracking (inference only)
        with torch.no_grad():
            outputs = model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask']
            )

        vector_batches.append(outputs.last_hidden_state)
        input_id_batches.append(encoded['input_ids'])
        attention_mask_batches.append(encoded['attention_mask'])

    # Stitch the mini-batches back together along the text dimension
    all_vectors = torch.cat(vector_batches, dim=0)
    all_input_ids = torch.cat(input_id_batches, dim=0)
    all_attention_masks = torch.cat(attention_mask_batches, dim=0)

    return all_vectors, all_input_ids, all_attention_masks

# Embed every sample text, padding/truncating each one to 64 token positions.
vectors, input_ids, attention_masks = convert_texts_to_vectors(
    texts=complaint_texts,
    tokenizer=tokenizer,
    model=model,
    max_length=64,
)

# Summarize the result: dimension 0 = texts, 1 = token positions, 2 = features.
print("\n".join([
    "\n✅ All texts converted!",
    f"Final vectors shape: {vectors.shape}",
    f"This means: {vectors.shape[0]} texts, {vectors.shape[1]} words each, {vectors.shape[2]} dimensions per word",
]))

Processing 5 texts...
Processing text 1: 'I am very disappointed with the service quality...'
Processing text 2: 'The product arrived damaged and I want a refund...'
Processing text 3: 'Excellent service, very satisfied with the purchas...'
Processing text 4: 'Delivery was late and customer support was rude...'
Processing text 5: 'Amazing product, exactly what I ordered...'

✅ All texts converted!
Final vectors shape: torch.Size([5, 64, 768])
This means: 5 texts, 64 words each, 768 dimensions per word


In [18]:
# Display the embedding tensor computed for all sample texts. PyTorch
# abbreviates the printout of a tensor this large with '...'.
vectors

tensor([[[-0.0621,  0.1021, -0.0250,  ..., -0.1021, -0.0638, -0.0200],
         [-0.1524,  0.2207, -0.1002,  ...,  0.0227, -0.1117, -0.2606],
         [ 0.1015,  0.1897,  0.0340,  ..., -0.1063, -0.1235, -0.0184],
         ...,
         [-0.0180,  0.0679, -0.0221,  ..., -0.0988, -0.0824,  0.0349],
         [-0.0180,  0.0679, -0.0221,  ..., -0.0988, -0.0824,  0.0349],
         [-0.0180,  0.0679, -0.0221,  ..., -0.0988, -0.0824,  0.0349]],

        [[-0.0525,  0.1079, -0.0360,  ..., -0.0786, -0.0604, -0.0311],
         [-0.2472,  0.1690, -0.0486,  ..., -0.1750,  0.1431, -0.2118],
         [-0.0501,  0.0207,  0.1038,  ..., -0.1472,  0.0750, -0.0438],
         ...,
         [ 0.0008,  0.1428, -0.0237,  ..., -0.0710, -0.0644,  0.0336],
         [ 0.0008,  0.1428, -0.0237,  ..., -0.0710, -0.0644,  0.0336],
         [ 0.0008,  0.1428, -0.0237,  ..., -0.0710, -0.0644,  0.0336]],

        [[-0.0504,  0.1014, -0.0167,  ..., -0.1163, -0.0670, -0.0075],
         [-0.0319,  0.5034,  0.2318,  ...,  0

In [19]:
print("\n" + "="*50)
print("STEP 9: Save Your Vectors")
print("="*50)

# Single source of truth for the output location (was repeated as a literal).
vectors_path = 'complaint_vectors.pt'

# Bundle the embeddings with everything needed to interpret them later:
# the token IDs, attention masks, labels and the original texts.
torch.save({
    'vectors': vectors,
    'input_ids': input_ids,
    'attention_masks': attention_masks,
    'labels': torch.tensor(labels),
    'texts': complaint_texts
}, vectors_path)

print(f"✅ Vectors saved to '{vectors_path}'")

# Load vectors back. weights_only=True restricts unpickling to tensors and
# plain containers/primitives, which is all this file holds, so loading
# cannot execute arbitrary pickled code and behaves the same regardless of
# the installed PyTorch version's default.
loaded_data = torch.load(vectors_path, weights_only=True)
print(f"✅ Loaded back! Shape: {loaded_data['vectors'].shape}")



STEP 9: Save Your Vectors
✅ Vectors saved to 'complaint_vectors.pt'
✅ Loaded back! Shape: torch.Size([5, 64, 768])


In [20]:
# Section banner for the bring-your-own-CSV example (rule, title, rule).
print("\n" + "="*50, "STEP 10: Example with Your Own Data", "="*50, sep="\n")

# This is how you would use it with your own CSV file
def process_your_complaint_data(csv_file_path, complaint_column_name,
                                tokenizer=None, model=None, max_length=128):
    """
    Read complaint texts from a CSV file and convert them to vectors.

    Args:
        csv_file_path: Path of the CSV file to read with pandas.
        complaint_column_name: Name of the column holding the complaint text.
        tokenizer: Optional already-loaded tokenizer. When None, the
            'roberta-base' tokenizer is loaded.
        model: Optional already-loaded model. When None, the 'roberta-base'
            model is loaded. The model is switched to evaluation mode either
            way.
        max_length: Number of token positions each text is padded/truncated
            to (forwarded to convert_texts_to_vectors).

    Returns:
        Tuple ``(vectors, input_ids, attention_masks, complaint_texts)``.
        ``complaint_texts`` has one entry per CSV row, in row order; rows
        whose complaint cell is missing are represented by an empty string.

    Raises:
        KeyError: If ``complaint_column_name`` is not a column of the CSV.
    """
    print(f"Reading data from: {csv_file_path}")

    # Read your CSV file
    df = pd.read_csv(csv_file_path)
    print(f"Data shape: {df.shape}")

    if complaint_column_name not in df.columns:
        raise KeyError(
            f"Column '{complaint_column_name}' not found in {csv_file_path}; "
            f"available columns: {list(df.columns)}"
        )

    # Get complaint texts. Missing cells become "" rather than the literal
    # string "nan" that a plain astype(str) would produce and embed; keeping
    # one entry per row preserves alignment with the CSV.
    raw_values = df[complaint_column_name].tolist()
    missing_count = sum(1 for value in raw_values if pd.isna(value))
    if missing_count:
        print(f"⚠️ {missing_count} rows have no complaint text; using empty strings for them")
    complaint_texts = ["" if pd.isna(value) else str(value) for value in raw_values]
    print(f"Found {len(complaint_texts)} complaint texts")

    # Load tokenizer and model only when the caller did not supply them, so
    # repeated calls can share one loaded copy.
    if tokenizer is None:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    if model is None:
        model = RobertaModel.from_pretrained('roberta-base')
    model.eval()

    # Convert to vectors
    vectors, input_ids, attention_masks = convert_texts_to_vectors(
        complaint_texts, tokenizer, model, max_length=max_length
    )

    return vectors, input_ids, attention_masks, complaint_texts


STEP 10: Example with Your Own Data
