# In [None]:
# ===================================================================
# STEP 1: INSTALL AND IMPORT LIBRARIES
# ===================================================================
# First, install required libraries:
# pip install transformers torch pandas numpy

import torch
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
import numpy as np

# Reaching this line means every import above resolved.
print("Libraries imported successfully!")

# ===================================================================
# STEP 2: LOAD THE ROBERTA TOKENIZER
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 2: Loading RoBERTa Tokenizer")
print("=" * 50)

# Fetch the pretrained 'roberta-base' tokenizer (downloaded on first use)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Report the two tokenizer properties that matter for the later steps
print("✅ Tokenizer loaded!")
print("Vocabulary size: {}".format(tokenizer.vocab_size))
print("Maximum sequence length: {}".format(tokenizer.model_max_length))

# ===================================================================
# STEP 3: PREPARE SAMPLE COMPLAINT DATA
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 3: Prepare Sample Data")
print("=" * 50)

# Example inputs; with real data these would come from your complaint column
complaint_texts = [
    "I am very disappointed with the service quality",
    "The product arrived damaged and I want a refund",
    "Excellent service, very satisfied with the purchase",
    "Delivery was late and customer support was rude",
    "Amazing product, exactly what I ordered",
]

# One label per text, in the same order (1 = complaint, 0 = not complaint)
labels = [1, 1, 0, 1, 0]

# List every sample with a 1-based index and its human-readable label
print("Sample complaint texts:")
for i, text in enumerate(complaint_texts):
    if labels[i] == 1:
        label_text = "Complaint"
    else:
        label_text = "Not Complaint"
    print("{}. [{}] {}".format(i + 1, label_text, text))

# ===================================================================
# STEP 4: SEE HOW TOKENIZER BREAKS DOWN TEXT
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 4: Understanding Tokenization")
print("=" * 50)

# Walk the first sample through the tokenizer one stage at a time
sample_text = complaint_texts[0]
print("Original text: '{}'".format(sample_text))

# Stage 4a: text -> list of token strings
tokens = tokenizer.tokenize(sample_text)
print("Broken into tokens: {}".format(tokens))

# Stage 4b: token strings -> integer vocabulary IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token numbers: {}".format(token_ids))

# Stage 4c: IDs -> text again, as a round-trip sanity check
decoded_text = tokenizer.decode(token_ids)
print("Decoded back: '{}'".format(decoded_text))

# ===================================================================
# STEP 5: CONVERT ONE TEXT TO VECTORS (SIMPLE WAY)
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 5: Convert Single Text to Vectors")
print("=" * 50)

# A single example sentence to encode
single_text = "The service was terrible and I want my money back"
print("Converting text: '{}'".format(single_text))

# Encode to fixed-length ID tensors:
#   - return_tensors='pt' -> PyTorch tensors
#   - truncation=True     -> longer texts are cut to max_length
#   - padding='max_length'-> shorter texts are padded up to max_length
#   - max_length=50       -> target length (adjust as needed)
encoded = tokenizer(
    single_text,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
    max_length=50,
)

print("Input IDs shape: {}".format(encoded['input_ids'].shape))
print("Input IDs: {}".format(encoded['input_ids']))
print("Attention mask: {}".format(encoded['attention_mask']))

# ===================================================================
# STEP 6: LOAD ROBERTA MODEL TO GET ACTUAL VECTORS
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 6: Load RoBERTa Model")
print("=" * 50)

# Pretrained 'roberta-base' encoder, switched to evaluation mode right away
model = RobertaModel.from_pretrained("roberta-base")
model.eval()

print("✅ RoBERTa model loaded!")
# Expected to be 768 according to the original note
print("Model hidden size: {}".format(model.config.hidden_size))

# ===================================================================
# STEP 7: GET ACTUAL VECTORS FROM TEXT
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 7: Convert Text to Actual Vectors")
print("=" * 50)

# Reuse the tensors produced by the tokenizer call in step 5
input_ids, attention_mask = encoded['input_ids'], encoded['attention_mask']

print("Input shape: {}".format(input_ids.shape))

# Forward pass only; gradient tracking is off because nothing is trained here
with torch.no_grad():
    outputs = model(attention_mask=attention_mask, input_ids=input_ids)

# last_hidden_state carries the embedding vectors for the encoded text
vectors = outputs.last_hidden_state
print("Output vectors shape: {}".format(vectors.shape))
print("Each word is now a {}-dimensional vector!".format(vectors.shape[-1]))

# Peek at the leading 10 components of the vector at position 0 of the first text
print("First word vector (first 10 dimensions): {}".format(vectors[0, 0, :10]))

# ===================================================================
# STEP 8: PROCESS MULTIPLE TEXTS (YOUR COMPLAINT COLUMN)
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 8: Process Multiple Complaint Texts")
print("=" * 50)

# Function to convert multiple texts to vectors
def convert_texts_to_vectors(texts, tokenizer, model, max_length=128, batch_size=16):
    """
    Convert a collection of texts into per-token vectors.

    Texts are tokenized with padding='max_length' and truncation=True, so
    every text occupies exactly ``max_length`` token positions. They are sent
    through the model in batches of ``batch_size`` instead of one forward
    pass per text.

    Args:
        texts: Iterable of strings to embed. May be empty.
        tokenizer: Tokenizer callable (e.g. the RobertaTokenizer loaded in
            this file). It is called with a list of texts and must return a
            mapping with 'input_ids' and 'attention_mask' tensors.
        model: Encoder callable (e.g. the RobertaModel loaded in this file)
            whose output exposes ``last_hidden_state``. Only for empty input,
            ``model.config.hidden_size`` is read to shape the result.
        max_length: Number of token positions each text is padded or
            truncated to.
        batch_size: How many texts go through the model per forward pass.

    Returns:
        Tuple ``(vectors, input_ids, attention_masks)`` concatenated along
        dimension 0, one row per input text in the original order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        All vectors are kept in memory until the function returns, so memory
        use grows linearly with the number of texts.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once so generators and pandas Series work with len()/slicing
    texts = list(texts)
    print(f"Processing {len(texts)} texts...")

    # torch.cat() rejects an empty list, so return correctly shaped empty
    # tensors instead of crashing when there is nothing to process.
    if not texts:
        hidden_size = model.config.hidden_size
        return (
            torch.empty((0, max_length, hidden_size)),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    all_vectors = []
    all_input_ids = []
    all_attention_masks = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Progress output: only add the ellipsis when the preview is truncated
        for number, text in enumerate(batch, start=start + 1):
            preview = text if len(text) <= 50 else f"{text[:50]}..."
            print(f"Processing text {number}: '{preview}'")

        # Tokenize the whole batch at once; fixed-length padding keeps every
        # batch the same width so the results can be concatenated below.
        encoded = tokenizer(
            batch,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Inference only: skip gradient tracking to save memory
        with torch.no_grad():
            outputs = model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask']
            )

        all_vectors.append(outputs.last_hidden_state)
        all_input_ids.append(encoded['input_ids'])
        all_attention_masks.append(encoded['attention_mask'])

    # Stitch the per-batch results back into one tensor per output
    return (
        torch.cat(all_vectors, dim=0),
        torch.cat(all_input_ids, dim=0),
        torch.cat(all_attention_masks, dim=0),
    )

# Run the batch helper over every sample text, capping each at 64 positions
vectors, input_ids, attention_masks = convert_texts_to_vectors(
    complaint_texts,
    tokenizer,
    model,
    max_length=64,
)

print("\n✅ All texts converted!")
print("Final vectors shape: {}".format(vectors.shape))
# The three entries of vectors.shape fill the three placeholders in order
print(
    "This means: {} texts, {} words each, {} dimensions per word".format(
        vectors.shape[0], vectors.shape[1], vectors.shape[2]
    )
)

# ===================================================================
# STEP 9: SAVE AND LOAD YOUR VECTORS
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 9: Save Your Vectors")
print("=" * 50)

# Bundle tensors, labels and the raw texts into one file on disk
torch.save(
    dict(
        vectors=vectors,
        input_ids=input_ids,
        attention_masks=attention_masks,
        labels=torch.tensor(labels),
        texts=complaint_texts,
    ),
    'complaint_vectors.pt',
)

print("✅ Vectors saved to 'complaint_vectors.pt'")

# Read the file back to confirm the round trip works
loaded_data = torch.load('complaint_vectors.pt')
print("✅ Loaded back! Shape: {}".format(loaded_data['vectors'].shape))

# ===================================================================
# STEP 10: SIMPLE EXAMPLE WITH YOUR OWN DATA
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 10: Example with Your Own Data")
print("=" * 50)

# This is how you would use it with your own CSV file
def process_your_complaint_data(csv_file_path, complaint_column_name):
    """
    Read complaint texts from a CSV file and convert them to vectors.

    Args:
        csv_file_path: Path of the CSV file, passed to ``pd.read_csv``.
        complaint_column_name: Name of the column holding the complaint text.

    Returns:
        Tuple ``(vectors, input_ids, attention_masks, complaint_texts)``: the
        three tensors produced by ``convert_texts_to_vectors`` plus the list
        of texts that were embedded, one entry per CSV row in file order.

    Raises:
        KeyError: If ``complaint_column_name`` is not a column of the CSV.

    Note:
        The 'roberta-base' tokenizer and model are loaded on every call.
    """
    print(f"Reading data from: {csv_file_path}")

    # Read your CSV file
    df = pd.read_csv(csv_file_path)
    print(f"Data shape: {df.shape}")

    # Fail with an actionable message; KeyError is what df[...] would raise
    if complaint_column_name not in df.columns:
        raise KeyError(
            f"Column '{complaint_column_name}' not found in {csv_file_path}; "
            f"available columns: {list(df.columns)}"
        )

    # Convert cells to strings, but map missing cells to "" rather than
    # letting astype(str) turn them into the literal text "nan". Rows are
    # kept (not dropped) so the output stays aligned with the CSV rows.
    column = df[complaint_column_name]
    complaint_texts = column.astype(str).where(column.notna(), "").tolist()
    print(f"Found {len(complaint_texts)} complaint texts")

    # Load tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')
    model.eval()

    # Convert to vectors
    vectors, input_ids, attention_masks = convert_texts_to_vectors(
        complaint_texts, tokenizer, model
    )

    return vectors, input_ids, attention_masks, complaint_texts

# Example usage (uncomment when you have your data):
# vectors, input_ids, attention_masks, texts = process_your_complaint_data(
#     'your_data.csv', 'complaint_column'
# )

# ===================================================================
# STEP 11: WHAT HAPPENS NEXT (FOR YOUR MODEL)
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 11: What Happens Next")
print("=" * 50)

# Recap of where the vectors are headed in the downstream pipeline
print("Now you have vectors for your complaints!")
print("Vector shape: {}".format(vectors.shape))
print(
    "\nThese vectors will go to:",
    "1. BiGRU layer (processes sequence)",
    "2. Attention layer (focuses on important parts)",
    "3. CentralNet (makes final prediction)",
    "\nResult: Complaint or Not Complaint prediction!",
    sep="\n",
)

# ===================================================================
# STEP 12: QUICK TEST - PREDICT NEW COMPLAINT
# ===================================================================
print(f"\n{'=' * 50}")
print("STEP 12: Test with New Complaint")
print("=" * 50)

# A previously unseen complaint to push through the same pipeline
new_complaint = "This product is broken and customer service won't help me"
print("New complaint: '{}'".format(new_complaint))

# Tokenize to a fixed length of 64 positions, as PyTorch tensors
encoded = tokenizer(
    new_complaint,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
    max_length=64,
)

# Inference-only forward pass (no gradient tracking)
with torch.no_grad():
    outputs = model(
        attention_mask=encoded['attention_mask'],
        input_ids=encoded['input_ids'],
    )
    new_vectors = outputs.last_hidden_state

print("New complaint vector shape: {}".format(new_vectors.shape))
print("✅ Ready to be processed by your BiGRU + Attention + CentralNet model!")

# Closing banner (70 characters wide) followed by the five-point recap
print(f"\n{'=' * 70}")
print("SUMMARY: YOUR COMPLAINT TEXTS ARE NOW VECTORS!")
print("=" * 70)
print(
    "1. ✅ Loaded RoBERTa tokenizer and model",
    "2. ✅ Converted complaint texts to token IDs",
    "3. ✅ Generated 768-dimensional vectors for each word",
    "4. ✅ These vectors capture the meaning of your complaints",
    "5. ✅ Ready for your BiGRU + Attention + CentralNet model!",
    sep="\n",
)