In [1]:
!pip install keras



HBox(children=(FloatProgress(value=0.0, description='tokenizer_config.json', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='vocab.json', max=898823.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='merges.txt', max=456318.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='tokenizer.json', max=1355863.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='config.json', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='model.safetensors', max=498818054.0, style=ProgressStyle(…




KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to safely read files
def read_word_set(filename: str) -> set:
    try:
        with open(filename, 'r') as file:
            return set(file.read().splitlines())
    except FileNotFoundError:
        logger.error(f"File not found: {filename}")
        return set()

# Load data and word sets
try:
    df = pd.read_csv('merged_output.csv')
    positive_words = read_word_set('positive-words.txt')
    negative_words = read_word_set('negative-words.txt')
    depressed_words = read_word_set('depressedword.txt')
except Exception as e:
    logger.error(f"Error loading data: {e}")
    raise

def label_sentiment(text: str) -> int:
    words = set(text.lower().split())
    positive_count = len(words.intersection(positive_words))
    negative_count = len(words.intersection(negative_words))
    depressed_count = len(words.intersection(depressed_words))
    
    if depressed_count > max(positive_count, negative_count):
        return 2  # Depressed
    elif negative_count > positive_count:
        return 1  # Negative
    else:
        return 0  # Positive

# Create labels based on word counts
df['label'] = df['content'].apply(label_sentiment)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def encode_text(texts, max_length=128):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_text(X_train)
test_encodings = encode_text(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create DataLoader for training and testing
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load the RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    logger.info(f'Epoch {epoch + 1}/{num_epochs} completed')

# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
logger.info(f'Test accuracy: {accuracy:.4f}')

# Function to predict sentiment
def predict_sentiment(text: str) -> str:
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()
    
    if predicted_class == 2:
        return f"The user might be depressed. Confidence: {confidence:.2f}"
    elif predicted_class == 1:
        return f"The user is in a negative mood. Confidence: {confidence:.2f}"
    else:
        return f"The user is in a positive mood. Confidence: {confidence:.2f}"

# Test the model with some example conversations
examples = [
    "I feel so hopeless and sad all the time.",
    "I'm excited about my new job and looking forward to the future.",
    "I don't know if life is worth living anymore.",
    "I had a great day with my friends and family.",
    "This weather is terrible, and I'm having a bad day.",
    "I'm feeling a bit down today, but I'm sure things will get better."
]

for example in examples:
    logger.info(f"Text: {example}")
    logger.info(f"Prediction: {predict_sentiment(example)}\n")

# Save the model
try:
    torch.save(model.state_dict(), 'sentiment_roberta_model.pth')
    tokenizer.save_pretrained('sentiment_roberta_tokenizer')
    logger.info("Model and tokenizer saved successfully.")
except Exception as e:
    logger.error(f"Error saving model and tokenizer: {e}")

# To load the model and tokenizer later, you can use:
# loaded_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
# loaded_model.load_state_dict(torch.load('sentiment_roberta_model.pth'))
# loaded_tokenizer = RobertaTokenizer.from_pretrained('sentiment_roberta_tokenizer')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Epoch 1/3 completed
INFO:__main__:Epoch 2/3 completed
INFO:__main__:Epoch 3/3 completed
INFO:__main__:Test accuracy: 0.9766
INFO:__main__:Text: I feel so hopeless and sad all the time.
INFO:__main__:Prediction: The user is in a negative mood. Confidence: 0.99

INFO:__main__:Text: I'm excited about my new job and looking forward to the future.
INFO:__main__:Prediction: The user is in a positive mood. Confidence: 1.00

INFO:__main__:Text: I don't know if life is worth living anymore.
INFO:__main__:Prediction: The user is in a positive mood. Confidence: 1.00

INFO:__main__:Text: I had a great day with my friends and fami

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the dataset
df = pd.read_csv('merged_output.csv')

# Load positive and depressed words
with open('positive-words.txt', 'r') as file:
    positive_words = set(file.read().splitlines())

with open('depressedword.txt', 'r') as file:
    depressed_words = set(file.read().splitlines())

# Function to label the sentiment
def label_sentiment(text):
    words = set(text.lower().split())
    positive_count = len(words.intersection(positive_words))
    depressed_count = len(words.intersection(depressed_words))
    
    if depressed_count > positive_count:
        return 2  # Depressed
    elif positive_count > 0:
        return 1  # Positive
    else:
        return 0  # Neutral/Negative

# Create labels based on word counts
df['label'] = df['content'].apply(label_sentiment)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize and encode the text data
def encode_text(texts, max_length=128):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = encode_text(X_train)
test_encodings = encode_text(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create DataLoader for training and testing
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Load the RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3:  16%|████▋                         | 135/855 [11:34<55:26,  4.62s/it]

In [None]:
def predict_sentiment(text: str) -> str:
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()
    
    if predicted_class == 2:
        return f"The user might be depressed. Confidence: {confidence:.2f}"
    elif predicted_class == 1:
        return f"The user is in a negative mood. Confidence: {confidence:.2f}"
    else:
        return f"The user is in a positive mood. Confidence: {confidence:.2f}"

# Test the model with some example conversations
examples = [
        "I feel so hopeless and sad all the time.",
        "I'm excited about my new job and looking forward to the future.",
        "I don't know if life is worth living anymore.",
        "I had a great day with my friends and family.",
        "This weather is terrible, and I'm having a bad day.",
        "I'm feeling a bit down today, but I'm sure things will get better.",
        "I'm really proud of what I accomplished today.",
        "Sometimes I wonder if I'll ever find my way.",
        "I had a productive meeting, but I still have a lot on my plate.",
        "Today was just another ordinary day.",
        "I'm thrilled about my upcoming vacation!",
        "I'm feeling overwhelmed by everything that’s happening.",
        "I'm content with where I am in life right now.",
        "I can't shake off this feeling of dread.",
        "I had a fun time at the park with my friends last weekend."
    ]

for example in examples:
    logger.info(f"Text: {example}")
    logger.info(f"Prediction: {predict_sentiment(example)}\n")

# Save the model
try:
    torch.save(model.state_dict(), 'sentiment_roberta_model.pth')
    tokenizer.save_pretrained('sentiment_roberta_tokenizer')
    logger.info("Model and tokenizer saved successfully.")
except Exception as e:
    logger.error(f"Error saving model and tokenizer: {e}")