In [70]:
!pip install datasets transformers




In [71]:
# Import required libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Model

In [72]:
# Load the tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="../CentralCache")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, cache_dir="../CentralCache")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Load Dataset


In [73]:
# Load the dataset
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", cache_dir="./CentralCache")

# Display the structure of the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})


In [74]:
# Define the preprocess function
def preprocess_data(examples):
    """Map function to preprocess dataset."""
    encodings = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    encodings['labels'] = examples['label']  # Add labels
    return encodings

# Apply the preprocess function to the dataset
encoded_dataset = dataset.map(preprocess_data, batched=True)

# Ensure the dataset has the required columns
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

In [75]:
# Assuming encoded_dataset is a DatasetDict
# Use the 'train' split and split it further
train_test_split = encoded_dataset['train'].train_test_split(test_size=0.2)

# Update the DatasetDict with the new train and validation splits
encoded_dataset['train'] = train_test_split['train']
encoded_dataset['validation'] = train_test_split['test']

# Access the new train and validation splits
train_dataset = encoded_dataset['train']
val_dataset = encoded_dataset['validation']


In [76]:
# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [77]:
def compute_metrics(pred):
    """Calculate accuracy, precision, recall, and F1 score."""
    labels = pred.label_ids
    preds = torch.argmax(torch.tensor(pred.predictions), dim=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [78]:
from transformers import EarlyStoppingCallback

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",         # Evaluate every few steps
    save_strategy="steps",               # Save model every few steps
    save_steps=500,                      # Save every 500 steps
    learning_rate=3e-5,                  # Reduced learning rate for stability
    per_device_train_batch_size=16,      # Larger batch size if hardware allows
    per_device_eval_batch_size=16,
    num_train_epochs=3,                  # Retain epochs but use early stopping
    weight_decay=0.05,                   # Increased regularization
    logging_dir="./logs",
    logging_steps=100,                    # Log every 50 steps
    load_best_model_at_end=True,         # Load best model at the end
    metric_for_best_model="accuracy",    # Use accuracy as the metric for saving the best model
    greater_is_better=True               # Higher accuracy is better
)
# Initialize the Trainer
trainer = Trainer(
    model=model,                          # The model to train
    args=training_args,                   # The training arguments
    train_dataset=train_dataset,          # The training dataset
    eval_dataset=val_dataset,            # Use 'validation' split instead of 'test'
    tokenizer=tokenizer,                  # The tokenizer
    compute_metrics=compute_metrics,      # Function to compute metrics
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping
)

  trainer = Trainer(


In [79]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.7525,0.545833,0.798848,0.794785,0.804387,0.798848
200,0.558,0.460577,0.833944,0.832734,0.83185,0.833944
300,0.4727,0.424127,0.847564,0.846147,0.845616,0.847564
400,0.4545,0.393623,0.851755,0.851648,0.851575,0.851755
500,0.403,0.37476,0.864851,0.863471,0.862831,0.864851
600,0.276,0.421103,0.865375,0.862825,0.864211,0.865375
700,0.2766,0.375464,0.869565,0.868106,0.869535,0.869565
800,0.2777,0.397112,0.864327,0.862644,0.86243,0.864327
900,0.2636,0.384163,0.867994,0.864513,0.866215,0.867994
1000,0.2098,0.423666,0.87428,0.870813,0.872189,0.87428


TrainOutput(global_step=1300, training_loss=0.33697408309349647, metrics={'train_runtime': 348.8753, 'train_samples_per_second': 65.645, 'train_steps_per_second': 4.11, 'total_flos': 687915468997632.0, 'train_loss': 0.33697408309349647, 'epoch': 2.719665271966527})

In [80]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print(f"Evaluation results: {eval_results}")
eval_results

Evaluation results: {'eval_loss': 0.42366647720336914, 'eval_accuracy': 0.8742797276060765, 'eval_f1': 0.8708131219308641, 'eval_precision': 0.8721889099463794, 'eval_recall': 0.8742797276060765, 'eval_runtime': 6.993, 'eval_samples_per_second': 272.989, 'eval_steps_per_second': 17.16, 'epoch': 2.719665271966527}


{'eval_loss': 0.42366647720336914,
 'eval_accuracy': 0.8742797276060765,
 'eval_f1': 0.8708131219308641,
 'eval_precision': 0.8721889099463794,
 'eval_recall': 0.8742797276060765,
 'eval_runtime': 6.993,
 'eval_samples_per_second': 272.989,
 'eval_steps_per_second': 17.16,
 'epoch': 2.719665271966527}

In [81]:
model.save_pretrained("./distilbert-sentiment-classifier")
tokenizer.save_pretrained("./distilbert-sentiment-classifier")

('./distilbert-sentiment-classifier/tokenizer_config.json',
 './distilbert-sentiment-classifier/special_tokens_map.json',
 './distilbert-sentiment-classifier/vocab.txt',
 './distilbert-sentiment-classifier/added_tokens.json',
 './distilbert-sentiment-classifier/tokenizer.json')

In [82]:

from google.colab import drive

# Define the path to save the model in your Google Drive
save_directory_drive = "/content/drive/MyDrive/Task9a_model"

# Create the directory if it doesn't exist
!mkdir -p {save_directory_drive}

# Save the trained model and tokenizer to Google Drive
model.save_pretrained(save_directory_drive)
tokenizer.save_pretrained(save_directory_drive)

('/content/drive/MyDrive/Task9a_model/tokenizer_config.json',
 '/content/drive/MyDrive/Task9a_model/special_tokens_map.json',
 '/content/drive/MyDrive/Task9a_model/vocab.txt',
 '/content/drive/MyDrive/Task9a_model/added_tokens.json',
 '/content/drive/MyDrive/Task9a_model/tokenizer.json')

In [None]:
def predict_sentiment(text):
    """Classify sentiment for a given text."""
    tokenizer = AutoTokenizer.from_pretrained("./distilbert-sentiment-classifier")
    model = AutoModelForSequenceClassification.from_pretrained("./distilbert-sentiment-classifier")
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    sentiment = torch.argmax(probs, dim=1).item()
    labels = ["Bearish", "Neutral", "Bullish"]
    return labels[sentiment]

# Example usage
example_tweet = "The stock market is showing signs of recovery. Bullish sentiment ahead!"
print("Sentiment:", predict_sentiment(example_tweet))


In [84]:
example_tweet = "Today I'm making profit"
print("Sentiment:", predict_sentiment(example_tweet))

Sentiment: Bullish
