<a href="https://colab.research.google.com/github/katemayuri/speech_diamond_analysis/blob/main/DST_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch scikit-learn pandas matplotlib


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd
import json
import os

# load dataset
data = []
data_dir="combined_dataset.json"
with open(data_dir, "r") as f:
  data = json.load(f)

# Flatten sentences into a single string
for d in data:
    d["text"] = " ".join(d["sentence"])

# Convert to DataFrame
df = pd.DataFrame(data)

# Split into train and test datasets
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [3]:
from transformers import AutoTokenizer

# Load tokenizer for sentiment analysis
sentiment_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize train and test data
train_sentiment_encodings = sentiment_tokenizer(list(train_df["text"]), truncation=True, padding=True, max_length=256)
test_sentiment_encodings = sentiment_tokenizer(list(test_df["text"]), truncation=True, padding=True, max_length=256)

# Convert sentiment labels to numeric
label_mapping = {"positive": 1, "negative": 0, "neutral": 2}
train_sentiment_labels = train_df["sentiment"].map(label_mapping).tolist()
test_sentiment_labels = test_df["sentiment"].map(label_mapping).tolist()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
from transformers import T5Tokenizer

# Load tokenizer for trends extraction
trends_tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Format the data for T5
train_trends_data = [{"input": "extract trends: " + row["text"], "target": ", ".join(row["trends"])} for _, row in train_df.iterrows()]
test_trends_data = [{"input": "extract trends: " + row["text"], "target": ", ".join(row["trends"])} for _, row in test_df.iterrows()]


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Load DistilBERT model
sentiment_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results_sentiment",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Prepare dataset
import torch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_sentiment_encodings, train_sentiment_labels)
test_dataset = SentimentDataset(test_sentiment_encodings, test_sentiment_labels)

# Initialize Trainer
trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.4378,0.397502
2,0.251,0.434833
3,0.1241,0.543774


TrainOutput(global_step=1713, training_loss=0.2481563476811914, metrics={'train_runtime': 378.2403, 'train_samples_per_second': 36.199, 'train_steps_per_second': 4.529, 'total_flos': 906887983970304.0, 'train_loss': 0.2481563476811914, 'epoch': 3.0})

In [6]:
from transformers import T5ForConditionalGeneration

# Load T5 model
trends_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Convert data to PyTorch Dataset
class TrendsDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_encodings = self.tokenizer(item["input"], truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        target_encodings = self.tokenizer(item["target"], truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": input_encodings["input_ids"].squeeze(),
            "attention_mask": input_encodings["attention_mask"].squeeze(),
            "labels": target_encodings["input_ids"].squeeze()
        }

train_trends_dataset = TrendsDataset(train_trends_data, trends_tokenizer)
test_trends_dataset = TrendsDataset(test_trends_data, trends_tokenizer)

# Define training arguments
trends_training_args = TrainingArguments(
    output_dir="./results_trends",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize Trainer
trends_trainer = Trainer(
    model=trends_model,
    args=trends_training_args,
    train_dataset=train_trends_dataset,
    eval_dataset=test_trends_dataset
)

# Train the model
trends_trainer.train()


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.213,0.161261
2,0.1721,0.138908
3,0.1603,0.134777


TrainOutput(global_step=3423, training_loss=0.41775847470380745, metrics={'train_runtime': 653.3718, 'train_samples_per_second': 20.956, 'train_steps_per_second': 5.239, 'total_flos': 926549972877312.0, 'train_loss': 0.41775847470380745, 'epoch': 3.0})

In [7]:
results = trainer.evaluate()
print("Sentiment Analysis Evaluation:", results)


Sentiment Analysis Evaluation: {'eval_loss': 0.5437739491462708, 'eval_runtime': 7.6176, 'eval_samples_per_second': 149.784, 'eval_steps_per_second': 18.772, 'epoch': 3.0}


In [8]:
trends_results = trends_trainer.evaluate()
print("Trends Extraction Evaluation:", trends_results)


Trends Extraction Evaluation: {'eval_loss': 0.1347769945859909, 'eval_runtime': 13.2569, 'eval_samples_per_second': 86.069, 'eval_steps_per_second': 10.787, 'epoch': 3.0}


In [9]:
# Test input
test_text = "Lab-grown diamonds are becoming increasingly popular among millennials."

# Sentiment Analysis
inputs = sentiment_tokenizer(test_text, return_tensors="pt", truncation=True, max_length=256).to("cuda")
sentiment_logits = sentiment_model(**inputs).logits
sentiment = torch.argmax(sentiment_logits, dim=-1).item()
sentiment_label = {1: "positive", 0: "negative", 2: "neutral"}[sentiment]
print("Sentiment:", sentiment_label)

# Trends Extraction
inputs = trends_tokenizer("extract trends: " + test_text, return_tensors="pt", truncation=True, max_length=256).to("cuda")
outputs = trends_model.generate(inputs["input_ids"], max_length=50)
trends = trends_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Trends:", trends)


Sentiment: positive
Trends: 
