In [17]:
from datasets import load_dataset

ds = load_dataset("Yelp/yelp_review_full")

In [18]:
train_sample = ds['train'].shuffle(seed=42).select(range(1500))
test_sample = ds['test'].shuffle(seed=42).select(range(1500))

In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from transformers import pipeline

In [4]:
model_name = "distilbert/distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
sentiment_labels = ["Negative", "Neutral", "Positive"]


In [7]:
train_df = train_sample.to_pandas()

In [6]:
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device='cuda')

In [None]:
# BERT
train_df['sentiment'] = train_df['text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [14]:
import pandas as pd

def get_sentiment(text):
    prompt = f"Given the following text, what is the sentiment (Negative, Neutral, Positive)? One word response, and it sohuld be the sentiment label. Text: '{text}' Sentiment:"
    print(prompt)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=5000)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print(response)
    if "Sentiment:" in response:
        sentiment = response.split("Sentiment:")[1].strip().split("\n")[0].strip()
        explanation = response.split("Sentiment:")[-1].strip().split("\n", 1)[-1].strip()
        return pd.Series([sentiment, explanation], index=['sentiment', 'explanation'])
    else:
        return pd.Series(["Sentiment not found", "Explanation not available"], index=['sentiment', 'explanation'])

In [16]:
train_df[['sentiment', 'explanation']] = train_df['text'].apply(get_sentiment)

Given the following text, what is the sentiment (Negative, Neutral, Positive)? One word response, and it sohuld be the sentiment label. Text: 'I stalk this truck.  I've been to industrial parks where I pretend to be a tech worker standing in line, strip mall parking lots, and of course the farmer's market.  The bowls are so so absolutely divine.  The owner is super friendly and he makes each bowl by hand with an incredible amount of pride.  You gotta eat here guys!!!' Sentiment:


KeyboardInterrupt: 

In [None]:
train_df