In [1]:
from datasets import load_dataset
from transformers import pipeline
import pandas as pd


# Load the Yelp full-review dataset; later cells sample from its 'train' and 'test' splits.
ds = load_dataset("Yelp/yelp_review_full")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Shuffle each split with a fixed seed and keep the first 1500 rows as a pandas DataFrame.
train_sample, test_sample = (
    ds[split_name].shuffle(seed=42).select(range(1500)).to_pandas()
    for split_name in ('train', 'test')
)

In [3]:
import torch  # imported in this cell only to probe for a usable GPU

# Candidate labels handed to the zero-shot classifier on every call.
labels = ["Positive", "Neutral", "Negative"]

# Prefer the GPU when one is visible, but fall back to CPU so the notebook still
# runs (more slowly) on machines without CUDA instead of erroring out here.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

def classify_sentiment(review):
    """Label one review with the zero-shot classifier using a short instruction prompt.

    The review text is embedded in an instruction-style prompt and scored against
    the module-level ``labels`` by the module-level ``classifier``.

    Args:
        review: Review text to classify.

    Returns:
        A ``(label, score)`` tuple holding the first entry of the classifier's
        ``'labels'`` and ``'scores'`` lists, which is treated here as the top prediction.
    """
    prompt = f"Please label the following Yelp review as Positive, Negative, or Neutral based on the sentiment:\n\nReview: \"{review}\"\nSentiment:"
    prediction = classifier(prompt, candidate_labels=labels)
    return prediction['labels'][0], prediction['scores'][0]

In [4]:
# Run the plain-prompt classifier over every sampled review, one call per review,
# and attach the predicted label and its score as two new columns.
train_sample[['sentiment', 'score']] = pd.DataFrame(
    [classify_sentiment(review_text) for review_text in train_sample['text']],
    index=train_sample.index,
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [5]:
# COT: chain-of-thought style prompt variant

def cot_classify_sentiment(review):
    """Label one review with the zero-shot classifier using a step-by-step prompt.

    Same contract as the plain-prompt classifier: the review is embedded in a
    longer prompt that spells out keyword-based reasoning steps, and that prompt
    is scored against the module-level ``labels`` by the module-level ``classifier``.

    NOTE(review): in the run recorded in this notebook this prompt labelled 1393 of
    1500 reviews 'Neutral' (the plain prompt produced 200). It is worth confirming
    whether the long instruction text is dominating the classifier input.

    Args:
        review: Review text to classify.

    Returns:
        A ``(label, score)`` tuple holding the first entry of the classifier's
        ``'labels'`` and ``'scores'`` lists, which is treated here as the top prediction.
    """
    prompt = f"""
    We are given a Yelp review, and we need to determine its sentiment (Positive, Negative, or Neutral). Let's break it down step-by-step:
    
    Step 1: First, we should check for positive keywords such as "great," "delicious," "amazing," or "friendly."
    Step 2: Next, we should check for negative keywords like "slow," "bad," "dirty," or "unfriendly."
    Step 3: If the review contains mostly positive words, we will label it as Positive. If it contains negative words, we will label it as Negative. If the review mentions both positive and negative aspects or feels neutral in tone, we will label it as Neutral.
    
    Review: "{review}"
    
    Based on this reasoning, label the sentiment of the review as one of the following: Positive, Negative, or Neutral.
    
    Sentiment:
    """
    outcome = classifier(prompt, candidate_labels=labels)
    return outcome['labels'][0], outcome['scores'][0]

In [6]:
# Run the step-by-step prompt classifier over every sampled review, one call per review,
# and attach the predicted label and its score as two new columns.
train_sample[['cot_sentiment', 'cot_score']] = pd.DataFrame(
    [cot_classify_sentiment(review_text) for review_text in train_sample['text']],
    index=train_sample.index,
)

In [7]:
# Label distribution produced by the step-by-step (CoT) prompt.
train_sample['cot_sentiment'].value_counts()

cot_sentiment
Neutral     1393
Positive      85
Negative      22
Name: count, dtype: int64

In [8]:
# Label distribution produced by the plain prompt, for comparison with the CoT counts.
train_sample['sentiment'].value_counts()

sentiment
Negative    717
Positive    583
Neutral     200
Name: count, dtype: int64

In [9]:
# Drop rows whose plain-prompt label was assigned with a score below 0.35.
# The cut-off is 0.35, not exactly 1/3: assuming the three label scores sum to 1,
# the top score can never fall below 1/3, so a 1/3 threshold would remove nothing.
# .copy() makes the result an independent frame, so later column assignments do not
# write into a filtered slice (the SettingWithCopyWarning pattern).
train_sample = train_sample[train_sample['score'] >= 0.35].copy()

In [10]:
# Display the filtered frame (rows with a plain-prompt score below 0.35 have been dropped).
train_sample

Unnamed: 0,label,text,sentiment,score,cot_sentiment,cot_score
0,4,I stalk this truck. I've been to industrial p...,Positive,0.427783,Neutral,0.558556
1,2,"who really knows if this is good pho or not, i...",Positive,0.473351,Neutral,0.602599
2,4,I LOVE Bloom Salon... all of their stylist are...,Positive,0.720248,Neutral,0.597057
3,0,"We were excited to eat here, it is difficult t...",Negative,0.492704,Neutral,0.599148
4,2,"So this is a place, with food. That much canno...",Neutral,0.433483,Neutral,0.658871
...,...,...,...,...,...,...
1495,3,Have been going to Pappaduex's off and on for ...,Neutral,0.364719,Neutral,0.576681
1496,3,"China Chili is not only a 4, it is a very soli...",Positive,0.756167,Neutral,0.477633
1497,4,If you are considering attending the Universit...,Positive,0.587826,Neutral,0.473443
1498,3,So far so good ! I personally did not eat it.....,Neutral,0.387424,Neutral,0.636017


In [53]:
# Spot-check the full text of the review at index label 1495.
train_sample.loc[1495, 'text']

"Have been going to Pappaduex's off and on for awhile now.  Seems like I go through spurts where I wind up there several times in a row then not going back for what seems like a year.  That said, this chain is consistently good.  The service is always good, some times better than others but never bad.  \\nIt is hard to find good creole and/or cajun food here and while this may not be considered authentic, it is good.  The alligator is lip smackin' good.\\nThe atmosphere is somewhat lacking as the acoustics make conversation almost impossible.  Not a good place for a romantic meal, but a good place to have fun with friends."

In [11]:
# Integer class ids for the three zero-shot sentiment labels.
sentiment_map = {
    'Positive': 2,
    'Negative': 1,
    'Neutral': 0
}

# Encode the string labels as integers. Values that are not keys of sentiment_map
# (for example already-encoded ints when this cell is run a second time) pass through
# unchanged, so re-running the cell no longer turns the whole column into NaN.
# .assign builds a new frame rather than writing into a possibly filtered slice.
train_sample = train_sample.assign(
    sentiment=train_sample['sentiment'].map(lambda value: sentiment_map.get(value, value))
)

In [36]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer here and the classification model loaded later.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 512 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry holding the text(s) to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    fixed_length_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['text'], **fixed_length_options)


In [42]:
# Replace the values in 'label' with the 'sentiment' column
# (presumably so the zero-shot labels act as the training target; confirm downstream).
train_sample = train_sample.assign(label=train_sample['sentiment'])

In [43]:
from datasets import Dataset
# Convert the frame to a Hugging Face Dataset. Rows were filtered out earlier, so the
# pandas index is no longer a plain 0..n-1 range; preserve_index=False stops that leftover
# index from being stored as an extra '__index_level_0__' column in the dataset.
dataset = Dataset.from_pandas(train_sample, preserve_index=False)

In [44]:
# Tokenize every review; batched=True hands tokenize_function a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1487/1487 [00:00<00:00, 2497.27 examples/s]


In [45]:
# Load the pretrained checkpoint with a 3-label sequence-classification head
# (one output per sentiment class); the head still has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Persist the tokenized dataset to a Parquet file in the working directory.
tokenized_dataset.to_parquet('tokenized_dataset.parquet')

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 61.50ba/s]


5759439