# Case 03 - Task 03
Kia Vehicle Reviews Classification and Sentiment Analysis

In [5]:
# Install the required packages (uncomment the lines below and run once in a fresh environment)
# !conda install -y pandas matplotlib seaborn -c conda-forge
# !pip install transformers
# !conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=11.8 -c pytorch -c nvidia
# !conda  install numpy==1.26.4
# !conda install ipywidgets
# !conda install -c conda-forge datasets

In [6]:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import Dataset
from transformers import pipeline

## Load the Dataset
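Begin by reading the `car_reviews.csv` file, which contains roughly 6,000 customer reviews (5,959 rows in the run recorded below). Its single `Review` column is renamed to `review` before the data is converted to a HuggingFace `Dataset`.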

In [7]:
# Read the raw reviews with pandas and print the header exactly as it appears in the CSV.
df = pd.read_csv("car_reviews.csv")
print(df.columns)

# Lower-case the text column name, then wrap the frame as a HuggingFace Dataset.
df = df.rename({"Review": "review"}, axis="columns")
dataset = Dataset.from_pandas(df)

Index(['Review'], dtype='object')


## Select a Zero-Shot Classification Model
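We use `facebook/bart-large-mnli` as a zero-shot classifier: each review is scored against a fixed list of candidate topic labels, and the top-ranked label is kept as the review's topic.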


In [None]:
# Zero-shot classification pipeline.
# Use the first CUDA GPU when one is visible and fall back to CPU (-1)
# otherwise, so the cell does not fail on a machine without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

# Topic labels every review is scored against.
candidate_labels = [
    "talks about driving experience",
    "talks about features",
    "talks about value for money",
    "talks about issues",
    "other"
]

def classify_batch(batch):
    """Pick the top-ranked topic label for every review in a batch.

    Args:
        batch: Mapping with a "review" key holding a list of review texts.

    Returns:
        dict with a single key, "talks_about", holding one label per review
        (the first entry of each result's "labels" list).
    """
    result = classifier(batch['review'], candidate_labels)
    # The zero-shot pipeline is documented to return either one dict or a
    # list of dicts; normalise so a bare dict cannot break the loop below.
    if isinstance(result, dict):
        result = [result]
    return {"talks_about": [r["labels"][0] for r in result]}

# Label every review with its topic, sending 16 reviews to classify_batch per call.
dataset = dataset.map(
    classify_batch,
    batched=True,
    batch_size=16,
)


Device set to use cuda:0


Map:   0%|          | 0/5959 [00:00<?, ? examples/s]

## Sentiment Analysis
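For sentiment analysis, we use `cardiffnlp/twitter-roberta-base-sentiment`. This checkpoint reports its classes as `LABEL_0`, `LABEL_1` and `LABEL_2`, which its model card maps to negative, neutral and positive respectively.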

In [None]:
# Sentiment analysis
# (`pipeline` is already imported in the notebook's import cell.)

# Load the sentiment model with truncation enabled so over-long reviews are
# cut to 512 tokens.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment",
    truncation=True,
    max_length=512,
    # First CUDA GPU when one is visible, otherwise CPU (-1), so the cell
    # does not fail on a machine without a GPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Define function for batch sentiment prediction
def predict_sentiment(batch):
    """Predict a human-readable sentiment for every review in a batch.

    Args:
        batch: Mapping with a "review" key holding a list of review texts.

    Returns:
        dict with a single key, "sentiment", holding one label per review.
        The generic ids LABEL_0 / LABEL_1 / LABEL_2 are translated to
        "negative" / "neutral" / "positive"; any other label is passed
        through unchanged.
    """
    # The cardiffnlp/twitter-roberta-base-sentiment model card documents the
    # class order as 0 -> negative, 1 -> neutral, 2 -> positive.
    readable = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive",
    }
    result = sentiment_model(batch["review"], truncation=True, max_length=512)
    return {"sentiment": [readable.get(r["label"], r["label"]) for r in result]}

# Add the sentiment column, sending 16 reviews to predict_sentiment per call.
dataset = dataset.map(
    predict_sentiment,
    batched=True,
    batch_size=16,
)

## Convert back to DataFrame

In [None]:
# Turn the annotated HuggingFace dataset back into a pandas frame and preview the first rows.
df_final = dataset.to_pandas()
df_final.head(n=5)

## Visualize sentiment & topics

In [None]:
# Sentiment counts
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df_final, x="sentiment", ax=ax)
ax.set_title("Sentiment Distribution")
plt.show()

# Topic counts, most frequent topic first
topic_order = df_final["talks_about"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df_final, y="talks_about", order=topic_order, ax=ax)
ax.set_title("Review Topic Distribution")
plt.show()

## Save Results

In [None]:
# Write the reviews with their predicted topic and sentiment to disk, without the index column.
df_final.to_csv(
    "car_reviews_with_predictions.csv",
    index=False,
)