In [None]:
!pip install transformers datasets matplotlib seaborn


In [None]:
from datasets import load_dataset

# Load the dair-ai/emotion dataset into `dataset`.
dataset = load_dataset(path="dair-ai/emotion")




In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the emotion corpus and shuffle it with a fixed seed so that the
# subsets selected below are the same on every run.
dataset = load_dataset("dair-ai/emotion").shuffle(seed=42)

# Keep only the first rows of each shuffled split to keep the notebook fast.
for split_name, n_rows in (("train", 200), ("test", 50)):
    dataset[split_name] = dataset[split_name].select(range(n_rows))

# Tokenizer used to encode the text column.
# NOTE(review): this checkpoint differs from the "distilbert-base-uncased"
# model that is fine-tuned later in the notebook -- confirm that both share
# the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/distilbert-base-uncased-emotion")

# Batch tokenizer passed to `dataset.map`
def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the global ``tokenizer``.

    The call uses the ``"max_length"`` padding strategy and enables truncation.
    Returns whatever the tokenizer call returns.
    """
    texts = batch["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenize every split in batches, then rename the "label" column to "labels"
encoded_dataset = dataset.map(tokenize_function, batched=True).rename_column("label", "labels")

# Expose only the model inputs and the labels, formatted as torch tensors
model_columns = ["input_ids", "attention_mask", "labels"]
encoded_dataset.set_format(type="torch", columns=model_columns)


In [None]:
import pandas as pd

# Convert both encoded splits to pandas DataFrames for exploration.
df_train, df_test = (encoded_dataset[split].to_pandas() for split in ("train", "test"))
df_train.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training rows carry each numeric label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df_train["labels"], ax=ax)
ax.set_title("Label Distribution in 200-Sample Training Set")
ax.set_xlabel("Emotion Label")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Class names stored on the dataset's "label" feature, indexed by label id.
label_names = dataset["train"].features["label"].names

# Add the readable emotion name next to the numeric label of every row.
df_train["label_name"] = [label_names[label_id] for label_id in df_train["labels"]]
df_train.head()

In [None]:
# Number of whitespace-separated words in every training sentence.
df_train["word_count"] = [len(sentence.split()) for sentence in df_train["text"]]

# Histogram (with a KDE overlay) of the sentence lengths.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df_train["word_count"], bins=20, kde=True, ax=ax)
ax.set_title("Sentence Length Distribution")
ax.set_xlabel("Number of Words")
plt.show()


In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer loaded here and the model loaded
# from `model_name` afterwards.
model_name = "distilbert-base-uncased"
# Rebinds the global `tokenizer`, which was created earlier in the notebook
# from a different checkpoint.
# NOTE(review): `encoded_dataset` was built with the earlier tokenizer --
# confirm both tokenizers produce the same token ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [None]:
def tokenize(batch):
    """Tokenize ``batch['text']`` with the global ``tokenizer``.

    The call uses the ``'max_length'`` padding strategy and enables truncation.
    """
    encoding_options = {"padding": 'max_length', "truncation": True}
    return tokenizer(batch['text'], **encoding_options)

# Apply `tokenize` in batches to the train split, then to the test split.
# NOTE(review): `encoded_dataset` was already tokenized earlier and is what the
# Trainer is given later in the notebook -- confirm these two datasets are
# still needed.
tokenized_train, tokenized_test = (
    encoded_dataset[split].map(tokenize, batched=True) for split in ("train", "test")
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Derive the label mapping from the dataset instead of hard-coding the class
# count, and store it on the model config so that predictions (for example
# from a `pipeline`) are reported by emotion name rather than as LABEL_<id>.
emotion_names = dataset["train"].features["label"].names
id2label = dict(enumerate(emotion_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotion_names),  # one output per emotion class
    id2label=id2label,
    label2id=label2id,
)


In [None]:
from transformers import TrainingArguments

# Settings for one short fine-tuning run: a single epoch, batches of four for
# both training and evaluation, and reporting, logging and saving all set to
# "none"/"no".
training_config = {
    "output_dir": "emotion_model",
    "report_to": "none",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "logging_strategy": "no",
    "save_strategy": "no",
}
training_args = TrainingArguments(**training_config)

In [None]:
from transformers import Trainer

# Train on the reduced training split; the reduced test split is the eval set.
train_split = encoded_dataset["train"]
eval_split = encoded_dataset["test"]
trainer = Trainer(model=model, args=training_args, train_dataset=train_split, eval_dataset=eval_split)

trainer.train()


In [None]:
from transformers import pipeline

# Wrap the trained model and the current tokenizer in a text-classification pipeline
clf = pipeline(task="text-classification", tokenizer=tokenizer, model=model)

# Try the classifier on a single sentence
sample_text = "I am very happy today!"
clf(sample_text)


In [None]:
# Import Counter for word frequency calculation
from collections import Counter
import re # for simple text cleaning

# Basic stop-word list used when the caller does not supply one.
# A more complete list can be passed through the `stop_words` argument.
_DEFAULT_STOP_WORDS = frozenset(
    ['the', 'a', 'an', 'is', 'it', 'to', 'and', 'i', 'im', 'feeling', 'that', 'of', 'in', 'me']
)


def get_most_common_words(df, n=20, text_column='text', stop_words=None):
    """Return the ``n`` most frequent words in a text column of ``df``.

    The column is joined into one lower-cased string, every character that is
    neither a word character nor whitespace is removed, and the result is
    split on whitespace. Stop words and one-character tokens are discarded
    before counting.

    Args:
        df: DataFrame holding the text to analyse.
        n: Number of (word, count) pairs to return.
        text_column: Name of the column that contains the text.
        stop_words: Optional iterable of words to ignore (compared in lower
            case). When ``None`` the built-in basic list is used.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent; empty when the column holds no countable words.
    """
    if stop_words is None:
        ignored = _DEFAULT_STOP_WORDS
    else:
        ignored = {word.lower() for word in stop_words}

    # Skip missing entries and coerce the rest to str so that a stray NaN or
    # number in the column cannot break the join.
    texts = df[text_column].dropna().astype(str)
    all_text = ' '.join(texts).lower()

    # Simple tokenization: strip punctuation, then split on whitespace
    words = re.sub(r'[^\w\s]', '', all_text).split()

    counts = Counter(word for word in words if len(word) > 1 and word not in ignored)
    return counts.most_common(n)

# Word frequencies of the training set as a two-column DataFrame
common_words = get_most_common_words(df_train)
df_common_words = pd.DataFrame(common_words, columns=['Word', 'Frequency'])

print("Most Common Words in Training Data (Top 20):")
print(df_common_words)

# Horizontal bar chart of the ten most frequent words
fig, ax = plt.subplots(figsize=(10, 6))
top_ten = df_common_words.head(10)
sns.barplot(data=top_ten, x='Frequency', y='Word', ax=ax)
ax.set_title('Top 10 Most Common Words (Excluding Stop Words)')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Ensure these are imported earlier

# Metric helper: accuracy and weighted F1
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a pair of scores and labels.

    Args:
        eval_pred: A two-item iterable. The first item is reduced with
            ``argmax`` over axis 1 to obtain predicted class ids; the second
            item holds the reference labels.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1_weighted'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    return {
        'accuracy': accuracy_score(references, predicted_ids),
        'f1_weighted': f1_score(references, predicted_ids, average='weighted', zero_division=0),
    }


# Run the trained model over the test split and pick the highest-scoring class
test_results = trainer.predict(encoded_dataset["test"])
predictions = np.argmax(test_results.predictions, axis=1)

# Take the ground truth from the prediction output itself rather than from the
# dataset column: `label_ids` comes back alongside `predictions`, and converting
# it to a NumPy array gives scikit-learn a plain array instead of whatever the
# torch-formatted "labels" column returns.
true_labels = np.asarray(test_results.label_ids)

# Class names in label-id order, read from the dataset's "label" feature
target_label_names = dataset['train'].features['label'].names

# Every expected numeric class id, so that a class missing from the small test
# subset still gets a row in the report and in the confusion matrix
all_numeric_labels = list(range(len(target_label_names)))

# Classification report over all classes
report = classification_report(
    true_labels,
    predictions,
    target_names=target_label_names,
    labels=all_numeric_labels,
    zero_division=0
)
print("--- Classification Report on Test Set ({} examples) ---".format(len(true_labels)))
print(report)

# Confusion matrix over the same full list of class ids, shown as a heatmap
conf_mat = confusion_matrix(true_labels, predictions, labels=all_numeric_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_label_names,
            yticklabels=target_label_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()