In [1]:

!pip install transformers torch datasets

import os
# Intended to switch off the Weights & Biases logging integration before any
# training starts. NOTE(review): the captured output of this cell reports that
# the WANDB_DISABLED variable is deprecated (removal announced for v5) and
# points to the `report_to` option as the replacement.
os.environ["WANDB_DISABLED"] = "true"



import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch.nn.functional as F


# Fetch the IMDb dataset and keep handles on the two splits used below.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Tokenizer that belongs to the same bert-base-uncased checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def preprocess_data(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry, as handed over by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to ``max_length=512``.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)


# Run the tokenizer over both splits in batches; map() hands back new datasets.
train_data, test_data = (
    train_data.map(preprocess_data, batched=True),
    test_data.map(preprocess_data, batched=True),
)

# Expose only the columns the model consumes, as PyTorch tensors.
for split in (train_data, test_data):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


# bert-base-uncased weights with a 2-label classification head. The cell output
# confirms the head (classifier.weight / classifier.bias) is newly initialized,
# so it only becomes useful after the fine-tuning run below.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


# Hyper-parameters for a single fine-tuning epoch, evaluated once per epoch.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before renaming.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    # Disable external logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable (see the warning this cell
    # printed); this mirrors the configuration used in the second cell.
    report_to="none",
)


# The test split doubles as the evaluation set for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)


trainer.train()


# Write the fine-tuned model and the tokenizer files into one directory.
model_path = "./imdb_sentiment_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Read both back from that directory and switch the model to evaluation mode
# (Module.eval() returns the module itself, so the call can be chained).
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).eval()


def preprocess_text(text):
    """
    Tokenize the input text into the format the BERT model expects.

    Args:
        text (str): Text entered by the user.

    Returns:
        dict-like: Tokenizer output holding PyTorch tensors, truncated and
        padded to ``max_length=512``, ready to be unpacked into the model call.
    """
    encoding_options = dict(
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return tokenizer(text, **encoding_options)

def chatbot_response(user_input):
    """
    Produce the bot's reply based on the predicted sentiment of the input text.

    Args:
        user_input (str): Text typed by the user.

    Returns:
        str: One of two fixed replies: the cheerful one when the classifier
        predicts label 1, the supportive one for any other label.

    Note:
        The classifier is fine-tuned on the IMDb dataset with an uncased
        BERT checkpoint; input in other languages is presumably out of
        distribution (the captured chat transcript shows a Chinese complaint
        being answered with the cheerful reply).
    """
    inputs = preprocess_text(user_input)

    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probs = F.softmax(logits, dim=-1)

    # Index of the highest-probability class. .item() needs a one-element
    # tensor, which assumes a single text was passed in.
    sentiment = torch.argmax(probs, dim=-1).item()

    if sentiment == 1:
        response = "很高兴你有这样的感受！😊 我还能为你做些什么呢？"
    else:
        response = "我在这里为你提供帮助，希望能让情况好一些。有什么困扰吗？"

    return response


# Interactive loop: keep answering until the user types "exit" or "quit"
# (compared case-insensitively), then say goodbye.
print("欢迎来到聊天机器人！输入 'exit' 退出。")
while (user_input := input("你：")).lower() not in ["exit", "quit"]:
    response = chatbot_response(user_input)
    print("机器人：", response)
print("机器人：再见！")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.2285,0.219218


欢迎来到聊天机器人！输入 'exit' 退出。
你：it's a good film
机器人： 很高兴你有这样的感受！😊 我还能为你做些什么呢？
你：that too bad
机器人： 我在这里为你提供帮助，希望能让情况好一些。有什么困扰吗？
你：太垃圾了
机器人： 很高兴你有这样的感受！😊 我还能为你做些什么呢？
你：exit
机器人：再见！


In [2]:
# Step 1: Install necessary libraries
!pip install transformers torch datasets

# Step 2: Import libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch.nn.functional as F
import random

# Step 3: Load the IMDb dataset and pick out the train / test splits
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Step 4: Initialize the tokenizer for the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define data preprocessing function
def preprocess_data(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated and padded
        to ``max_length=512``.
    """
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Apply the tokenizer to the training and testing datasets (batched)
train_data, test_data = (
    train_data.map(preprocess_data, batched=True),
    test_data.map(preprocess_data, batched=True),
)

# Set format for PyTorch: expose only the model inputs and the label as tensors
for split in (train_data, test_data):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Step 5: Initialize the BERT model for classification
# (bert-base-uncased weights plus a 2-label classification head)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 6: Set training parameters (one epoch, evaluated at the end of the epoch,
# with external logging integrations turned off)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    report_to="none",
)

# Step 7: Initialize Trainer; the test split is used as the evaluation set
trainer = Trainer(model=model, args=training_args, train_dataset=train_data, eval_dataset=test_data)

# Step 8: Train the model
trainer.train()

# Step 9: Save the trained model and its tokenizer into one directory
model_path = "./imdb_sentiment_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Step 10: Load the saved tokenizer and model, with the model in evaluation mode
# (Module.eval() returns the module itself, so the call can be chained)
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).eval()

# Step 11: Define preprocessing and response generation functions with varied responses
def preprocess_text(text):
    """Tokenize one piece of user text for the model.

    Args:
        text (str): Raw text typed by the user.

    Returns:
        Tokenizer output holding PyTorch tensors, truncated and padded to
        ``max_length=512``, suitable for unpacking into the model call.
    """
    return tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Define varied responses for each sentiment.
# chatbot_response() draws one entry at random from the list that matches the
# predicted label, so consecutive replies do not always repeat.

# Replies used when the classifier predicts label 1.
positive_responses = [
    "I'm glad you feel that way! 😊 How can I assist you further?",
    "That's wonderful to hear! Let me know if there's anything else I can do.",
    "It sounds like you're in a good mood! How else can I help?",
    "I'm happy to hear that! What would you like to discuss next?",
]

# Replies used for any other predicted label.
negative_responses = [
    "I'm here to help. Let's see if we can make things better. What's on your mind?",
    "It sounds like you're having a rough time. How can I assist you?",
    "I'm sorry to hear that. Let me know if there's something I can help with.",
    "I understand things might be tough. I'm here to listen and assist however I can.",
]

def chatbot_response(user_input):
    """Pick a reply that matches the predicted sentiment of the user's text.

    Args:
        user_input (str): Text typed by the user.

    Returns:
        str: A random entry from ``positive_responses`` when the classifier
        predicts label 1, otherwise a random entry from ``negative_responses``.
    """
    encoded = preprocess_text(user_input)

    # Inference only, so the forward pass runs without gradient tracking.
    with torch.no_grad():
        class_probs = F.softmax(model(**encoded).logits, dim=-1)

    # Index of the most probable class as a plain Python int.
    predicted_label = torch.argmax(class_probs, dim=-1).item()

    # Select a random response based on sentiment
    reply_pool = positive_responses if predicted_label == 1 else negative_responses
    return random.choice(reply_pool)

# Step 12: Interactive chat loop
# Keep replying until the user types "exit" or "quit" (case-insensitive).
print("Welcome to the chatbot! Type 'exit' to quit.")
while (user_input := input("You: ")).lower() not in ["exit", "quit"]:
    response = chatbot_response(user_input)
    print("Bot:", response)
print("Bot: Goodbye!")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2197,0.223147


Welcome to the chatbot! Type 'exit' to quit.
You: i feel lonely
Bot: I'm here to help. Let's see if we can make things better. What's on your mind?
You: i think i am good
Bot: I'm glad you feel that way! 😊 How can I assist you further?
You: that‘s a good moive
Bot: It sounds like you're in a good mood! How else can I help?
You: that's a bad film
Bot: It sounds like you're having a rough time. How can I assist you?
You: exit
Bot: Goodbye!
