In [2]:
!pip install nltk
!pip install vaderSentiment
!pip install stable-baselines3 optuna
!pip install accelerate>=0.21.0
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers torch -U




Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting stable-baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0 (from optun

In [None]:
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [5]:
import os
import pandas as pd
import random
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import BertTokenizer, BertForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
import torch.nn as nn
import torch

# Load and preprocess the data
def load_conversations(file_path):
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        lines = file.readlines()
    return lines

def extract_pairs(lines):
    conversations = []
    for line in lines:
        if "+++$+++" in line:
            parts = line.strip().split(" +++$+++ ")
            conversation = parts[-1].strip()[1:-1].replace("'", "").replace(" ", "").split(",")
            conversations.append(conversation)
    return conversations

def get_id_to_line_dict(file_path):
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        lines = file.readlines()
    id_to_line = {}
    for line in lines:
        if "+++$+++" in line:
            parts = line.strip().split(" +++$+++ ")
            id_to_line[parts[0]] = parts[-1]
    return id_to_line

def extract_question_answer_pairs(conversations, id_to_line):
    questions = []
    answers = []
    for conversation in conversations:
        for i in range(len(conversation) - 1):
            questions.append(id_to_line[conversation[i]].strip())
            answers.append(id_to_line[conversation[i + 1]].strip())
    return questions, answers

def load_movie_metadata(metadata_file):
    movie_data = {}
    with open(metadata_file, 'r', encoding='iso-8859-1') as file:
        for line in file:
            parts = line.strip().split(" +++$+++ ")
            movie_data[parts[0]] = {'title': parts[1], 'description': parts[2]}
    return movie_data

def load_character_metadata(character_file):
    character_data = {}
    with open(character_file, 'r', encoding='iso-8859-1') as file:
        for line in file:
            parts = line.strip().split(" +++$+++ ")
            character_data[parts[0]] = {'name': parts[1], 'movie_id': parts[2], 'gender': parts[3]}
    return character_data

# Paths to the dataset files
conversations_file = 'movie_conversations.txt'
lines_file = 'movie_lines.txt'
metadata_file = 'movie_titles_metadata.txt'
character_file = 'movie_characters_metadata.txt'

# Load and process the dataset
conversations = extract_pairs(load_conversations(conversations_file))
id_to_line = get_id_to_line_dict(lines_file)
questions, answers = extract_question_answer_pairs(conversations, id_to_line)
qa_df = pd.DataFrame({'Question': questions, 'Answer': answers})
movie_metadata = load_movie_metadata(metadata_file)
character_metadata = load_character_metadata(character_file)

class SentimentChatbot:
    def __init__(self, qa_df, movie_metadata, character_metadata):
        self.analyzer = SentimentIntensityAnalyzer()
        self.qa_df = qa_df
        self.movie_metadata = movie_metadata
        self.character_metadata = character_metadata
        self.tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
        self.tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
        self.model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
        self.greetings = ["hello", "hi", "hey", "greetings", "what's up"]
        self.farewells = ["bye", "goodbye", "see you", "take care"]
        self.fallback_responses = [
            "I'm not sure how to respond to that.",
            "Can you please rephrase your question?",
            "I'm sorry, I don't understand.",
            "Let's talk about something else!"
        ]
        self.small_talk_responses = {
            "how are you": ["I'm just a bunch of code, but I'm functioning perfectly!", "Doing great, thanks for asking!"],
            "what is your name": ["I'm your friendly chatbot Suko!", "I don't have a name, but you can call me Suko."],
            "what do you do": ["I chat with people and try to help them out!", "I'm here to assist you with your queries."]
        }
        self.fun_facts = [
            "Did you know that honey never spoils?",
            "A day on Venus is longer than a year on Venus.",
            "Bananas are berries, but strawberries aren't."
        ]
        self.user_state = None

    def analyze_sentiment(self, message):
        inputs = self.tokenizer_bert(message, return_tensors='pt', truncation=True, padding=True)
        outputs = self.model_bert(**inputs)
        scores = outputs.logits.softmax(dim=-1).detach().numpy()[0]
        sentiment_scores = {'neg': scores[0], 'neu': scores[1], 'pos': scores[2], 'compound': scores.max()}
        return sentiment_scores

    def get_sentiment_label(self, sentiment_scores):
        if sentiment_scores['compound'] >= 0.05:
            return "Positive"
        elif sentiment_scores['compound'] <= -0.05:
            return "Negative"
        else:
            return "Neutral"

    def generate_response(self, message):
        inputs = self.tokenizer_gpt2.encode(message, return_tensors='pt')
        outputs = self.model_gpt2.generate(inputs, max_length=50, num_return_sequences=1)
        return self.tokenizer_gpt2.decode(outputs[0], skip_special_tokens=True)

    def find_answer(self, message):
        potential_answers = self.qa_df[self.qa_df['Question'].str.contains(message, case=False, na=False)]
        if not potential_answers.empty:
            return random.choice(potential_answers['Answer'].tolist())
        else:
            partial_matches = self.qa_df[self.qa_df['Question'].apply(lambda x: any(word in x.lower() for word in message.lower().split()))]
            if not partial_matches.empty:
                return random.choice(partial_matches['Answer'].tolist())
        return random.choice(self.fallback_responses)

    def find_movie_info(self, message):
        for movie_id, data in self.movie_metadata.items():
            if data['title'].lower() in message.lower():
                return data['title'], data['description']
        return None, None

    def find_character_info(self, message):
        for char_id, data in self.character_metadata.items():
            if data['name'].lower() in message.lower():
                movie_title = self.movie_metadata[data['movie_id']]['title']
                return data['name'], movie_title, data['gender']
        return None, None, None

    def respond(self, message):
        sentiment_scores = self.analyze_sentiment(message)
        sentiment_label = self.get_sentiment_label(sentiment_scores)
        response = f"Sentiment Analysis: {sentiment_label}\nScores: {sentiment_scores}\n"

        if any(greeting in message.lower() for greeting in self.greetings):
            self.user_state = None
            response += "Hello! How can I assist you today?\n\n" \
                        "What can I assist you with today? Please choose an option:\n" \
                        "1. Ask a question\n" \
                        "2. Fun fact\n" \
                        "3. Small talk\n" \
                        "4. Movie information\n" \
                        "5. Character information\n" \
                        "6. Exit"
        elif any(farewell in message.lower() for farewell in self.farewells):
            response += "Goodbye! Have a great day!"
        elif self.user_state is None:
            if message.strip() == "1":
                self.user_state = "ask_question"
                response += "Please ask your question."
            elif message.strip() == "2":
                response += random.choice(self.fun_facts)
            elif message.strip() == "3":
                response += random.choice(list(self.small_talk_responses.values()))
            elif message.strip() == "4":
                self.user_state = "movie_info"
                response += "Please enter the movie name or a question about the movie."
            elif message.strip() == "5":
                self.user_state = "character_info"
                response += "Please enter the character name or a question about the character."
            elif message.strip() == "6":
                response += "Goodbye! Have a great day!"
            else:
                response += random.choice(self.fallback_responses)
        elif self.user_state == "ask_question":
            answer = self.find_answer(message)
            response += f"Response: {answer}"
            self.user_state = None
        elif self.user_state == "movie_info":
            movie_title, movie_description = self.find_movie_info(message)
            if movie_title and movie_description:
                response += f"Movie: {movie_title}\nDescription: {movie_description}"
            else:
                response += "I'm sorry, I couldn't find information on that movie."
            self.user_state = None
        elif self.user_state == "character_info":
            character_name, movie_title, gender = self.find_character_info(message)
            if character_name and movie_title:
                response += f"Character: {character_name}\nMovie: {movie_title}\nGender: {gender}"
            else:
                response += "I'm sorry, I couldn't find information on that character."
            self.user_state = None
        else:
            response += random.choice(self.fallback_responses)

        response += "\n\nWhat can I assist you with today? Please choose an option:\n" \
                    "1. Ask a question\n" \
                    "2. Fun fact\n" \
                    "3. Small talk\n" \
                    "4. Movie information\n" \
                    "5. Character information\n" \
                    "6. Exit"

        return response

chatbot = SentimentChatbot(qa_df, movie_metadata, character_metadata)

print("==================================================")
print("Welcome to Suko, your friendly chatbot!")
print("Type 'exit' or 'quit' to end the chat.")
print("==================================================")

while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Suko: Goodbye! Have a great day!")
        break
    response = chatbot.respond(user_input)
    print(f"Suko: {response}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Welcome to Suko, your friendly chatbot!
Type 'exit' or 'quit' to end the chat.
You: hello
Suko: Sentiment Analysis: Positive
Scores: {'neg': 0.222152, 'neu': 0.4893153, 'pos': 0.28853264, 'compound': 0.4893153}
Hello! How can I assist you today?

What can I assist you with today? Please choose an option:
1. Ask a question
2. Fun fact
3. Small talk
4. Movie information
5. Character information
6. Exit

What can I assist you with today? Please choose an option:
1. Ask a question
2. Fun fact
3. Small talk
4. Movie information
5. Character information
6. Exit
You: 4
Suko: Sentiment Analysis: Positive
Scores: {'neg': 0.20273842, 'neu': 0.48758754, 'pos': 0.309674, 'compound': 0.48758754}
Please enter the movie name or a question about the movie.

What can I assist you with today? Please choose an option:
1. Ask a question
2. Fun fact
3. Small talk
4. Movie information
5. Character information
6. Exit
You: The Matrix
Suko: Sentiment Analysis: Positive
Scores: {'neg': 0.23579207, 'neu': 0.480