In [1]:
import pandas as pd

class DatasetLoader:
    """Loads a CSV file from a fixed path into a pandas DataFrame."""

    def __init__(self, file_path):
        # Location of the CSV file; it is read on every load_dataset() call.
        self.file_path = file_path

    def load_dataset(self):
        """
        Read the CSV at ``self.file_path``.

        Returns:
            The parsed DataFrame, or None when the file does not exist
            (a message is printed in that case instead of raising).
        """
        try:
            return pd.read_csv(self.file_path)
        except FileNotFoundError:
            print("El archivo no fue encontrado.")
        return None

    def preview_dataset(self, num_rows=5):
        """
        Print the first ``num_rows`` rows of the dataset.

        The file is re-read through load_dataset() on each call; if it cannot
        be loaded, a fallback message is printed instead.
        """
        frame = self.load_dataset()
        if frame is None:
            print("No hay datos para mostrar.")
            return
        print(frame.head(num_rows))

In [2]:
# Create a DatasetLoader pointing at the Mental Health FAQ CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# on another machine if the path is edited (consider a configurable data directory).
loader = DatasetLoader('/Users/adrianinfantes/Desktop/AIR/COLLEGE AND STUDIES/Data_Scientist_formation/Platzi/LangChainCourse/data/Mental_Health_FAQ.csv')

# Preview the dataset: reads the CSV and prints its first rows (num_rows defaults to 5),
# or prints a fallback message if the file cannot be found.
loader.preview_dataset()

   Question_ID                                          Questions  \
0      1590140        What does it mean to have a mental illness?   
1      2110618                    Who does mental illness affect?   
2      6361820                        What causes mental illness?   
4      7657263            Can people with mental illness recover?   

                                             Answers  
0  Mental illnesses are health conditions that di...  
1  It is estimated that mental illness affects 1 ...  
2  It is estimated that mental illness affects 1 ...  
3  Symptoms of mental health disorders vary depen...  
4  When healing from mental illness, early identi...  


In [3]:
class DataAnalyzer:
    """Prints quick diagnostics (structure and null counts) for a pandas DataFrame."""

    def __init__(self, dataset):
        # DataFrame to inspect; it is stored by reference and only read by this class.
        self.dataset = dataset

    def get_dataset_info(self):
        """
        Print general information about the dataset (index, columns, dtypes).
        """
        print("Información del Dataset:")
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so wrapping it in print() would append a stray "None" line.
        self.dataset.info()

    def check_missing_values(self):
        """
        Print the number of null values found in each column of the dataset.
        """
        missing_values = self.dataset.isnull().sum()
        print("Valores Nulos en el Dataset:")
        print(missing_values)

In [4]:
# Using the DatasetLoader and DataAnalyzer classes.
# load_dataset() reads the CSV again and returns None if the file is not found,
# in which case the analyzer calls below would fail on the None value.
dataset = loader.load_dataset()

# Print the DataFrame summary and the per-column null counts.
analyzer = DataAnalyzer(dataset)
analyzer.get_dataset_info()
analyzer.check_missing_values()

Información del Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Question_ID  98 non-null     int64 
 1   Questions    98 non-null     object
 2   Answers      98 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4+ KB
None
Valores Nulos en el Dataset:
Question_ID    0
Questions      0
Answers        0
dtype: int64


In [5]:
import re
import pandas as pd

class DataPreprocessor:
    """
    Cleans text columns of a dataset.

    When the input is already a DataFrame it is used as-is (not copied), so
    fill_missing_values() and clean_text() modify the caller's DataFrame in place.
    """

    def __init__(self, dataset):
        self.dataset = self.convert_to_dataframe(dataset)

    def convert_to_dataframe(self, dataset):
        """
        Convert the dataset to a pandas DataFrame if it is not one already.

        Returns:
            The same object when it is already a DataFrame, a new DataFrame built
            from it otherwise, or None if the conversion fails (the error is printed).
        """
        if not isinstance(dataset, pd.DataFrame):
            try:
                dataset = pd.DataFrame(dataset)
            except Exception as e:
                print(f"Error al convertir el dataset a DataFrame: {e}")
                return None
        return dataset

    def fill_missing_values(self, column, fill_value=""):
        """
        Replace null values in ``column`` with ``fill_value`` (empty string by default).
        """
        self.dataset[column] = self.dataset[column].fillna(fill_value)

    def clean_text(self, column):
        """
        Apply the text cleaning helper to every value in ``column``.
        """
        self.dataset[column] = self.dataset[column].apply(self._clean_text_helper)

    @staticmethod
    def _clean_text_helper(text):
        """
        Normalize one text value.

        Every run of characters other than ASCII letters (digits, punctuation,
        whitespace, accented letters) is collapsed to a single space, then the
        result is lower-cased and stripped.

        Missing values (None / NaN / pd.NA) yield an empty string, and other
        non-string values are converted with str() first, so the helper no longer
        raises TypeError when a column has not been filled beforehand.
        """
        if not isinstance(text, str):
            # Missing scalars have no text content; match fill_missing_values' default.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)
        # Remove special characters and numbers
        text = re.sub('[^A-Za-z]+', ' ', text)
        # Lower-case and trim the spaces left at either end by the substitution
        return text.lower().strip()

In [6]:
# Usage of the DataPreprocessor class.
# `dataset` is already a DataFrame, so the preprocessor keeps a reference to it
# (no copy) and the calls below modify `dataset` itself.
preprocessor = DataPreprocessor(dataset)
# Fill nulls with the default empty string before cleaning the text.
preprocessor.fill_missing_values("Questions")
preprocessor.fill_missing_values("Answers")
# Strip non-letter characters, lower-case and trim both text columns.
preprocessor.clean_text("Questions")
preprocessor.clean_text("Answers")

# Check the result
dataset.head()

Unnamed: 0,Question_ID,Questions,Answers
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...
1,2110618,who does mental illness affect,it is estimated that mental illness affects in...
2,6361820,what causes mental illness,it is estimated that mental illness affects in...
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...
4,7657263,can people with mental illness recover,when healing from mental illness early identif...


In [7]:
from textblob import TextBlob

class SentimentAnalyzer:
    """Adds TextBlob polarity scores for the question and answer columns of a dataset."""

    def __init__(self, dataset):
        # Kept by reference: the new columns are written onto this same object.
        self.dataset = dataset

    def add_polarity_columns(self, question_col, answer_col):
        """
        Score every value of ``question_col`` and ``answer_col`` and store the
        results in the 'question_polarity' and 'answer_polarity' columns.
        """
        score = self._calculate_polarity
        question_scores = self.dataset[question_col].apply(score)
        self.dataset['question_polarity'] = question_scores
        answer_scores = self.dataset[answer_col].apply(score)
        self.dataset['answer_polarity'] = answer_scores

    @staticmethod
    def _calculate_polarity(text):
        """Return the polarity TextBlob reports for ``text``."""
        blob = TextBlob(text)
        return blob.sentiment.polarity

In [8]:
# Sentiment analysis: adds 'question_polarity' and 'answer_polarity' columns
# directly onto `dataset` (the analyzer holds a reference, not a copy).
sentiment_analyzer = SentimentAnalyzer(dataset)
sentiment_analyzer.add_polarity_columns("Questions", "Answers")

# Check the result
dataset.head()

Unnamed: 0,Question_ID,Questions,Answers,question_polarity,answer_polarity
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...,-0.20625,0.029762
1,2110618,who does mental illness affect,it is estimated that mental illness affects in...,-0.1,-0.021921
2,6361820,what causes mental illness,it is estimated that mental illness affects in...,-0.1,-0.021921
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...,-0.1,-0.066892
4,7657263,can people with mental illness recover,when healing from mental illness early identif...,-0.1,0.164336


In [9]:
class TextLengthAdder:
    """Adds length columns for the question and answer texts of a dataset."""

    def __init__(self, dataset):
        # Kept by reference: the new columns are written onto this same object.
        self.dataset = dataset

    def add_text_length_columns(self, question_col, answer_col):
        """
        Store len() of every value of ``question_col`` and ``answer_col`` in the
        'question_length' and 'answer_length' columns respectively.
        """
        column_pairs = (
            ('question_length', question_col),
            ('answer_length', answer_col),
        )
        for target_col, source_col in column_pairs:
            self.dataset[target_col] = self.dataset[source_col].apply(len)

In [10]:
# Usage of the class: adds 'question_length' and 'answer_length' columns to `dataset`.
# The lengths are computed on the current (already cleaned) text of each column.
length_adder = TextLengthAdder(dataset)
length_adder.add_text_length_columns('Questions', 'Answers')

# Show the first rows of the dataset to confirm the columns were added
dataset.head()

Unnamed: 0,Question_ID,Questions,Answers,question_polarity,answer_polarity,question_length,answer_length
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...,-0.20625,0.029762,42,1324
1,2110618,who does mental illness affect,it is estimated that mental illness affects in...,-0.1,-0.021921,30,1208
2,6361820,what causes mental illness,it is estimated that mental illness affects in...,-0.1,-0.021921,26,1208
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...,-0.1,-0.066892,52,1339
4,7657263,can people with mental illness recover,when healing from mental illness early identif...,-0.1,0.164336,38,576


In [11]:
from transformers import BertTokenizer, BertModel
import torch

class EmbeddingGenerator:
    """Adds BERT ('bert-base-uncased') embedding columns for questions and answers."""

    def __init__(self, dataset):
        # Kept by reference: the embedding columns are written onto this same object.
        self.dataset = dataset
        checkpoint = 'bert-base-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        self.model = BertModel.from_pretrained(checkpoint, return_dict=True)

    def add_embedding_columns(self, question_col, answer_col):
        """
        Embed every value of ``question_col`` and ``answer_col`` and store the
        results in the 'question_embedding' and 'answer_embedding' columns.
        """
        column_pairs = (
            ('question_embedding', question_col),
            ('answer_embedding', answer_col),
        )
        for target_col, source_col in column_pairs:
            self.dataset[target_col] = self.dataset[source_col].apply(self._generate_embeddings)

    def _generate_embeddings(self, text):
        """
        Embed one text and return the result as a numpy array.

        The input is truncated to at most 512 tokens; the model's last hidden
        state is averaged over dim=1 (presumably the token axis - the batch
        axis is kept, so the result is 2-D).
        """
        encoded = self.tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        return pooled.numpy()

In [12]:
# Generate embeddings: constructing the generator loads the 'bert-base-uncased'
# tokenizer and model, then one embedding is computed per question and per answer
# and stored on `dataset` as 'question_embedding' / 'answer_embedding'.
embedding_generator = EmbeddingGenerator(dataset)
embedding_generator.add_embedding_columns("Questions", "Answers")

# Check the result
dataset.head()

Unnamed: 0,Question_ID,Questions,Answers,question_polarity,answer_polarity,question_length,answer_length,question_embedding,answer_embedding
0,1590140,what does it mean to have a mental illness,mental illnesses are health conditions that di...,-0.20625,0.029762,42,1324,"[[0.4737447, 0.30749866, -0.34513614, -0.06363...","[[-0.28696477, 0.47339454, 0.06885683, -0.1039..."
1,2110618,who does mental illness affect,it is estimated that mental illness affects in...,-0.1,-0.021921,30,1208,"[[-0.15863334, 0.4255557, -0.2734234, -0.03929...","[[-0.3362111, 0.53853524, 0.19835782, -0.12483..."
2,6361820,what causes mental illness,it is estimated that mental illness affects in...,-0.1,-0.021921,26,1208,"[[0.2304068, 0.32298884, -0.41936374, -0.14449...","[[-0.3362111, 0.53853524, 0.19835782, -0.12483..."
3,9434130,what are some of the warning signs of mental i...,symptoms of mental health disorders vary depen...,-0.1,-0.066892,52,1339,"[[-0.08530686, 0.41394427, -0.14627229, -0.305...","[[-0.23423892, 0.4693675, 0.14461, -0.12486041..."
4,7657263,can people with mental illness recover,when healing from mental illness early identif...,-0.1,0.164336,38,576,"[[-0.23590559, 0.17182596, -0.15402792, -0.234...","[[-0.32375985, 0.45347068, 0.19493593, -0.1679..."


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import ast

class DatasetPreparation:
    """
    Prepares a dataset with embedding columns for modelling: converts the
    embeddings to numpy arrays, standardizes them and splits the rows into
    train / validation / test sets. Columns are rewritten on the given dataset.
    """

    def __init__(self, dataset):
        # Kept by reference: conversions and normalization overwrite its columns.
        self.dataset = dataset

    def convert_embeddings_to_numeric(self, embedding_columns):
        """
        Convert every value of the given embedding columns to a numpy array.
        """
        for column in embedding_columns:
            self.dataset[column] = self.dataset[column].apply(self._convert_embedding)

    @staticmethod
    def _convert_embedding(embedding_str):
        """
        Convert one embedding value to a numpy array.

        Accepts a numpy array (returned unchanged), a list/tuple, or a string
        holding a Python literal such as "[0.1, 0.2]". Values that cannot be
        parsed yield an empty array.

        Arrays must be passed through explicitly: ast.literal_eval raises
        ValueError on non-string input, which previously turned every in-memory
        embedding into an empty array.
        """
        if isinstance(embedding_str, np.ndarray):
            return embedding_str
        if isinstance(embedding_str, (list, tuple)):
            return np.array(embedding_str)
        try:
            # Parse the string into a list and then into a numpy array
            return np.array(ast.literal_eval(embedding_str))
        except (ValueError, SyntaxError, TypeError):
            # SyntaxError covers malformed literals (e.g. numpy's space-separated repr)
            return np.array([])

    def normalize_embeddings(self, embedding_columns):
        """
        Standardize the given embedding columns with StandardScaler.

        Each embedding is flattened to 1-D first (so arrays shaped (1, dim) can
        be stacked into the 2-D matrix the scaler requires). Rows holding an
        empty embedding are left untouched and excluded from the fit; a column
        with no valid embeddings is skipped.
        """
        for column in embedding_columns:
            flat_embeddings = [
                np.asarray(emb, dtype=float).ravel() for emb in self.dataset[column]
            ]
            valid_embeddings = [emb for emb in flat_embeddings if emb.size > 0]
            if not valid_embeddings:
                continue
            scaled_rows = iter(StandardScaler().fit_transform(np.stack(valid_embeddings)))
            # Fill an object array element by element so each cell holds one 1-D
            # array and the column keeps one entry per original row.
            normalized = np.empty(len(flat_embeddings), dtype=object)
            for position, emb in enumerate(flat_embeddings):
                normalized[position] = next(scaled_rows) if emb.size > 0 else emb
            self.dataset[column] = pd.Series(normalized, index=self.dataset.index)

    def split_dataset(self, test_size=0.3, val_size=0.5):
        """
        Split the dataset into training, validation and test sets.

        ``test_size`` is the fraction of rows held out from training. The
        hold-out is then split again with ``val_size`` passed as the second
        split's ``test_size``: despite its name, it is the fraction of the
        hold-out that goes to the test set, the remainder being validation.
        Both splits use random_state=42.

        Returns:
            (train, val, test)
        """
        train, test = train_test_split(self.dataset, test_size=test_size, random_state=42)
        val, test = train_test_split(test, test_size=val_size, random_state=42)
        return train, val, test

In [14]:
# Assuming 'dataset' is your loaded DataFrame

# 1. Initialize the class with your dataset
preparation = DatasetPreparation(dataset)

# 2. Convert the embeddings from string to numeric format
# NOTE(review): when this notebook runs top to bottom, these columns already hold
# the numpy arrays produced by EmbeddingGenerator (not strings) - confirm that the
# conversion step handles array input as intended.
embedding_columns = ['question_embedding', 'answer_embedding']
preparation.convert_embeddings_to_numeric(embedding_columns)

# 3. Normalize the embeddings
preparation.normalize_embeddings(embedding_columns)

# 4. Split the dataset into training, validation and test sets
#    (defaults: test_size=0.3 hold-out, which is then split with val_size=0.5)
train, val, test = preparation.split_dataset()

# After running these steps, 'train', 'val' and 'test' are ready to use

In [15]:
from transformers import pipeline

class EmotionalChatbotModel:
    """Thin wrapper around a Hugging Face "text-generation" pipeline."""

    def __init__(self, model_name):
        # Initialize the Hugging Face model
        self.model = pipeline("text-generation", model=model_name)

    def generate_response(self, input_text, **generation_kwargs):
        """
        Generate a chatbot response for the given input text.

        Args:
            input_text: Prompt passed to the pipeline.
            **generation_kwargs: Optional keyword arguments forwarded unchanged to
                the pipeline call (for example ``max_new_tokens`` to allow longer
                answers, or ``return_full_text=False`` to drop the echoed prompt).
                With none given, the pipeline's defaults apply.

        Returns:
            The 'generated_text' field of the first result the pipeline returns.
        """
        results = self.model(input_text, **generation_kwargs)
        return results[0]['generated_text']

In [16]:
# Usage of the class
model_name = "Mohammed-Altaf/Medical-ChatBot"  # Replace this with the model of your choice
# Building the wrapper creates the text-generation pipeline for that model.
chatbot_model = EmotionalChatbotModel(model_name)

# Example of generating a response
input_question = "What is depression?"
response = chatbot_model.generate_response(input_question)
print("Chatbot response:", response)

Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Chatbot response: What is depression?

[|Other person I know seems miserable and lacks interest in activities. Do you think I need to see a doctor for depression?
[|Yes, I should see a


In [20]:
import gradio as gr
import random

class GradioChatbotInterface:
    """Exposes a chatbot model through a simple Gradio text interface."""

    def __init__(self, chatbot_model):
        # Any object providing generate_response(message) works here.
        self.chatbot_model = chatbot_model

    def chat(self, message):
        """Return the chatbot model's response to ``message``."""
        return self.chatbot_model.generate_response(message)

    def launch(self):
        """Build the Gradio interface around chat() and start it."""
        message_box = gr.Textbox(lines=2, placeholder="Type a message...")
        gr.Interface(
            fn=self.chat,
            inputs=message_box,
            outputs="text",
            title="Chatbot",
            description="This is a chatbot interface.",
        ).launch()

In [21]:
# Usage of the class: wrap the chatbot model created earlier and start the Gradio UI.
chatbot_interface = GradioChatbotInterface(chatbot_model)
chatbot_interface.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.
