In [2]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

# Create a sample dataset for testing

In [3]:
loader = TextLoader(file_path='./data/conversations/conversation_0.txt')

In [4]:
data = loader.load()

In [5]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your sample document')
print (f'Here is a sample: {data[0].page_content[:200]}')

You have 1 document(s) in your data
There are 4710 characters in your sample document
Here is a sample: Therapist: Thanks for filling it out. We give this form to everyone once a year regardless of why they come in. It helps us provide better care. Is it okay if I take a look at what you put down?
Clien


# Create chunks from the therapy transcripts

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size= 400, chunk_overlap= 150)
texts = text_splitter.split_documents(data)

In [9]:
print(f'You have {len(texts)} documents in your data')

You have 17 text(s) in your data


In [13]:
texts[0]

Document(page_content="Therapist: Thanks for filling it out. We give this form to everyone once a year regardless of why they come in. It helps us provide better care. Is it okay if I take a look at what you put down?\nClient: Sure.\nTherapist: So, let's see. It looks that you put-- You drink alcohol at least four times a week on average-\nClient: Mm-hmm.", metadata={'source': './data/conversations/conversation_0.txt'})

# Generate summaries from chunks using the fine-tuned flan-T5-base model

In [14]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM


In [15]:
model_id = 'jruranski/flan-t5-base-samsum'

In [16]:
device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [18]:
pipe = pipeline(
    "text2text-generation",
    model=model, 
    tokenizer=tokenizer, 
    max_length=512
)

In [19]:
local_llm = HuggingFacePipeline(pipeline=pipe)

In [22]:
sample_summary = local_llm(texts[0].page_content)
print("Sample summary: ", sample_summary) 

Sample summary:  Client filled out the form. She drinks alcohol at least four times a week on average.


In [24]:
summaries = [local_llm("summarize: " + text.page_content) for text in texts]

In [30]:
print(f'You have {len(summaries)} summaries in your data')

You have 17 summaries in your data


In [32]:
summaries_avg_len = sum([len(summary) for summary in summaries]) / len(summaries)
print(f'Average summary length: {summaries_avg_len}')

Average summary length: 83.47058823529412


# Load the whole dataset and generate summaries using the fine-tuned flan-T5-base model

In [33]:

from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [35]:
loader = DirectoryLoader('./data/conversations', glob="**/*.txt", show_progress=True)
docs = loader.load()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jruranski/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
100%|██████████| 141/141 [00:08<00:00, 15.94it/s]


In [36]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size= 400, chunk_overlap= 150)
texts = text_splitter.split_documents(docs)

print(f'You have {len(texts)} documents in your data')

You have 3468 documents in your data


## Summarize chunks using the fine-tuned flan-T5-base model

In [37]:
summaries = [local_llm("summarize: " + text.page_content) for text in texts]

In [42]:
# save the summaries to separate files along with the original text
for i, text in enumerate(texts):
    with open(f'./data/conversation_summaries/summary_{i}.txt', 'w') as f:
        f.write(summaries[i])
    with open(f'./data/conversation_summaries/original_{i}.txt', 'w') as f:
        f.write(text.page_content)


# Add summaries to each text chunk

In [58]:
# Add the summaries to the original documents (texts) 
summarized_texts = []
for i, text in enumerate(texts):
    new_text = text.copy()
    new_text.page_content += "\nSummary: " + summaries[i]
    summarized_texts.append(text)

In [59]:
summarized_texts[0].page_content

"Therapist: You did your values clarification handout, and that was part of what I wanted to go over with you today. I wanted to hear about your values and just talk to you a little bit more about that. Do-do you wanna tell me what some of your top five values are?\n\nClient: Yes, um, my top value is family happiness, um, that's-\n\nTherapist: That's number one?Summary: Therapist asked client to clarify his values. Client's top five values are family happiness and family happiness.\nSummary: Therapist asked client to clarify his values. Client's top five values are family happiness and family happiness."

In [52]:
# save the final summized documents to a new directory
for i, text in enumerate(summarized_texts):
    with open(f'./data/final_data/summarized_{i}.txt', 'w') as f:
        f.write(text.page_content)

In [53]:
final_texts = summarized_texts[:500]

# Store the summaries and dialogs in a vector store (Chroma)

In [23]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

In [43]:
embeddings = HuggingFaceEmbeddings()

### Add the summarized chunks into the vector store

In [54]:
if 'vectordb' in globals():
    vectordb.close()

vectordb = Chroma.from_documents(documents=final_texts, embedding=embeddings, persist_directory='./db/vectordb')

In [60]:
# test the retrieval from vectordb
sample_query = "How to stress less"
docs = vectordb.similarity_search(sample_query, k=5)

In [61]:
for doc in docs:
    print(doc.page_content)

Client: Most of it goes to everything else, but me.

Therapist: Okay.

Client: And, uh, like, I don't even—

Therapist: It's gonna be hard to achieve inner harmony-

Client: Yeah.

Therapist: -if you're always running around taking care of everybody.

Client: And I feel like inner harmony first comes with sleep. [laughter] I haven't been able to sleep lately.

Therapist: Right.

Client: But, um-Summary: Client is stressed out because of the stress.
Summary: Client is stressed out because of the stress.
Client: Well, he's-he thinks it's all in my head.

Therapist: Right.

Client: You know, he's- he says it's stress.

Therapist: Right. Okay.

Client: Mm.

Therapist: And what do you think?

Client: Well, I mean, it can't all be in my head. I get too much physical pain-

Therapist: Right.

Client: - for it to be in my head. I mean, it's real. It's-it's real pain.

Therapist: Mm-hmm.

Client: Um—Summary: Client is having stress.
Summary: Client is having stress.
Therapist: Okay. So, this pa

# MultiQuery
We can generate additional questions/queries from the initial user question to better capture the user intent. We will use the ChatGPT3.5 model to generate additional questions/queries.

In [38]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate
# Set logging for the queries
import logging

In [39]:
# Add logging to see what other questions were created
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)


In [63]:
question = "What can I do to feel more relaxed in my free time?"
llm = ChatOpenAI(temperature=0)

retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectordb.as_retriever(), llm=llm)


In [64]:
unique_docs = retriever_from_llm.get_relevant_documents(query=question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can I achieve a state of relaxation during my leisure hours?', '2. What activities can I engage in to experience a greater sense of calm and relaxation in my spare time?', '3. What are some effective ways to unwind and de-stress during my free time?']


In [65]:
len(unique_docs)

7

//# Contextual Compression
//Now we take the retrieved chunks and the contained information down to the relevant parts. This is expensive. 
// Not implemented yet

In [82]:
question = "I have been struggling to relax lately. My school work is stressing me out. What can I do to feel more relaxed in my free time?"

# Sample question and answer with ChatGPT based on our retrieved chunks

In [94]:
prompt_template = """Use the following style guide to answer the question at the end, the style guide doesn't represent your current patient. DO NOT COPY THE CONTEXT INTO YOUR ANSWER. USE THE CONTEXT ONLY AS A STYLE GUIDE.
Act as if you are a therapist and the person you are talking to is your patient. 
Use the provided style guide as a guidance for your tone and style of the conversation. 
Style guide: {context}

Question: {question}
Answer:"""

# prompt_template = """
# Based on the tone and style exemplified in the hypothetical conversation excerpts below, please provide a response to the question. Remember, these excerpts are fictional and created for the purpose of this exercise. They do not represent any real individual or patient. Your response should be in the manner of a therapist, using the tone and style suggested by these excerpts but without copying any specific details from them.
# Please keep your responses short and ASK a lot clarifying questions if needed.

# Hypothetical Conversation Excerpts:
# 1. Client: "I just feel overwhelmed sometimes."
#    Therapist: "It sounds like you're carrying a lot on your shoulders. Can you tell me more about what's overwhelming you?"

# 2. Client: "I'm not sure if I'm making the right choices."
#    Therapist: "Making decisions can be challenging. What options are you currently considering?"

# Question: {question}
# Answer:
# """

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=['context', 'question'],

)
# PROMPT = PromptTemplate(
#     template=prompt_template,
#     input_variables=['question'],

# )

In [95]:
chat_llm = ChatOpenAI(temperature=0.5)

In [96]:
output = chat_llm.predict(text=PROMPT.format_prompt(
    context=unique_docs[:2],
    question=question
).text)
# output = chat_llm.predict(text=PROMPT.format_prompt(

#     question=question
# ).text)

In [98]:
unique_docs[:2]

[Document(page_content="And so, um, yeah, that's probably been my biggest, uh, issue is like when work ends, going home and not really doing anything. Um, just trying to like unwind by watching Netflix or something else, right?Summary: I'm trying to unwind by watching Netflix.\nSummary: I'm trying to unwind by watching Netflix.", metadata={'source': 'data/conversations/conversation_35.txt'}),
 Document(page_content="Client: Mm.\n\nTherapist: Right.\n\nClient: Mm.\n\nTherapist: So, what is it you'd spend your average day doing now?\n\nClient: Well, I potter around the house-\n\nTherapist: Mm-hmm.\n\nClient: - um, I spend perhaps a little bit too much time watching. I've got a bit hooked on daytime television, I have to say.\n\nTherapist: Mm-hmm.\n\nClient: Um, they draw you in, don't they?\n\nTherapist: They certainly do.Summary: Client spends his day watching TV.\nSummary: Client spends his day watching TV.", metadata={'source': 'data/conversations/conversation_36.txt'})]

In [97]:
output

"It sounds like you're experiencing a lot of stress from your school work. Can you tell me more about what specifically is causing you to feel overwhelmed? Additionally, what activities or hobbies do you typically enjoy in your free time?"

# Chat interface for the user

In [99]:
from langchain.chat_models import ChatOpenAI
from langchain import LLMChain
from langchain import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

### Sample chat message 

In [101]:
chat = ChatOpenAI(temperature=0.5)
message = [HumanMessage(content="I have been struggling to relax lately. My school work is stressing me out. What can I do to feel more relaxed in my free time?")]

chat(message)

AIMessage(content="It's important to prioritize relaxation and self-care, especially when you're feeling stressed. Here are a few suggestions to help you feel more relaxed in your free time:\n\n1. Practice deep breathing or meditation: Taking a few minutes each day to focus on your breath or engage in mindfulness meditation can help calm your mind and reduce stress.\n\n2. Engage in physical activity: Exercise is a great way to release tension and boost your mood. Go for a walk, do yoga, or participate in any form of physical activity that you enjoy.\n\n3. Find a hobby or creative outlet: Engaging in activities you love can serve as a distraction from schoolwork and provide a sense of accomplishment. It could be painting, playing an instrument, writing, or any other activity that brings you joy.\n\n4. Spend time in nature: Being in nature has a calming effect on the mind and body. Take a walk in a park, go hiking, or simply sit outside and appreciate the natural surroundings.\n\n5. Disc

# Let's create a therapist chatbot

In [102]:
user_problems = ['relaxation', 'alcohol consumption', 'stress']

In [116]:
system_message_template = """
Based on the tone and style exemplified in the hypothetical conversation excerpts below, please provide a response to the question. Remember, these excerpts are fictional and created for the purpose of this exercise. They do not represent any real individual or patient. Your response should be in the manner of a therapist, using the tone and style suggested by these excerpts but without copying any specific details from them.

Hypothetical Conversation Excerpts:
1. Client: "I just feel overwhelmed sometimes."
   Therapist: "It sounds like you're carrying a lot on your shoulders. Can you tell me more about what's overwhelming you?"

2. Client: "I'm not sure if I'm making the right choices."
   Therapist: "Making decisions can be challenging. What options are you currently considering?"

"""


system_prompt = PromptTemplate(
    template=system_message_template,
    input_variables=['user_problems']
)

system_message_prompt = SystemMessagePromptTemplate(
    prompt=system_prompt
)

# system_prompt = system_message_prompt.format(user_problems=user_problems)
first_messsage_template = """


"""

In [117]:
messages = [
    SystemMessage(content=system_message_template),
    AIMessage(content="Good morning! I'm Dr. Ellis. It's nice to meet you. Before we get started, how are you feeling about being here today?"),
    
    
]

In [128]:
new_message = "I have a feeling that everyone in my class is smarter than me. This causes me a lot of stress when presenting or solving exercises in front of my classmates."# "I have had some problems lately. I find it hard to relax after school. I can't let go of the stress."#"Hi, Dr. Ellis. I'm a bit nervous, honestly. I've never been to therapy before."
messages.append(HumanMessage(content=new_message))

In [129]:
ai_message = chat(messages)
ai_message

AIMessage(content="It can be really challenging when we compare ourselves to others and feel like we don't measure up. It sounds like this comparison is causing you a lot of stress, especially when it comes to presenting or solving exercises in front of your classmates. Can you tell me more about what thoughts or feelings come up when you're in those situations?")

In [130]:
messages.append(ai_message)

In [132]:
# save messages history to a file
with open('./data/chat_history.txt', 'a') as f:
    for message in messages:
        if isinstance(message, AIMessage):
            f.write(f'AI: {message.content}\n')
        elif isinstance(message, HumanMessage):
            f.write(f'Human: {message.content}\n')
        elif isinstance(message, SystemMessage):
            f.write(f'System: {message.content}\n')
        else:
            raise ValueError(f'Unknown message type: {type(message)}')