In [2]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

In [3]:
# Point a plain-text loader at the first conversation transcript.
loader = TextLoader(
    file_path='./data/conversations/conversation_0.txt',
)

In [4]:
# Read the transcript; the result is indexable and each item exposes
# `.page_content` (used by the inspection cell that follows).
data = loader.load()

In [5]:
# Sanity-check the load: document count, size of the first document,
# and a 200-character preview of its text.
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your sample document')
print(f'Here is a sample: {data[0].page_content[:200]}')

You have 1 document(s) in your data
There are 4710 characters in your sample document
Here is a sample: Therapist: Thanks for filling it out. We give this form to everyone once a year regardless of why they come in. It helps us provide better care. Is it okay if I take a look at what you put down?
Clien


# Create chunks from the therapy transcripts

In [8]:
# Break the transcript into overlapping chunks. Sizes are measured by the
# splitter's length function (characters by default — confirm if customized).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=150,
)
texts = text_splitter.split_documents(data)

In [9]:
# Report how many chunks the splitter produced.
print(f'You have {len(texts)} documents in your data')

You have 17 documents in your data


In [13]:
# Display the first chunk (text plus source metadata) as a spot check.
texts[0]

Document(page_content="Therapist: Thanks for filling it out. We give this form to everyone once a year regardless of why they come in. It helps us provide better care. Is it okay if I take a look at what you put down?\nClient: Sure.\nTherapist: So, let's see. It looks that you put-- You drink alcohol at least four times a week on average-\nClient: Mm-hmm.", metadata={'source': './data/conversations/conversation_0.txt'})

# Generate summaries from chunks using the fine-tuned flan-T5-base model

In [14]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM


In [15]:
# Identifier of the fine-tuned summarization checkpoint passed to
# `from_pretrained` when loading the tokenizer and model.
model_id = 'jruranski/flan-t5-base-samsum'

In [16]:
# PyTorch's device type for NVIDIA GPUs is 'cuda'; 'gpu' is not a valid device
# string and makes torch.device() raise on machines where CUDA is available.
# NOTE(review): `device` is not used by the visible cells — the model stays on
# its default device; pass `device` to the pipeline if GPU inference is wanted.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the seq2seq model weights for `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [18]:
# Bundle the model and tokenizer into a text-to-text generation pipeline.
# max_length=512 is forwarded as a generation setting for the produced text.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

In [19]:
# Wrap the transformers pipeline in LangChain's LLM wrapper so it can be
# called directly with a text string.
local_llm = HuggingFacePipeline(pipeline=pipe)

In [22]:
# Smoke test: summarize the first chunk and show the result.
sample_summary = local_llm(texts[0].page_content)
print("Sample summary: ", sample_summary)

Sample summary:  Client filled out the form. She drinks alcohol at least four times a week on average.


# Store the summaries and dialogs in a vector store (Chroma)