**Step 1: Set Up Google Colab Environment**

In [None]:
# Install required libraries
!pip install transformers datasets torch PyMuPDF pdfplumber spacy
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client
!python -m spacy download en_core_web_sm

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_

**Step 2: Extract Text from PDF Files**

This step extracts and cleans text from your PDF files.


In [None]:
import fitz  # PyMuPDF
import re
import os
import spacy

# Folder on the mounted Google Drive that holds the source PDFs
pdf_path = '/content/drive/My Drive/Sleep/'

# Load spaCy's small English pipeline once so later cells can reuse it
nlp = spacy.load("en_core_web_sm")

# Function to clean extracted text
def clean_text(text):
    """Strip citation markers, caption lines and the reference list from text.

    Args:
        text: Raw text of one document.

    Returns:
        The text with the following removed:
          * bracketed numeric citations such as ``[3]``, ``[1, 2]`` or ``[4-6]``;
          * whole lines that start with ``Table N:`` or ``Figure N:``;
          * everything from a line consisting only of ``References`` (or
            ``REFERENCES``) to the end of the text.
    """
    # Citations: one number, or several joined by commas, hyphens or en dashes.
    text = re.sub(r'\[\d+(?:\s*[,\-\u2013]\s*\d+)*\]', '', text)
    # Captions: anchored to the start of a line so an in-sentence mention
    # ("... shown in Table 2: ...") does not lose the rest of its line; the
    # end-of-text alternative also catches a caption with no trailing newline.
    text = re.sub(r'^[ \t]*(?:Table|Figure)\s*\d+:.*(?:\n|\Z)', '', text,
                  flags=re.MULTILINE)
    # Reference list: only a heading on its own line counts, so a wrapped prose
    # line that happens to end in "References" no longer truncates the document.
    text = re.sub(r'^[ \t]*(?:References|REFERENCES)[ \t]*\n.*', '', text,
                  flags=re.MULTILINE | re.DOTALL)
    return text

# Function to extract and clean text from PDFs
def extract_text_from_pdfs(pdf_directory):
    """Read every PDF in a directory and return its cleaned text.

    Args:
        pdf_directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        A list with one cleaned text string per PDF, ordered by file name.
    """
    all_texts = []
    # Sort so the result order is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(pdf_directory)):
        # Case-insensitive check so files named e.g. "paper.PDF" are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        with fitz.open(os.path.join(pdf_directory, filename)) as doc:
            # join() avoids rebuilding the string once per page.
            text = "".join(page.get_text() for page in doc)
        all_texts.append(clean_text(text))
    return all_texts

# Extract text from all PDFs in the directory.
# pdf_texts ends up holding one cleaned text string per PDF found in pdf_path.
pdf_texts = extract_text_from_pdfs(pdf_path)

**Step 3: Identify and Extract Key Points**

This step identifies key points from the extracted text related to sleep and its various associations.

In [None]:
# Define keywords related to your topics of interest
keywords = [
    # One phrase per line; order is kept as originally written.
    "obesity",
    "blood pressure",
    "cognitive burnout",
    "cognitive performance",
    "PTSD",
    "stress",
    "anxiety",
    "sleep stages",
    "Parkinson",
    "Alzheimer",
    "cognitive load",
    "HRV",
    "cardiac problems",
    "sleep apnea",
    "eating disorders",
    "post partum depression",
    "pregnancy",
    "alcohol",
    "sleep quality",
    "depression",
    "breastfeeding",
    "cognitive decline",
    "mental health",
    "optimizing sleep performance",
    "exercise",
    "smoking",
    "diet",
    "illness",
    "Covid19",
    "cancer",
    "breast cancer",
    "resting heart rate",
    "heart rate variability",
    "meditation",
    "Alzheimer’s disease",
    "Sleep Deprivation",
]

# Function to extract key sentences based on keywords
def extract_key_sentences(text, nlp, keywords=None):
    """Return the sentences of ``text`` that mention any of ``keywords``.

    Args:
        text: Document text to scan.
        nlp: Callable that turns ``text`` into a document exposing ``.sents``
            (in this notebook, the loaded spaCy pipeline).
        keywords: Optional iterable of phrases. Matching is a case-insensitive
            substring test against each sentence.

    Returns:
        A list of sentence strings. With keywords, only matching sentences are
        returned. Without keywords (``None`` or empty), the function falls back
        to keeping sentences whose root token is labelled ``ROOT`` or ``advcl``.
    """
    # Lower-case the keywords once: sentences are lower-cased before the
    # comparison, so entries such as "PTSD" or "HRV" could never match as written.
    lowered_keywords = [keyword.lower() for keyword in keywords] if keywords else []

    key_sentences = []
    for sentence in nlp(text).sents:
        if lowered_keywords:
            # With keywords the keyword test is the only filter. Falling through
            # to the root-label check for non-matching sentences (as before)
            # kept practically every sentence, because a sentence's root token
            # normally carries the ROOT label.
            sentence_lower = sentence.text.lower()
            if any(keyword in sentence_lower for keyword in lowered_keywords):
                key_sentences.append(sentence.text)
        elif sentence.root.dep_ in ("advcl", "ROOT"):
            key_sentences.append(sentence.text)
    return key_sentences

# Extract key sentences from the text.
# Sentences from every document are pooled into one flat list; which document
# a sentence came from is not recorded.
key_sentences_list = []
for text in pdf_texts:
    key_sentences = extract_key_sentences(text, nlp, keywords=keywords)
    key_sentences_list.extend(key_sentences)

**Step 4: Convert Key Points into a Conversational Format**


This step converts the extracted key points into a conversational format suitable for fine-tuning.

In [None]:
!pip install datasets



In [None]:
import torch
from transformers import pipeline

# Initialize summarization model.
# Pass `device` explicitly: without it the pipeline keeps the model on the CPU
# even when the runtime has a GPU (index 0 = first GPU, -1 = CPU).
summarizer_device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=summarizer_device)

# Function to rewrite key points as conversational examples
def rewrite_as_conversation(key_sentences, summarizer):
    """Turn key sentences into chat-style fine-tuning examples.

    Each usable sentence yields one example made of a fixed system message, a
    templated user question and a templated assistant answer. Sentences longer
    than 20 words are shortened with ``summarizer`` first.

    Args:
        key_sentences: Iterable of sentence strings.
        summarizer: Callable used as
            ``summarizer(text, **kwargs)[0]['summary_text']`` (the Hugging Face
            summarization pipeline convention).

    Returns:
        A list of ``{"messages": [...]}`` dicts. Blank sentences are skipped;
        sentences that raise during processing are reported with ``print`` and
        skipped rather than aborting the run.
    """
    min_summary_length = 10
    max_summary_length = 30
    long_sentence_words = 20   # summarize only above this word count
    preview_chars = 80         # cap on how much of a failing sentence is printed

    conversation_examples = []
    for sentence in key_sentences:
        # Collapse all whitespace runs: text extracted from PDFs carries hard
        # line breaks inside sentences, which should not reach the examples.
        words = sentence.split()
        if not words:
            continue
        sentence = " ".join(words)
        input_length = len(words)

        try:
            if input_length > long_sentence_words:
                # Cap the summary at the input's own word count. The previous
                # min(input_length + 10, 30) was always 30 in this branch and
                # could exceed the input length, triggering pipeline warnings.
                max_length = min(input_length, max_summary_length)
                summary = summarizer(
                    sentence,
                    max_length=max_length,
                    min_length=min_summary_length,
                    do_sample=False,
                    # Clip over-long inputs to the model's limit instead of
                    # letting them fail inside the model.
                    truncation=True,
                )[0]['summary_text']
            else:
                summary = sentence

            summary = summary.strip()
            user_prompt = f"What do studies say about {summary.split()[0]}?"
            # Drop the summary's own closing punctuation so the template's
            # final period is not doubled.
            assistant_response = f"Research indicates that {summary.lower().rstrip(' .!?')}."

            conversation_examples.append({
                "messages": [
                    {"role": "system", "content": "You are an AI health coach specializing in sleep."},
                    {"role": "user", "content": user_prompt},
                    {"role": "assistant", "content": assistant_response}
                ]
            })

        except IndexError as e:
            # Raised e.g. by an empty summary; print only a short preview so a
            # page-long "sentence" does not flood the cell output.
            print(f"An IndexError occurred with the sentence: {sentence[:preview_chars]}")
            print(f"Error: {e}")
        except Exception as e:
            # Deliberate best-effort: report and move on to the next sentence.
            print(f"An unexpected error occurred: {e}")

    return conversation_examples

# Generate conversational examples from key sentences.
# conversational_data is a list of {"messages": [...]} dicts, one for each
# non-blank sentence that was processed without error.
conversational_data = rewrite_as_conversation(key_sentences_list, summarizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outp

An IndexError occurred with the sentence: *2.23
(4.24, 0.22)
*1.62
(3.23, 0.02)
0.81
(2.45, 0.84)
0.59
(1.99, 0.81)
0.52
(1.87, 0.82)
0.37
(1.92, 1.18)
0.28
(1.83, 1.27)
0.19
(1.29, 0.92)
0.24
(2.11, 1.62)
0.18
(1.59, 1.22)
0.21
(1.77, 1.35)
*2.49
(4.81, 0.16)
*2.31
(4.40, 0.22)
*1.70
(3.41, 0.00)
0.89
(2.63, 0.85)
0.67
(2.18, 0.84)
0.60
(2.06, 0.86)
0.45
(2.11, 1.20)
0.36
(2.01, 1.29)
0.27
(1.68, 1.14)
0.32
(2.28, 1.63)
0.27
(1.79, 1.25)
0.29
(1.95, 1.37)
*2.67
(5.01, 0.33)
*2.49
(4.60, 0.39)
*1.89
(3.61, 0.16)
1.07
(2.83, 0.69)
0.85
(2.38, 0.69)
0.78
(2.27, 0.70)
0.63
(2.31, 1.04)
0.55
(2.22, 1.13)
0.45
(1.88, 0.98)
0.51
(2.48, 1.47)
0.45
(1.99, 1.10)
0.47
(2.15, 1.21)
*2.66
(4.80, 0.52)
*2.49
(4.37, 0.61)
*1.88
(3.32, 0.44)
1.07
(2.55, 0.41)
0.85
(2.05, 0.36)
0.78
(1.92, 0.36)
0.63
(2.01, 0.75)
0.54
(1.92, 0.83)
0.45
(1.51, 0.62)
0.50
(2.23, 1.22)
0.44
(1.66, 0.77)
0.47
(1.85, 0.

Your max_length is set to 30, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
