# Import Libraries

In [19]:
import pandas as pd
from datetime import datetime

# Sleep Health and Lifestyle Dataset

In [None]:
# Load the CSV file
sleep_health_df = pd.read_csv("/kaggle/input/sleep-health-and-lifestyle-dataset/Sleep_health_and_lifestyle_dataset.csv")

In [None]:
# View the first few rows
print("Head of the DataFrame:")
print(sleep_health_df.head())

In [None]:
# Print the column names
print("\nColumn names of the DataFrame:")
print(sleep_health_df.columns)

In [None]:
# Concise summary of the DataFrame.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nInfo of the DataFrame:")
sleep_health_df.info()

In [None]:
# Check for missing values
print("\nMissing values per column:")
print(sleep_health_df.isnull().sum())

In [None]:
# fill null values with "None"
sleep_health_df["Sleep Disorder"] = sleep_health_df["Sleep Disorder"].fillna("None")

# verify changes
print(sleep_health_df["Sleep Disorder"])

# Sleep Efficiency Dataset

In [20]:
# Load the CSV file
sleep_efficiency_df = pd.read_csv("/kaggle/input/sleep-efficiency/Sleep_Efficiency.csv")

In [None]:
# View the first few rows
print("Head of the DataFrame:")
print(sleep_efficiency_df.head())

In [None]:
# Print the column names
print("\nColumn names of the DataFrame:")
print(sleep_efficiency_df.columns)

In [None]:
# Concise summary of the DataFrame.
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nInfo of the DataFrame:")
sleep_efficiency_df.info()

In [None]:
# Check for missing values
print("\nMissing values per column:")
print(sleep_efficiency_df.isnull().sum())

In [None]:
rows_with_nulls = sleep_efficiency_df[sleep_efficiency_df.isnull().any(axis=1)]
# print(rows_with_nulls)

In [None]:
# replace null values with 0.0
columns_to_replace = ["Awakenings", "Caffeine consumption", "Alcohol consumption", "Exercise frequency"]
sleep_efficiency_df[columns_to_replace] = sleep_efficiency_df[columns_to_replace].fillna(0.0)

# verify changes
print(sleep_efficiency_df[columns_to_replace])

### Prompt Input Sleep Data Format

Sleep Summary - ('Bedtime', 'Wakeup time', 'Sleep duration',
       'Sleep efficiency', 'REM sleep percentage', 'Deep sleep percentage',
       'Light sleep percentage', 'Awakenings', 'Caffeine consumption',
       'Alcohol consumption', 'Smoking status', 'Exercise frequency')


## Prompt A.1 - insights

In [None]:
sleep_prompt_1 = '''You are a sleep medicine expert. You are given the following sleep data.
The user is <gender>, <age> years old.
Sleep Summary: 
Bedtime: <Bedtime>
Wakeup time: <Wakeup time>
Sleep duration: <Sleep duration>
Sleep efficiency: <Sleep efficiency>
REM sleep percentage: <REM sleep percentage>
Deep sleep percentage: <Deep sleep percentage>
Light sleep percentage: <Light sleep percentage>
Awakenings: <Awakenings>
Caffeine consumption: <Caffeine consumption>
Alcohol consumption: <Alcohol consumption>
Smoking status: <Smoking status>
Exercise frequency: <Exercise frequency>

List the most important insights. Identify all of the patterns of data that are likely out of the preferred range. Make sure to consider various sleep health dimensions: Routine, Sleep Quality, Alertness, Timing, Efficiency, and Duration. Add a heading for each dimension. Optionally (only do this if extremely important) add a heading called Other for anything else that doesn't fit the above categories. For Routine, consider the average bedtime, wake time, midsleep point and standard deviations of these, focus on the consistency of the routine, not timing. For Sleep Quality, consider light sleep duration, deep sleep duration, REM sleep duration, sleep score, restlessness score, time to quality sleep, and wake time after sleep onset. For Alertness, consider the number of naps and nap length. For Timing, consider midsleep point, bedtime, wake time, make any comments on weekend vs. workday. For Efficiency, consider sleep efficiency, wake time after sleep onset, and time to quality sleep, describe how they compare to similar users. For Duration, consider average sleep duration, weekend vs. workday sleep durations and standard deviations, describe how they compare to similar users. When determining whether a metric is normal or abnormal, always provide the corresponding percentile. Avoid generic statements. Avoid incorrect knowledge, inconsistencies and contradictions. Don't mention "the user". Talk like you're speaking directly to someone. Be concise.
# Sleep insights report
'''
print(sleep_prompt_1)

## Prompt A.2 - etiology

In [None]:
sleep_prompt_2 = '''You are a sleep medicine expert. You are given the following sleep data. 
The user is <gender>, <age> years old. 
Sleep Summary:
Bedtime: <Bedtime>
Wakeup time: <Wakeup time>
Sleep duration: <Sleep duration>
Sleep efficiency: <Sleep efficiency>
REM sleep percentage: <REM sleep percentage>
Deep sleep percentage: <Deep sleep percentage>
Light sleep percentage: <Light sleep percentage>
Awakenings: <Awakenings>
Caffeine consumption: <Caffeine consumption>
Alcohol consumption: <Alcohol consumption>
Smoking status: <Smoking status>
Exercise frequency: <Exercise frequency>

Based on the data, we can get the following insights: 
<insights response>

What are the underlying causes? Make sure to consider the following causes: Circadian rhythm, Homeostatic drive, Psychophysiologic hyperarousal, and Extrinsic factors. Order the causes from most to least relevant. Identify the likelihood of the causes (e.g. unlikely, possible, very likely). Cite relevant data and insights, for example, "consistently low sleep efficiency despite normal sleep durations suggests low homeostatic drive". Avoid diagnosing health conditions. Avoid providing recommendations. Avoid generic statements. Avoid incorrect knowledge, inconsistencies and contradictions. Don't mention "the user". Talk like you're speaking directly to someone. Be concise. 
# Causes report
'''
print(sleep_prompt_2)

## Prompt A.3 - recommendations

In [21]:
sleep_prompt_3 = '''You are a sleep medicine expert. You are given the following sleep data. 
The user is <gender>, <age> years old. 
Sleep Summary: 
Bedtime: <Bedtime>
Wakeup time: <Wakeup time>
Sleep duration: <Sleep duration>
Sleep efficiency: <Sleep efficiency>
REM sleep percentage: <REM sleep percentage>
Deep sleep percentage: <Deep sleep percentage>
Light sleep percentage: <Light sleep percentage>
Awakenings: <Awakenings>
Caffeine consumption: <Caffeine consumption>
Alcohol consumption: <Alcohol consumption>
Smoking status: <Smoking status>
Exercise frequency: <Exercise frequency>

Based on the data, we can get the following insights: 
<insights response> 
Causes: 
<etiology response> 

What recommendation(s) can you provide to help this user improve their sleep? Tie recommendations to the very likely and possible causes, for example, "Recommendations to address Circadian rhythm". Tie recommendations to user's sleep data such as average bedtime, average wake time, and number of naps, and recommend a goal bedtime and wake time based on their data. The recommendations should be time-bound, for example for the next week or the next month. Write one short question to ask the user in order to better understand their sleep. Avoid assumptions regarding the trainee's lifestyle or behavioral choices. Avoid generic statements. Avoid incorrect knowledge, inconsistencies and contradictions. Don't mention "the user". Talk like you're speaking directly to someone. Be concise. 
# Recommendations report
'''
print(sleep_prompt_3)

You are a sleep medicine expert. You are given the following sleep data. 
The user is <gender>, <age> years old. 
Sleep Summary: 
Bedtime: <Bedtime>
Wakeup time: <Wakeup time>
Sleep duration: <Sleep duration>
Sleep efficiency: <Sleep efficiency>
REM sleep percentage: <REM sleep percentage>
Deep sleep percentage: <Deep sleep percentage>
Light sleep percentage: <Light sleep percentage>
Awakenings: <Awakenings>
Caffeine consumption: <Caffeine consumption>
Alcohol consumption: <Alcohol consumption>
Smoking status: <Smoking status>
Exercise frequency: <Exercise frequency>

Based on the data, we can get the following insights: 
<insights response> 
Causes: 
<etiology response> 

What recommendation(s) can you provide to help this user improve their sleep? Tie recommendations to the very likely and possible causes, for example, "Recommendations to address Circadian rhythm". Tie recommendations to user's sleep data such as average bedtime, average wake time, and number of naps, and recommend a

## Prompt A.6 - demographics

In [None]:
fitness_prompt_1 = '''You are a NSCA and ACSM board-certified fitness trainer who specializes in athlete training performance and recovery. 
Age: <Age>  
Gender: <Gender>
Sleep Duration: <Sleep Duration>
Quality of Sleep: <Quality of Sleep>
Physical Activity Level: <Physical Activity Level>
Stress Level: <Stress Level>
BMI Category: <BMI Category> 
Blood Pressure: <Blood Pressure>
Heart Rate: <Heart Rate>
Daily Steps: <Daily Steps>

Are there any special precautions that should be taken into account when recommending a fitness program to avoid injury? Comment if the trainee has exceptional demographics (e.g. very old, very high BMI, very low BMI) that require special considerations. Write a single sentence. Avoid mentioning diseases
'''
print(fitness_prompt_1)

## Generating dataset for prompt A1

In [None]:
# Build the A1 (insights) rows: one filled-in copy of sleep_prompt_1 per record
# of sleep_efficiency_df, each paired with an empty Response column.

# (placeholder, source column) pairs, listed in the order they are substituted.
# The first two placeholders are lower-case in the template while the columns
# are capitalised; every other placeholder is the column name in angle brackets.
a1_substitutions = [
    ('<gender>', 'Gender'),
    ('<age>', 'Age'),
    ('<Bedtime>', 'Bedtime'),
    ('<Wakeup time>', 'Wakeup time'),
    ('<Sleep duration>', 'Sleep duration'),
    ('<Sleep efficiency>', 'Sleep efficiency'),
    ('<REM sleep percentage>', 'REM sleep percentage'),
    ('<Deep sleep percentage>', 'Deep sleep percentage'),
    ('<Light sleep percentage>', 'Light sleep percentage'),
    ('<Awakenings>', 'Awakenings'),
    ('<Caffeine consumption>', 'Caffeine consumption'),
    ('<Alcohol consumption>', 'Alcohol consumption'),
    ('<Smoking status>', 'Smoking status'),
    ('<Exercise frequency>', 'Exercise frequency'),
]

# Fill the template once per record
prompt_list = []
for _, record in sleep_efficiency_df.iterrows():
    prompt = sleep_prompt_1
    for placeholder, column in a1_substitutions:
        prompt = prompt.replace(placeholder, str(record[column]))
    prompt_list.append(prompt)

# Every row shares the same category; IDs are the DataFrame index shifted to start at 1
category_list = ['A1'] * len(sleep_efficiency_df)
id_list = list(sleep_efficiency_df.index + 1)

# Assemble the output frame; Response is left blank for now
new_df = pd.DataFrame({
    'Category': category_list,
    'ID': id_list,
    'Prompt': prompt_list,
    'Response': '',
})

# Persist the A1 rows
new_df.to_csv("PH-LLM Custom Dataset.csv", index=False)

In [None]:
custom_dataset_df = new_df

## Generating dataset for prompt A6 

In [None]:
# Build the A6 (demographics) rows: one filled-in copy of fitness_prompt_1 per
# record of sleep_health_df, then append them to custom_dataset_df.

# Columns substituted into the template, in substitution order. Each
# placeholder in fitness_prompt_1 is the column name wrapped in angle brackets.
a6_columns = [
    'Age',
    'Gender',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'BMI Category',
    'Blood Pressure',
    'Heart Rate',
    'Daily Steps',
]

# Fill the template once per record
prompt_list = []
for _, record in sleep_health_df.iterrows():
    prompt = fitness_prompt_1
    for column in a6_columns:
        prompt = prompt.replace(f'<{column}>', str(record[column]))
    prompt_list.append(prompt)

# Every row shares the same category; IDs are the DataFrame index shifted to start at 1
category_list = ['A6'] * len(sleep_health_df)
id_list = list(sleep_health_df.index + 1)

# Assemble the A6 frame; Response is left blank for now
new_df = pd.DataFrame({
    'Category': category_list,
    'ID': id_list,
    'Prompt': prompt_list,
    'Response': '',
})

# Stack the A6 rows underneath the rows already collected, renumbering the index
custom_dataset_df = pd.concat([custom_dataset_df, new_df], ignore_index=True)

# Write the combined dataset as UTF-8 without the index column
custom_dataset_df.to_csv("PH-LLM Custom Dataset.csv", index=False, encoding='utf-8')


In [None]:
custom_dataset_df.info()

# Automate prompt-response data creation for A1 and A6

## Response generation pipeline using Gemini API

In [42]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GEMINI_KEY = user_secrets.get_secret("GEMINI_KEY")

In [43]:
pip install -q -U google-genai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [44]:
from google import genai

client = genai.Client(api_key=GEMINI_KEY)

def generate_response(prompt):
    """Send ``prompt`` to the Gemini model and return the text of its reply.

    Uses the notebook-level ``client`` and the ``gemini-2.0-flash`` model. The
    prompt is forwarded unchanged as the request contents.
    """
    # A system prompt was drafted but is currently not sent with the request.
    # Kept here for reference (enable if necessary):
    #   "Treat the provided sleep data as an average over many nights, even if only one night's data is shown. "
    #   "Do not mention any insufficiency of data; instead, assume that the same data is consistently repeated and averaged over a large dataset. "
    #   "Provide a complete, expert-level analysis with actionable insights, and omit any unnecessary introductory or prefix sentences."
    reply = client.models.generate_content(
        contents=prompt,
        model="gemini-2.0-flash",
    )
    return reply.text

In [None]:
df = pd.read_csv("/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset (601).csv", encoding="utf-8")

In [None]:
df["Response"][600]

In [45]:
def generate_responses(start_row: int, end_row: int, cur_df: pd.DataFrame) -> None:
    """Generate responses for a range of rows and store them in ``cur_df``.

    Calls ``generate_response`` on the ``"Prompt"`` of every row whose index
    label lies in ``start_row`` (inclusive) .. ``end_row`` (exclusive) and
    writes the result into that row's ``"Response"`` cell. ``cur_df`` is
    modified in place; nothing is returned.

    Each response is written as soon as it is received. If a call raises part
    way through the range, the exception propagates, but the responses already
    obtained remain in ``cur_df`` and the run can resume from the failed row.
    """
    # .loc slices by label and includes both endpoints, hence end_row - 1
    prompts = cur_df.loc[start_row:end_row - 1, "Prompt"]
    for label, prompt in prompts.items():
        cur_df.at[label, "Response"] = generate_response(prompt)

In [None]:
df['Response'].isnull().sum()

In [None]:
# Save the updated DataFrame to a new CSV file
df.to_csv(f"PH-LLM Custom Dataset ({df['Response'].count()}).csv", index=False)
print("CSV file saved")

# Add prompts from sleep-prompt-2 by chaining the response of prompt-1

In [None]:
temp_df = df

In [None]:
print(sleep_prompt_2)

In [None]:
# Build the A2 (etiology) rows by chaining each A1 response into sleep_prompt_2.

# (placeholder, source column) pairs, listed in the order they are substituted
a2_substitutions = [
    ("<gender>", 'Gender'),
    ("<age>", 'Age'),
    ("<Bedtime>", 'Bedtime'),
    ("<Wakeup time>", 'Wakeup time'),
    ("<Sleep duration>", 'Sleep duration'),
    ("<Sleep efficiency>", 'Sleep efficiency'),
    ("<REM sleep percentage>", 'REM sleep percentage'),
    ("<Deep sleep percentage>", 'Deep sleep percentage'),
    ("<Light sleep percentage>", 'Light sleep percentage'),
    ("<Awakenings>", 'Awakenings'),
    ("<Caffeine consumption>", 'Caffeine consumption'),
    ("<Alcohol consumption>", 'Alcohol consumption'),
    ("<Smoking status>", 'Smoking status'),
    ("<Exercise frequency>", 'Exercise frequency'),
]

# Prepare a list to hold new records, and one for IDs that could not be chained
new_records = []
skipped_ids = []

# Iterate through records in category "A1"
for index, row in temp_df[temp_df['Category'] == 'A1'].iterrows():
    id_value = row['ID']
    position = id_value - 1  # ID in sleep_efficiency_df is index + 1

    # Require 1 <= ID <= len. Checking only the upper bound would let an ID of
    # 0 or below reach .iloc as a negative position, which wraps around and
    # silently returns a row from the end of the frame.
    if not 0 <= position < len(sleep_efficiency_df):
        skipped_ids.append(id_value)
        continue

    # The A1 response becomes the "insights" section of the A2 prompt. A
    # response that was never generated is NaN once the dataset has been
    # reloaded from CSV (or '' in memory); str() would embed it as "nan".
    insights_response = row['Response']
    if pd.isna(insights_response) or not str(insights_response).strip():
        skipped_ids.append(id_value)
        continue

    sleep_data = sleep_efficiency_df.iloc[position]

    # Construct the prompt by replacing placeholders with actual values;
    # the insights text goes in last, after all the sleep-data fields
    prompt = sleep_prompt_2
    for placeholder, column in a2_substitutions:
        prompt = prompt.replace(placeholder, str(sleep_data[column]))
    prompt = prompt.replace("<insights response>", str(insights_response))

    # Create a new record for category A2
    new_records.append({
        'Category': 'A2',
        'ID': id_value,
        'Prompt': prompt,
        'Response': '',  # Response is generated in a later step
    })

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} A1 record(s) with an out-of-range ID or no response: {skipped_ids}")

# Create a DataFrame from new records and append it to the original DataFrame
new_df = pd.DataFrame(new_records)
temp_df = pd.concat([temp_df, new_df], ignore_index=True)

# Save the updated DataFrame to a new CSV file
temp_df.to_csv('PH-LLM Custom Dataset A1-A2-A6.csv', index=False)

In [None]:
temp_df["Response"].isnull().sum()

# Automate Prompt-response data creation for A2

In [None]:
df = pd.read_csv("/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset A1-A2-A6.csv", encoding="utf-8")

In [None]:
generate_responses(1270, 1278, df)

In [None]:
df["Response"][1277]

In [None]:
print(df['Response'].isnull().sum(),
      df['Response'].count())

In [None]:
# Save the updated DataFrame to a new CSV file
df.to_csv(f"PH-LLM Custom Dataset ({df['Response'].count()}).csv", index=False)
print("CSV file saved")

# Add prompts from sleep-prompt-3 by chaining the response of prompt-1 and prompt-2

In [22]:
temp_df = pd.read_csv("/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset (1278).csv", encoding="utf-8")

In [23]:
print(sleep_prompt_3)

You are a sleep medicine expert. You are given the following sleep data. 
The user is <gender>, <age> years old. 
Sleep Summary: 
Bedtime: <Bedtime>
Wakeup time: <Wakeup time>
Sleep duration: <Sleep duration>
Sleep efficiency: <Sleep efficiency>
REM sleep percentage: <REM sleep percentage>
Deep sleep percentage: <Deep sleep percentage>
Light sleep percentage: <Light sleep percentage>
Awakenings: <Awakenings>
Caffeine consumption: <Caffeine consumption>
Alcohol consumption: <Alcohol consumption>
Smoking status: <Smoking status>
Exercise frequency: <Exercise frequency>

Based on the data, we can get the following insights: 
<insights response> 
Causes: 
<etiology response> 

What recommendation(s) can you provide to help this user improve their sleep? Tie recommendations to the very likely and possible causes, for example, "Recommendations to address Circadian rhythm". Tie recommendations to user's sleep data such as average bedtime, average wake time, and number of naps, and recommend a

In [33]:
# Build the A3 (recommendations) rows by chaining each record's A1 (insights)
# and A2 (etiology) responses into sleep_prompt_3.

# (placeholder, source column) pairs, listed in the order they are substituted
a3_substitutions = [
    ("<gender>", 'Gender'),
    ("<age>", 'Age'),
    ("<Bedtime>", 'Bedtime'),
    ("<Wakeup time>", 'Wakeup time'),
    ("<Sleep duration>", 'Sleep duration'),
    ("<Sleep efficiency>", 'Sleep efficiency'),
    ("<REM sleep percentage>", 'REM sleep percentage'),
    ("<Deep sleep percentage>", 'Deep sleep percentage'),
    ("<Light sleep percentage>", 'Light sleep percentage'),
    ("<Awakenings>", 'Awakenings'),
    ("<Caffeine consumption>", 'Caffeine consumption'),
    ("<Alcohol consumption>", 'Alcohol consumption'),
    ("<Smoking status>", 'Smoking status'),
    ("<Exercise frequency>", 'Exercise frequency'),
]

# Prepare a list to hold new records, and one for IDs that could not be chained
new_records = []
skipped_ids = []

a1_rows = temp_df[temp_df['Category'] == 'A1']
a2_rows = temp_df[temp_df['Category'] == 'A2']

# Pair A1 and A2 rows by ID rather than by row position: zipping the two
# subsets would silently mis-pair responses (or drop trailing rows) whenever
# they differ in order or length.
etiology_by_id = dict(zip(a2_rows['ID'], a2_rows['Response']))

# Iterate through records in category "A1"
for index1, row1 in a1_rows.iterrows():
    id_value = row1['ID']
    position = id_value - 1  # ID in sleep_efficiency_df is index + 1

    # Require 1 <= ID <= len. Checking only the upper bound would let an ID of
    # 0 or below reach .iloc as a negative position, which wraps around and
    # silently returns a row from the end of the frame.
    if not 0 <= position < len(sleep_efficiency_df):
        skipped_ids.append(id_value)
        continue

    insights_response = row1['Response']
    etiology_response = etiology_by_id.get(id_value)  # None when no A2 row has this ID

    # Both upstream responses are required. A missing one is None/NaN (or '')
    # and str() would otherwise embed it in the prompt as "None"/"nan".
    if any(
        pd.isna(text) or not str(text).strip()
        for text in (insights_response, etiology_response)
    ):
        skipped_ids.append(id_value)
        continue

    sleep_data = sleep_efficiency_df.iloc[position]

    # Construct the prompt by replacing placeholders with actual values;
    # the two chained responses go in last, insights before etiology
    prompt = sleep_prompt_3
    for placeholder, column in a3_substitutions:
        prompt = prompt.replace(placeholder, str(sleep_data[column]))
    prompt = prompt.replace("<insights response>", str(insights_response))
    prompt = prompt.replace("<etiology response>", str(etiology_response))

    # Create a new record for category A3
    new_records.append({
        'Category': 'A3',
        'ID': id_value,
        'Prompt': prompt,
        'Response': '',  # Response is generated in a later step
    })

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} A1 record(s) with an out-of-range ID or a missing A1/A2 response: {skipped_ids}")

# Create a DataFrame from new records and append it to the original DataFrame
new_df = pd.DataFrame(new_records)
temp_df = pd.concat([temp_df, new_df], ignore_index=True)

# Save the updated DataFrame to a new CSV file
temp_df.to_csv('PH-LLM Custom Dataset A1-A2-A3-A6.csv', index=False)

In [34]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  1730 non-null   object
 1   ID        1730 non-null   int64 
 2   Prompt    1730 non-null   object
 3   Response  1730 non-null   object
dtypes: int64(1), object(3)
memory usage: 54.2+ KB


In [36]:
temp_df.isnull().sum()

Category    0
ID          0
Prompt      0
Response    0
dtype: int64

# Automate Prompt-response data creation for A3

In [37]:
df = pd.read_csv("/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset A1-A2-A3-A6.csv", encoding="utf-8")

In [150]:
generate_responses(1720, 1730, df)

In [151]:
df["Response"][1729]

"Here's a plan to help improve your sleep, focusing on the most likely causes:\n\n**Recommendations to address Circadian Rhythm Disruption (Next Month):**\n\n*   **Goal:** Shift your bedtime earlier by 15-30 minutes *each* day until you reach a target bedtime of 11:00 PM. Simultaneously, shift your wake time earlier by 15-30 minutes *each* day until you reach a target wake time of 7:00 AM.\n*   **Action:** \n    *   **Light Exposure:** Expose yourself to bright light (ideally sunlight) immediately upon waking at 7:00 AM. This helps anchor your circadian rhythm. Avoid bright light (especially from screens) for at least 1-2 hours before your 11:00 PM bedtime.\n    *   **Consistent Schedule:** Even on weekends, try to stick to your target bedtime and wake time within a 30-minute window. Consistency is key!\n\n**Recommendations to address Extrinsic Factors (Next Week):**\n\n*   **Goal:** Evaluate the impact of caffeine on your sleep.\n*   **Action:**\n    *   **Caffeine Timing:** Cut off c

In [152]:
print(df['Response'].isnull().sum(),
      df['Response'].count())

0 1730


In [153]:
# Save the updated DataFrame to a new CSV file (index column omitted).
# The file name has nothing to interpolate, so it is a plain string literal.
df.to_csv("PH-LLM Custom Dataset.csv", index=False)
print("CSV file saved")

CSV file saved


# Load and verify custom dataset

In [154]:
check_df = pd.read_csv("/kaggle/input/sleep-and-fitness-dataset/PH-LLM Custom Dataset.csv", encoding="utf-8")

In [155]:
check_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  1730 non-null   object
 1   ID        1730 non-null   int64 
 2   Prompt    1730 non-null   object
 3   Response  1730 non-null   object
dtypes: int64(1), object(3)
memory usage: 54.2+ KB


In [157]:
check_df.isnull().sum()

Category    0
ID          0
Prompt      0
Response    0
dtype: int64

In [160]:
check_df.count()

Category    1730
ID          1730
Prompt      1730
Response    1730
dtype: int64