<a href="https://colab.research.google.com/github/itzayush21/Hack_to_the_future/blob/main/health_data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
import numpy as np

# Define the function to determine sub-categories
def determine_sub_categories(issue, row):
    # Default values
    severity = 'N/A'
    management = 'N/A'
    medication = 'N/A'

    # Health issue-specific logic
    if 'Cardiac Issue' in issue:
        if row['Blood_Pressure_Systolic'] > 140 or row['Heart_Rate_bpm'] > 90:
            severity = 'Severe'
        elif row['Blood_Pressure_Systolic'] > 130 or row['Heart_Rate_bpm'] > 80:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Lifestyle Changes', 'Medication', 'Surgery'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Liver Problem' in issue:
        if row['Liver_Function_AST'] > 45 or row['Liver_Function_ALT'] > 45:
            severity = 'Severe'
        elif row['Liver_Function_AST'] > 40 or row['Liver_Function_ALT'] > 40:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Lifestyle Changes', 'Medication', 'Surgery'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Mental Health Concern' in issue:
        if row['Mental_Health_Score'] < 2:
            severity = 'Severe'
        elif row['Mental_Health_Score'] < 4:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Therapy', 'Medication', 'Lifestyle Changes'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Diabetes' in issue:
        if row['Blood_Sugar_mg/dL'] > 160:
            severity = 'Uncontrolled'
        else:
            severity = 'Controlled'
        management = np.random.choice(['Diet', 'Medication', 'Insulin'])
        medication = np.random.choice(['None', 'OTC', 'Insulin'])

    elif 'Hypertension' in issue:
        if row['Blood_Pressure_Systolic'] >= 140 or row['Blood_Pressure_Diastolic'] >= 90:
            severity = 'Uncontrolled'
        else:
            severity = 'Controlled'
        management = np.random.choice(['Diet', 'Medication', 'Lifestyle Changes'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Obesity' in issue:
        if row['BMI'] >= 35:
            severity = 'Severe'
        elif row['BMI'] >= 30:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Diet', 'Exercise', 'Surgery'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Respiratory Issue' in issue:
        if row['Smoking_Habits'] == 'Regular smoker':
            severity = 'Severe'
        elif row['Alcohol_Consumption'] > 2:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Medication', 'Lifestyle Changes', 'Surgery'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Kidney Problem' in issue:
        if row['BMI'] >= 35 and row['Cholesterol_mg/dL'] > 240:
            severity = 'Severe'
        elif row['BMI'] >= 30 or row['Cholesterol_mg/dL'] > 200:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Medication', 'Dialysis', 'Surgery'])
        medication = np.random.choice(['None', 'OTC', 'Prescription'])

    elif 'Nutritional Deficiency' in issue:
        if row['Daily_Caloric_Intake'] < 1800:
            severity = 'Severe'
        elif row['Daily_Caloric_Intake'] < 2200:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Dietary Changes', 'Supplementation'])
        medication = np.random.choice(['None', 'OTC'])

    elif 'Low Physical Activity' in issue:
        if row['Daily_Steps'] < 3000:
            severity = 'Severe'
        elif row['Daily_Steps'] < 5000:
            severity = 'Moderate'
        else:
            severity = 'Mild'
        management = np.random.choice(['Exercise Program', 'Lifestyle Changes'])
        medication = np.random.choice(['None', 'OTC'])

    return severity, management, medication

# Generate synthetic data
def generate_synthetic_data(num_samples=30000):
    np.random.seed(24)

    # Generate random data
    data = {
        'User_ID': np.arange(1, num_samples + 1),
        'Age': np.random.randint(18, 80, num_samples),
        'Gender': np.random.choice(['Male', 'Female'], num_samples),
        'Height_cm': np.random.randint(150, 190, num_samples),
        'Weight_kg': np.random.randint(50, 120, num_samples),
        'Sleep_Hours': np.random.uniform(4, 10, num_samples),
        'Sleep_Quality': np.random.choice(['Poor', 'Average', 'Good'], num_samples),
        'Stress_Level': np.random.choice(['Low', 'Medium', 'High'], num_samples),
        'Work_Type': np.random.choice(['Sedentary', 'Active'], num_samples),
        'Alcohol_Consumption': np.random.uniform(0, 5, num_samples),
        'Smoking_Habits': np.random.choice(['Non-smoker', 'Occasional smoker', 'Regular smoker'], num_samples),
        'Daily_Caloric_Intake': np.random.randint(1500, 3500, num_samples),
        'Protein_Intake_g': np.random.randint(50, 150, num_samples),
        'Carb_Intake_g': np.random.randint(150, 400, num_samples),
        'Fat_Intake_g': np.random.randint(20, 100, num_samples),
        'Fruit_Veg_Intake': np.random.randint(0, 10, num_samples),
        'Daily_Steps': np.random.randint(2000, 15000, num_samples),
        'Exercise_Type': np.random.choice(['None', 'Cardio', 'Strength Training'], num_samples),
        'Exercise_Duration_min': np.random.randint(0, 120, num_samples),
        'Heart_Rate_bpm': np.random.randint(60, 100, num_samples),
        'Blood_Pressure_Systolic': np.random.randint(100, 160, num_samples),
        'Blood_Pressure_Diastolic': np.random.randint(60, 100, num_samples),
        'Blood_Sugar_mg/dL': np.random.randint(70, 150, num_samples),
        'Cholesterol_mg/dL': np.random.randint(150, 300, num_samples),
        'Liver_Function_AST': np.random.randint(10, 50, num_samples),
        'Liver_Function_ALT': np.random.randint(10, 50, num_samples),
        'Mental_Health_Score': np.random.uniform(1, 10, num_samples)
    }

    df = pd.DataFrame(data)

    # Calculate BMI
    df['BMI'] = df['Weight_kg'] / ((df['Height_cm'] / 100) ** 2)

    # Define health issue thresholds and rules
    def determine_health_issues(row):
        issues = []

        # Weight and BMI considerations
        if row['BMI'] >= 30:
            issues.append('Obesity')
        else:
            if row['Gender'] == 'Female' and row['Weight_kg'] > 80 and row['BMI'] >= 25:
                issues.append('Obesity')
            elif row['Gender'] == 'Male' and row['Weight_kg'] > 90 and row['BMI'] >= 25:
                issues.append('Obesity')


        # Cardiac Issue
        if row['Heart_Rate_bpm'] > 90 or row['Blood_Pressure_Systolic'] > 140 or row['Blood_Pressure_Diastolic'] > 90 or row['Cholesterol_mg/dL'] > 240:
            issues.append('Cardiac Issue')

        # Liver Problem
        if row['Liver_Function_AST'] > 45 or row['Liver_Function_ALT'] > 45:
            issues.append('Liver Problem')

        # Mental Health Concern
        if row['Mental_Health_Score'] < 3:
            issues.append('Mental Health Concern')
        if row['Stress_Level'] == 'High':
            issues.append('Mental Health Concern')
        if row['Sleep_Hours'] < 5 or row['Sleep_Quality'] == 'Poor':
            issues.append('Mental Health Concern')
        if row['Sleep_Hours'] < 6 and row['Heart_Rate_bpm'] > 80:
            issues.append('Cardiac Issue')

        # Diabetes
        if row['Blood_Sugar_mg/dL'] > 140:
            issues.append('Diabetes')

        # Hypertension
        if row['Blood_Pressure_Systolic'] >= 140 or row['Blood_Pressure_Diastolic'] >= 90:
            issues.append('Hypertension')

        # Respiratory Issue
        if row['Smoking_Habits'] in ['Regular smoker', 'Occasional smoker'] or row['Alcohol_Consumption'] > 2:
            issues.append('Respiratory Issue')

        # Kidney Problem
        if row['BMI'] >= 35 and row['Cholesterol_mg/dL'] > 240:
            issues.append('Kidney Problem')

        # Nutritional Deficiency
        if row['Daily_Caloric_Intake'] < 1800 and row['BMI'] >= 25:
            issues.append('Nutritional Deficiency')
        if row['Protein_Intake_g'] < 50 and row['BMI'] >= 25:
            issues.append('Nutritional Deficiency')
        if row['Carb_Intake_g'] < 100 and row['BMI'] >= 25:
            issues.append('Nutritional Deficiency')
        if row['Fat_Intake_g'] > 80 and row['BMI'] >= 25:
            issues.append('High Fat Intake')
        if row['Fruit_Veg_Intake'] < 3:
            issues.append('Nutritional Deficiency')
        if row['Daily_Steps'] < 5000:
            issues.append('Low Physical Activity')
        if row['Exercise_Type'] == 'None' and row['Exercise_Duration_min'] == 0:
            issues.append('Lack of Exercise')

        # Low Physical Activity
        if row['Daily_Steps'] < 5000:
            issues.append('Low Physical Activity')

        return ', '.join(issues) if issues else 'No issue'

    # Apply the function to determine health issues
    df['Health_Issue'] = df.apply(determine_health_issues, axis=1)

    # Apply sub-category function to create the Sub_Category, Severity, Management, and Medication columns
    df[['Severity', 'Management', 'Medication']] = df.apply(
        lambda row: pd.Series(determine_sub_categories(row['Health_Issue'], row)), axis=1
    )

    return df

# Generate synthetic data
df = generate_synthetic_data()

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,Smoking_Habits,Daily_Caloric_Intake,Protein_Intake_g,Carb_Intake_g,Fat_Intake_g,Fruit_Veg_Intake,Daily_Steps,Exercise_Type,Exercise_Duration_min,Heart_Rate_bpm,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Health_Issue,Severity,Management,Medication
0,1,52,Male,166,95,9.492258,Good,Low,Sedentary,1.958677,Regular smoker,1683,136,306,46,5,5663,,101,91,117,66,100,275,46,42,7.094642,34.47525,"Obesity, Cardiac Issue, Liver Problem, Respira...",Severe,Surgery,
1,2,21,Male,162,64,9.314001,Good,High,Active,3.096247,Non-smoker,1969,52,374,86,2,8107,,41,61,108,67,136,294,35,22,4.054235,24.386526,"Cardiac Issue, Mental Health Concern, Respirat...",Mild,Surgery,Prescription
2,3,18,Male,183,67,7.208673,Poor,Medium,Active,2.278278,Non-smoker,2658,124,326,85,6,4629,Strength Training,14,80,147,90,139,187,32,13,2.526225,20.006569,"Cardiac Issue, Mental Health Concern, Mental H...",Severe,Lifestyle Changes,OTC
3,4,41,Female,182,114,4.367803,Poor,Medium,Sedentary,4.565499,Non-smoker,2796,145,356,69,4,12096,Strength Training,14,70,122,88,114,289,17,22,1.804171,34.416133,"Obesity, Cardiac Issue, Mental Health Concern,...",Mild,Lifestyle Changes,OTC
4,5,35,Male,153,78,6.979355,Poor,High,Active,0.182537,Regular smoker,3162,106,383,85,6,7639,Cardio,41,81,125,61,117,257,13,47,7.703125,33.320518,"Obesity, Cardiac Issue, Liver Problem, Mental ...",Moderate,Medication,OTC


In [None]:
# prompt: data of health issue column=="no issue"

import pandas as pd
import numpy as np

# ... (Preceding code remains the same)

# Generate synthetic data
df = generate_synthetic_data()

# Filter the DataFrame for rows where Health_Issue is "No issue"
no_issue_df = df[df['Health_Issue'] == "No issue"]

# Display the filtered DataFrame
(no_issue_df)


Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,...,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Health_Issue,Severity,Management,Medication
56,57,59,Male,167,58,9.798018,Good,Low,Sedentary,0.463155,...,94,170,36,32,8.327042,20.79673,No issue,,,
251,252,72,Male,173,79,8.620824,Good,Low,Sedentary,0.984545,...,74,200,12,17,5.214548,26.395803,No issue,,,
2461,2462,62,Male,166,63,8.363707,Average,Low,Active,0.216769,...,136,225,27,19,9.979997,22.862534,No issue,,,
3143,3144,72,Male,189,63,8.788693,Good,Low,Active,0.147424,...,91,154,28,35,6.936623,17.636684,No issue,,,
4062,4063,50,Female,169,52,6.833753,Good,Low,Active,1.397784,...,123,187,16,15,5.83435,18.206645,No issue,,,
4365,4366,75,Female,170,66,6.314178,Average,Medium,Sedentary,0.420007,...,71,202,22,15,9.927686,22.83737,No issue,,,
5465,5466,60,Male,169,70,6.849572,Good,Low,Sedentary,1.283877,...,106,180,27,44,4.271476,24.508946,No issue,,,
7161,7162,56,Female,154,59,6.425599,Average,Low,Active,0.187699,...,122,204,20,27,6.142803,24.87772,No issue,,,
8056,8057,57,Male,170,65,8.053752,Good,Medium,Active,0.69469,...,86,225,36,30,7.733658,22.491349,No issue,,,
8309,8310,33,Male,186,51,6.673118,Average,Medium,Sedentary,0.584927,...,71,169,37,25,6.258395,14.741589,No issue,,,


In [33]:
# prompt: convert df to csv and save to files

# Save the DataFrame to a CSV file
df.to_csv('synthetic_health_data.csv', index=False)

# Optionally, you can save to other file formats like Excel
df.to_excel('synthetic_health_data.xlsx', index=False)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   User_ID                   30000 non-null  int64  
 1   Age                       30000 non-null  int64  
 2   Gender                    30000 non-null  object 
 3   Height_cm                 30000 non-null  int64  
 4   Weight_kg                 30000 non-null  int64  
 5   Sleep_Hours               30000 non-null  float64
 6   Sleep_Quality             30000 non-null  object 
 7   Stress_Level              30000 non-null  object 
 8   Work_Type                 30000 non-null  object 
 9   Alcohol_Consumption       30000 non-null  float64
 10  Smoking_Habits            30000 non-null  object 
 11  Daily_Caloric_Intake      30000 non-null  int64  
 12  Protein_Intake_g          30000 non-null  int64  
 13  Carb_Intake_g             30000 non-null  int64  
 14  Fat_In

In [None]:
df.head()

Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,...,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Health_Issue,Severity,Management,Medication
0,1,52,Male,166,95,9.492258,Good,Low,Sedentary,1.958677,...,100,275,46,42,7.094642,34.47525,"Obesity, Cardiac Issue, Liver Problem, Respira...",Severe,Surgery,
1,2,21,Male,162,64,9.314001,Good,High,Active,3.096247,...,136,294,35,22,4.054235,24.386526,"Cardiac Issue, Mental Health Concern, Respirat...",Mild,Surgery,Prescription
2,3,18,Male,183,67,7.208673,Poor,Medium,Active,2.278278,...,139,187,32,13,2.526225,20.006569,"Cardiac Issue, Mental Health Concern, Mental H...",Severe,Lifestyle Changes,OTC
3,4,41,Female,182,114,4.367803,Poor,Medium,Sedentary,4.565499,...,114,289,17,22,1.804171,34.416133,"Obesity, Cardiac Issue, Mental Health Concern,...",Mild,Lifestyle Changes,OTC
4,5,35,Male,153,78,6.979355,Poor,High,Active,0.182537,...,117,257,13,47,7.703125,33.320518,"Obesity, Cardiac Issue, Liver Problem, Mental ...",Moderate,Medication,OTC


In [20]:
unique_states = df['Health_Issue'].unique()

# Create a dictionary to map states to sequential numerical values
state_to_index = {}
for i, state in enumerate(unique_states):
  state_to_index[state] = i

# Replace state names with numerical values in the DataFrame
print(state_to_index)

{'Obesity, Cardiac Issue, Liver Problem, Respiratory Issue, Nutritional Deficiency': 0, 'Cardiac Issue, Mental Health Concern, Respiratory Issue, Nutritional Deficiency': 1, 'Cardiac Issue, Mental Health Concern, Mental Health Concern, Hypertension, Respiratory Issue, Low Physical Activity, Low Physical Activity': 2, 'Obesity, Cardiac Issue, Mental Health Concern, Mental Health Concern, Respiratory Issue': 3, 'Obesity, Cardiac Issue, Liver Problem, Mental Health Concern, Mental Health Concern, Respiratory Issue, High Fat Intake': 4, 'Cardiac Issue, Mental Health Concern, Mental Health Concern, Respiratory Issue': 5, 'Cardiac Issue, Mental Health Concern, Mental Health Concern, Cardiac Issue, Hypertension, Respiratory Issue': 6, 'Liver Problem, Respiratory Issue': 7, 'Obesity, Cardiac Issue, Liver Problem, Mental Health Concern, Respiratory Issue': 8, 'Obesity, Cardiac Issue, Hypertension, Respiratory Issue': 9, 'Obesity, Respiratory Issue, Nutritional Deficiency': 10, 'Cardiac Issue, R

In [34]:
# prompt: remove health issue and make diiferent column for diiferent health_issue and if the health_issue is present encode them 1 else 0

# Create a list of unique health issues
unique_issues = []
for index, row in df.iterrows():
    issues = row['Health_Issue'].split(', ')
    for issue in issues:
        if issue not in unique_issues:
            unique_issues.append(issue)

# Create new columns for each health issue and initialize to 0
for issue in unique_issues:
    df[issue.replace(' ', '_')] = 0

# Set the value to 1 if the health issue is present for a user
for index, row in df.iterrows():
    issues = row['Health_Issue'].split(', ')
    for issue in issues:
        df.loc[index, issue.replace(' ', '_')] = 1

# Drop the original 'Health_Issue' column
df = df.drop('Health_Issue', axis=1)

# Display the first few rows of the updated DataFrame
df.head()


Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,Smoking_Habits,Daily_Caloric_Intake,Protein_Intake_g,Carb_Intake_g,Fat_Intake_g,Fruit_Veg_Intake,Daily_Steps,Exercise_Type,Exercise_Duration_min,Heart_Rate_bpm,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Severity,Management,Medication,Obesity,Cardiac_Issue,Liver_Problem,Respiratory_Issue,Nutritional_Deficiency,Mental_Health_Concern,Hypertension,Low_Physical_Activity,High_Fat_Intake,Diabetes,Kidney_Problem,Lack_of_Exercise,No_issue
0,1,52,Male,166,95,9.492258,Good,Low,Sedentary,1.958677,Regular smoker,1683,136,306,46,5,5663,,101,91,117,66,100,275,46,42,7.094642,34.47525,Severe,Surgery,,1,1,1,1,1,0,0,0,0,0,0,0,0
1,2,21,Male,162,64,9.314001,Good,High,Active,3.096247,Non-smoker,1969,52,374,86,2,8107,,41,61,108,67,136,294,35,22,4.054235,24.386526,Mild,Surgery,Prescription,0,1,0,1,1,1,0,0,0,0,0,0,0
2,3,18,Male,183,67,7.208673,Poor,Medium,Active,2.278278,Non-smoker,2658,124,326,85,6,4629,Strength Training,14,80,147,90,139,187,32,13,2.526225,20.006569,Severe,Lifestyle Changes,OTC,0,1,0,1,0,1,1,1,0,0,0,0,0
3,4,41,Female,182,114,4.367803,Poor,Medium,Sedentary,4.565499,Non-smoker,2796,145,356,69,4,12096,Strength Training,14,70,122,88,114,289,17,22,1.804171,34.416133,Mild,Lifestyle Changes,OTC,1,1,0,1,0,1,0,0,0,0,0,0,0
4,5,35,Male,153,78,6.979355,Poor,High,Active,0.182537,Regular smoker,3162,106,383,85,6,7639,Cardio,41,81,125,61,117,257,13,47,7.703125,33.320518,Moderate,Medication,OTC,1,1,1,1,0,1,0,0,1,0,0,0,0


In [35]:
# prompt:  determine_sub_categories using this function write add appropaiate Severity	Management	Medication column

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# ... (Preceding code remains the same)

# Apply sub-category function to create the Sub_Category, Severity, Management, and Medication columns
df[['Severity', 'Management', 'Medication']] = df.apply(
    lambda row: pd.Series(determine_sub_categories(row.to_dict(), row)), axis=1
)

# Display the first few rows of the updated DataFrame with sub-categories
df[567:987]


Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,Smoking_Habits,Daily_Caloric_Intake,Protein_Intake_g,Carb_Intake_g,Fat_Intake_g,Fruit_Veg_Intake,Daily_Steps,Exercise_Type,Exercise_Duration_min,Heart_Rate_bpm,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Severity,Management,Medication,Obesity,Cardiac_Issue,Liver_Problem,Respiratory_Issue,Nutritional_Deficiency,Mental_Health_Concern,Hypertension,Low_Physical_Activity,High_Fat_Intake,Diabetes,Kidney_Problem,Lack_of_Exercise,No_issue
567,568,52,Female,157,69,7.668773,Average,Medium,Active,0.407695,Occasional smoker,2276,60,384,36,1,4870,Strength Training,30,99,134,91,79,163,14,39,5.953337,27.993022,Controlled,Diet,Insulin,0,1,0,1,1,0,1,1,0,0,0,0,0
568,569,72,Female,185,62,7.098464,Good,Medium,Active,0.420946,Regular smoker,2794,55,213,91,9,11019,Cardio,52,71,112,76,76,297,14,40,3.103516,18.115413,Controlled,Diet,OTC,0,1,0,1,0,0,0,0,0,0,0,0,0
569,570,19,Female,158,82,6.922665,Good,Low,Active,1.665052,Regular smoker,2156,103,389,62,6,8397,Strength Training,61,96,145,75,135,174,29,42,1.086360,32.847300,Controlled,Diet,,1,1,0,1,0,1,1,0,0,0,0,0,0
570,571,33,Female,165,99,8.616961,Average,High,Sedentary,3.882334,Non-smoker,2585,59,257,95,5,10639,,33,72,147,60,85,172,27,32,8.293781,36.363636,Controlled,Insulin,OTC,1,1,0,1,0,1,1,0,1,0,0,0,0
571,572,39,Female,158,93,7.349945,Poor,Medium,Active,3.780053,Regular smoker,2100,58,299,27,3,7659,Cardio,25,95,110,81,75,284,28,35,9.426682,37.253645,Controlled,Diet,,1,1,0,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,983,20,Male,155,68,8.315936,Average,High,Sedentary,3.250397,Occasional smoker,2075,145,235,26,9,11946,,58,95,132,64,142,225,18,36,5.549986,28.303850,Controlled,Diet,,0,1,0,1,0,1,0,0,0,1,0,0,0
983,984,36,Male,153,105,6.878943,Average,Medium,Sedentary,2.555431,Regular smoker,1561,133,263,84,7,8647,Strength Training,87,76,145,84,72,150,32,42,2.710534,44.854543,Controlled,Medication,Insulin,1,1,0,1,1,1,1,0,1,0,0,0,0
984,985,32,Male,174,96,8.955799,Average,Low,Sedentary,3.377431,Occasional smoker,2036,79,392,70,7,6397,Strength Training,48,85,115,97,142,231,27,43,7.154132,31.708284,Controlled,Medication,Insulin,1,1,0,1,0,0,1,0,0,1,0,0,0
985,986,77,Female,162,61,6.944993,Good,Medium,Sedentary,1.847467,Regular smoker,2726,106,157,80,5,10247,Strength Training,57,83,128,95,127,157,28,13,3.979467,23.243408,Controlled,Medication,OTC,0,1,0,1,0,0,1,0,0,0,0,0,0


In [40]:


df = df.drop('Severity', axis=1)
df.head()


Unnamed: 0,User_ID,Age,Gender,Height_cm,Weight_kg,Sleep_Hours,Sleep_Quality,Stress_Level,Work_Type,Alcohol_Consumption,Smoking_Habits,Daily_Caloric_Intake,Protein_Intake_g,Carb_Intake_g,Fat_Intake_g,Fruit_Veg_Intake,Daily_Steps,Exercise_Type,Exercise_Duration_min,Heart_Rate_bpm,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Blood_Sugar_mg/dL,Cholesterol_mg/dL,Liver_Function_AST,Liver_Function_ALT,Mental_Health_Score,BMI,Obesity,Cardiac_Issue,Liver_Problem,Respiratory_Issue,Nutritional_Deficiency,Mental_Health_Concern,Hypertension,Low_Physical_Activity,High_Fat_Intake,Diabetes,Kidney_Problem,Lack_of_Exercise,No_issue
0,1,52,Male,166,95,9.492258,Good,Low,Sedentary,1.958677,Regular smoker,1683,136,306,46,5,5663,,101,91,117,66,100,275,46,42,7.094642,34.47525,1,1,1,1,1,0,0,0,0,0,0,0,0
1,2,21,Male,162,64,9.314001,Good,High,Active,3.096247,Non-smoker,1969,52,374,86,2,8107,,41,61,108,67,136,294,35,22,4.054235,24.386526,0,1,0,1,1,1,0,0,0,0,0,0,0
2,3,18,Male,183,67,7.208673,Poor,Medium,Active,2.278278,Non-smoker,2658,124,326,85,6,4629,Strength Training,14,80,147,90,139,187,32,13,2.526225,20.006569,0,1,0,1,0,1,1,1,0,0,0,0,0
3,4,41,Female,182,114,4.367803,Poor,Medium,Sedentary,4.565499,Non-smoker,2796,145,356,69,4,12096,Strength Training,14,70,122,88,114,289,17,22,1.804171,34.416133,1,1,0,1,0,1,0,0,0,0,0,0,0
4,5,35,Male,153,78,6.979355,Poor,High,Active,0.182537,Regular smoker,3162,106,383,85,6,7639,Cardio,41,81,125,61,117,257,13,47,7.703125,33.320518,1,1,1,1,0,1,0,0,1,0,0,0,0


In [42]:
def record(col):
  unique_states = df[col].unique()

# Create a dictionary to map states to sequential numerical values
  state_to_index = {}
  for i, state in enumerate(unique_states):
    state_to_index[state] = i
  return state_to_index
# Replace state names with numerical values in the DataFrame
g=record('Gender')
s=record('Sleep_Quality')
l=record('Stress_Level')
w=record('Work_Type')
e=record('Exercise_Type')
sm=record('Smoking_Habits')
print(g)
print(s)
print(l)
print(w)
print(e)
print(sm)


{'Male': 0, 'Female': 1}
{'Good': 0, 'Poor': 1, 'Average': 2}
{'Low': 0, 'High': 1, 'Medium': 2}
{'Sedentary': 0, 'Active': 1}
{'None': 0, 'Strength Training': 1, 'Cardio': 2}
{'Regular smoker': 0, 'Non-smoker': 1, 'Occasional smoker': 2}


In [43]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [45]:
cols_to_encode = ['Gender', 'Sleep_Quality', 'Stress_Level', 'Work_Type', 'Exercise_Type', 'Smoking_Habits']
for col in cols_to_encode:
  le = LabelEncoder()
  df[col] = le.fit_transform(df[col])

In [56]:
df.to_csv('synthetic_health_data2.csv', index=False)


In [48]:
# prompt: split the x from age to bmi and y from obesity to no_issue

# Assuming 'Obesity' to 'No_issue' are the target variables and the rest are features
X = df.loc[:, 'Age':'BMI']
y = df.loc[:, 'Obesity':'No_issue']
print(X)


       Age  Gender  Height_cm  Weight_kg  Sleep_Hours  Sleep_Quality  \
0       52       1        166         95     9.492258              1   
1       21       1        162         64     9.314001              1   
2       18       1        183         67     7.208673              2   
3       41       0        182        114     4.367803              2   
4       35       1        153         78     6.979355              2   
...    ...     ...        ...        ...          ...            ...   
29995   71       1        176        110     5.816560              1   
29996   23       1        170        113     4.130872              0   
29997   31       0        174         74     8.559954              1   
29998   59       1        172         89     8.879139              0   
29999   67       0        170         91     4.129817              0   

       Stress_Level  Work_Type  Alcohol_Consumption  Smoking_Habits  \
0                 1          1             1.958677             

In [49]:
print(y)

       Obesity  Cardiac_Issue  Liver_Problem  Respiratory_Issue  \
0            1              1              1                  1   
1            0              1              0                  1   
2            0              1              0                  1   
3            1              1              0                  1   
4            1              1              1                  1   
...        ...            ...            ...                ...   
29995        1              1              1                  1   
29996        1              1              0                  1   
29997        0              1              0                  1   
29998        1              1              0                  1   
29999        1              1              0                  1   

       Nutritional_Deficiency  Mental_Health_Concern  Hypertension  \
0                           1                      0             0   
1                           1                      1   

In [None]:
n

In [50]:


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8506666666666667


In [54]:


# Assuming you want to convert the DataFrame 'df' into a list of lists
simple_list = df.values.tolist()

# Print the first few elements of the simple list
print(simple_list[:5])


In [65]:

x=np.array(['30', 1, '165', '70', '7', 0, 2, 1,2, 1, '2500', '100', '300', '70', 5, '800', 2, '30', '72', '10', '80', '90', '190', '25', '300', '8', '25.7'])
x=np.array(x)
x=x.reshape(1,-1)
print(x)
res=clf.predict(x)
print(res)


[['30' '1' '165' '70' '7' '0' '2' '1' '2' '1' '2500' '100' '300' '70' '5'
  '800' '2' '30' '72' '10' '80' '90' '190' '25' '300' '8' '25.7']]
[[0 0 1 1 0 0 0 1 0 0 0 0 0]]


