In [1]:
import pandas as pd
import numpy as np

In [13]:
# Size of the simulated cohort (one row per patient)
n_records = 1_000

# Fix NumPy's global RNG seed so repeated runs draw the same sample
np.random.seed(42)

# Function to adjust risk factors based on age and diet
def adjust_risk_factors(age, diet):
    """Randomly assign hypertension and diabetes status for one patient.

    Both probabilities grow with the patient's age decade (``age // 10``).
    A 'High Fat' diet forces hypertension to 'Yes' regardless of the draw.

    Args:
        age: Patient age; only its decade (``age // 10``) is used.
        diet: Diet label; only the value 'High Fat' affects the outcome.

    Returns:
        Tuple ``(hypertension, diabetes)``, each either 'Yes' or 'No'.
    """
    decade = age // 10

    # The hypertension draw is consumed even for high-fat diets, so each
    # call advances the global RNG stream by exactly two values.
    hypertension_draw = np.random.rand()
    hypertension_threshold = 0.3 + 0.05 * decade
    if hypertension_draw < hypertension_threshold or diet == 'High Fat':
        hypertension = 'Yes'
    else:
        hypertension = 'No'

    diabetes_draw = np.random.rand()
    if diabetes_draw < 0.2 + 0.03 * decade:
        diabetes = 'Yes'
    else:
        diabetes = 'No'

    return hypertension, diabetes

# Function to calculate BMI and BMI category
def calculate_bmi(weight, height):
    """Compute the body-mass index and its weight category.

    Args:
        weight: Body weight in kilograms.
        height: Body height in centimetres (converted to metres here).

    Returns:
        Tuple ``(bmi, category)`` where ``bmi`` is rounded to one decimal
        place and ``category`` is one of 'Underweight', 'Normal weight',
        'Overweight' or 'Obesity'.
    """
    bmi = weight / (height / 100) ** 2

    # Categories are decided on the unrounded value. The overweight band
    # starts at 25: a cut-off of 24.9 would label BMIs in [24.9, 25) as
    # 'Overweight' even though they belong to the normal range.
    if bmi < 18.5:
        category = 'Underweight'
    elif bmi < 25:
        category = 'Normal weight'
    elif bmi < 30:
        category = 'Overweight'
    else:
        category = 'Obesity'
    return round(bmi, 1), category

# Simulating the data
# Independent demographic and lifestyle columns are drawn first.
data = {
    "Patient_ID": np.arange(1, n_records + 1),
    "Age": np.random.randint(18, 90, size=n_records),
    "Gender": np.random.choice(['Male', 'Female'], size=n_records),
    "Diet_Type": np.random.choice(['High Fat', 'Balanced', 'Vegetarian'], size=n_records),
    "Smoking_Status": np.random.choice(['Smoker', 'Non-smoker'], size=n_records),
    "Physical_Activity_Level": np.random.choice(['Low', 'Moderate', 'High'], size=n_records),
    "Family_History_of_Stroke": np.random.choice(['Yes', 'No'], size=n_records),
}

# Hypertension and diabetes depend on Age and Diet_Type, so they must be
# derived after `data` exists: referencing `data[...]` inside the literal
# itself raises NameError on a fresh kernel (or silently reads a stale
# `data` left over from an earlier run). The two passes and their position
# between the lifestyle and body-measurement columns keep both the column
# order and the sequence of random draws of the original cell.
data["Hypertension"] = [
    adjust_risk_factors(age, diet)[0]
    for age, diet in zip(data["Age"], data["Diet_Type"])
]
data["Diabetes"] = [
    adjust_risk_factors(age, diet)[1]
    for age, diet in zip(data["Age"], data["Diet_Type"])
]

# Body measurements and region, truncated to whole numbers where numeric.
data["Height_cm"] = np.random.normal(165, 10, size=n_records).astype(int)  # Normal distribution around average height
data["Weight_kg"] = np.random.normal(70, 15, size=n_records).astype(int)  # Normal distribution around average weight
data["Region"] = np.random.choice(['Urban', 'Rural'], size=n_records)  # Only Urban and Rural

# Derive (BMI, category) for every patient from weight and height
bmi_values = list(map(calculate_bmi, data['Weight_kg'], data['Height_cm']))

# Transpose the list of pairs into two parallel columns
bmi_column, category_column = zip(*bmi_values)
data['BMI'] = bmi_column
data['BMI_Category'] = category_column

# Flag a patient as at risk (1) when at least one risk condition holds
def _has_risk_condition(age, hypertension, diabetes, bmi):
    """Return a truthy value if any single stroke-risk condition is met."""
    return (
        age > 65
        or hypertension == 'Yes'
        or diabetes == 'Yes'
        or bmi > 25
    )


data['Stroke_Risk'] = [
    1 if _has_risk_condition(*patient) else 0
    for patient in zip(data['Age'], data['Hypertension'], data['Diabetes'], data['BMI'])
]

# Assemble the column dictionary into a tabular DataFrame
stroke_df = pd.DataFrame(data=data)

# Preview the first five rows (rendered as the notebook cell output)
stroke_df.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Diet_Type,Smoking_Status,Physical_Activity_Level,Family_History_of_Stroke,Hypertension,Diabetes,Height_cm,Weight_kg,Region,BMI,BMI_Category,Stroke_Risk
0,1,69,Male,Balanced,Non-smoker,Moderate,Yes,No,No,158,90,Urban,36.1,Obesity,1
1,2,32,Male,Vegetarian,Non-smoker,High,Yes,Yes,Yes,146,85,Rural,39.9,Obesity,1
2,3,89,Female,High Fat,Non-smoker,High,Yes,Yes,No,171,49,Rural,16.8,Underweight,1
3,4,78,Male,Balanced,Non-smoker,High,Yes,Yes,No,148,58,Urban,26.5,Overweight,1
4,5,38,Male,Vegetarian,Smoker,Moderate,No,No,No,157,81,Rural,32.9,Obesity,1


In [10]:
# Inspect the dataset dimensions as (rows, columns)
stroke_df.shape

(1000, 15)

In [11]:
# Write the simulated stroke dataset to the data folder, leaving out the
# DataFrame index so the file contains only the data columns
output_path = '../data/Simulated_Stroke_Data.csv'
stroke_df.to_csv(output_path, index=False)

## Scientific Evidence for Stroke Risk Factors

1. **Age**:
   - **Evidence**: Older age is a well-documented risk factor for stroke. The risk of having a stroke approximately doubles every decade after age 55.
   - **Source**: American Stroke Association.

2. **Gender**:
   - **Evidence**: Men have a higher risk of stroke than women in most age groups, although women are more likely to die from a stroke.
   - **Source**: Centers for Disease Control and Prevention (CDC).

3. **Diet Type**:
   - **Evidence**: Diets high in fats and low in fruits and vegetables are associated with an increased stroke risk, while balanced and vegetarian diets are generally protective.
   - **Source**: A study from "Neurology" (the medical journal of the American Academy of Neurology).

4. **Smoking Status**:
   - **Evidence**: Smoking accelerates clot formation, thickens blood, and increases the amount of plaque buildup in arteries, all of which increase the risk of stroke.
   - **Source**: National Stroke Association.

5. **Physical Activity Level**:
   - **Evidence**: Regular physical activity can lower blood pressure, improve cholesterol levels, and enhance overall cardiovascular health, reducing stroke risk.
   - **Source**: American Heart Association.

6. **Family History of Stroke**:
   - **Evidence**: A family history of stroke increases an individual's risk, suggesting that genetic factors contribute to stroke risk.
   - **Source**: Stroke Association, UK.

7. **Hypertension**:
   - **Evidence**: High blood pressure is the leading cause of stroke and is the most significant controllable risk factor.
   - **Source**: American Stroke Association.

8. **Diabetes**:
   - **Evidence**: Diabetes is a significant risk factor for stroke. Diabetes complicates blood flow and leads to harmful clot formation.
   - **Source**: American Diabetes Association.

9. **Cholesterol Levels**:
   - **Evidence**: High cholesterol contributes to plaque buildup in arteries, increasing the risk of a clot forming and causing a stroke. Note that the simulated dataset in this notebook does not include a cholesterol column.
   - **Source**: American Heart Association.

10. **BMI**:
    - **Evidence**: High body mass index (BMI) is associated with increased risk of high cholesterol, high blood pressure, and diabetes, which are risk factors for stroke.
    - **Source**: Obesity and Stroke Study, American Stroke Association.

11. **Height and Weight**:
    - **Evidence**: Height and weight relate to stroke risk indirectly, through their use in calculating BMI.
    - **Source**: General medical consensus on the relationship between obesity and health risks.

12. **Region** (Urban/Rural):
    - **Evidence**: Urban environments may have higher pollution levels, which can affect health, but they also typically have better healthcare access. Rural areas might have less pollution but poorer access to immediate stroke care.
    - **Source**: Studies on urban vs. rural health outcomes, such as those published in the Journal of Public Health.
