In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
import seaborn as sns 
import matplotlib.pyplot as plt



Repository: https://github.com/jstraker1/datasummative_2.git

## 1. Describe the Dataset

In [15]:
# Load dataset
df = pd.read_csv("dirty_v3_path.csv")

# Display first few rows
df.head()

Unnamed: 0,Age,Gender,Medical Condition,Glucose,Blood Pressure,BMI,Oxygen Saturation,LengthOfStay,Cholesterol,Triglycerides,HbA1c,Smoking,Alcohol,Physical Activity,Diet Score,Family History,Stress Level,Sleep Hours,random_notes,noise_col
0,46.0,Male,Diabetes,137.04,135.27,28.9,96.04,6,231.88,210.56,7.61,0,0,-0.2,3.54,0,5.07,6.05,lorem,-137.057211
1,22.0,Male,Healthy,71.58,113.27,26.29,97.54,2,165.57,129.41,4.91,0,0,8.12,5.9,0,5.87,7.72,ipsum,-11.23061
2,50.0,,Asthma,95.24,,22.53,90.31,2,214.94,165.35,5.6,0,0,5.01,4.65,1,3.09,4.82,ipsum,98.331195
3,57.0,,Obesity,,130.53,38.47,96.6,5,197.71,182.13,6.92,0,0,3.16,3.37,0,3.01,5.33,lorem,44.187175
4,66.0,Female,Hypertension,95.15,178.17,31.12,94.9,4,259.53,115.85,5.98,0,1,3.56,3.4,0,6.38,6.64,lorem,44.831426


In [16]:
# Summarise the dataset
#df.info()
#df.describe()

# .info() - summary of structure, data types, missing values
# .describe() - summary of numerical values and statistics

In [17]:
# Set unnecessary column to be removed
unnecessary_cols = [ "random_notes" , "noise_col" ]

# Remove columns that are not necessary
df.drop(columns=unnecessary_cols, inplace=True)

In [18]:
# Rename columns
df.rename(columns={

    "Age": "age",
    "Gender": "gender",
    "Medical Condition": "medical_condition",
    "Glucose": "glucose",
    "Blood Pressure": "blood_pressure",
    "BMI": "bmi",
    "Oxygen Saturation": "oxygen_saturation",
    "LengthOfStay": "length_of_stay",
    "Cholesterol": "cholesterol",
    "Triglycerides": "triglycerides",
    "HbA1c": "hba1c",
    "Smoking": "smoking_status",
    "Alcohol": "alcohol_use",
    "Physical Activity": "physical_activity",
    "Diet Score": "diet_score",
    "Family History": "family_history",
    "Stress Level": "stress_level",
    "Sleep Hours": "sleep_hours"

}, inplace=True)

# Print updated columns to check renaming has been done correctly (no typos)
print(df.columns)

Index(['age', 'gender', 'medical_condition', 'glucose', 'blood_pressure',
       'bmi', 'oxygen_saturation', 'length_of_stay', 'cholesterol',
       'triglycerides', 'hba1c', 'smoking_status', 'alcohol_use',
       'physical_activity', 'diet_score', 'family_history', 'stress_level',
       'sleep_hours'],
      dtype='object')


In [19]:
# Glucose conversion
# Molecular weight for glucose
MW_glucose = 180.156

# Convert glucose from mg/dL to mmol/L
df['glucose'] = (df['glucose'] * 10) / MW_glucose

# Display first few rows to check conversion
df['glucose'].head()

0    7.606741
1    3.973223
2    5.286529
3         NaN
4    5.281534
Name: glucose, dtype: float64

In [20]:
# Cholesterol conversion
# Molecular weight for cholesterol
MW_cholesterol = 386.65

# Convert cholesterol from mg/dL to mmol/L
df['cholesterol'] = (df['cholesterol'] * 10) / MW_cholesterol

# Display first few rows to check conversion
df['cholesterol'].head()

0    5.997155
1    4.282167
2    5.559033
3    5.113410
4    6.712272
Name: cholesterol, dtype: float64

In [21]:
# Triglycerides conversion
# Molecular weight for triglycerides
MW_triglycerides = 885.7

# Convert triglycerides from mg/dL to mmol/L
df['triglycerides'] = (df['triglycerides'] * 10) / MW_triglycerides

# Display first few rows to check conversion
df['triglycerides'].head()

0    2.377329
1    1.461104
2    1.866885
3    2.056340
4    1.308005
Name: triglycerides, dtype: float64

In [22]:
# HbA1c conversion
# Convert HbA1c from % to mmol/mol
df['hba1c'] = (df['hba1c'] * 10.929) - 23.5

# Display first few rows to check conversion
df['hba1c'].head()

0    59.66969
1    30.16139
2    37.70240
3    52.12868
4    41.85542
Name: hba1c, dtype: float64

In [23]:
# Handle missing values in the gender column by assigning them to an Unknown category
df['gender'] = df['gender'].fillna('Unknown')

# Check that there are no more missing values in gender
df['gender'].isnull().sum()

np.int64(0)

In [24]:
# Handle missing values in the medical_condition column by assigning them to an Unknown category
df['medical_condition'] = df['medical_condition'].fillna('Unknown')

# Check that there are no more missing values in medical condition
df['medical_condition'].isnull().sum()

np.int64(0)

In [25]:
# I am selecting all continuous numeric variables with no missing values to help predict age, glucose and blood_pressure
# This helps the regression model use as much complete numeric information as possible
# It avoids predictors that also contain missing values
from sklearn.linear_model import LinearRegression
predictor_cols = [
    'bmi',
    'oxygen_saturation',
    'length_of_stay',
    'cholesterol',
    'triglycerides',
    'hba1c',
    'physical_activity',
    'diet_score',
    'family_history',
    'stress_level',
    'sleep_hours'
]

In [28]:
# Impute missing values in target using linear regression on predictors

# Define the function
def regression_impute(df, target, predictors, add_flag=True):
        # This makes a reusable function for regression imputation
        # df: dataframe containing the data
        # target: the column to be imputed
        # predictors: list of columns to use as predictors
        # add_flag: whether to add a flag column indicating imputed rows

    # Define rows with and without missing target values
    not_null = df[df[target].notnull()]
    null = df[df[target].isnull()]
        # non_null: rows where target is not missing
        # null: rows where target is missing

     # If no missing values, create a 0-only flag and stop the function as no imputation is needed
    if null.empty:
        if add_flag and f"{target}_imputed" not in df.columns:
            df[f"{target}_imputed"] = 0
        return None
    
    # Set up training data
    X_train = not_null[predictors]
    y_train = not_null[target]
        # X_train = predictor columns (BMI, cholesterol, sleep_hours, etc.)
        # y_train = the real observed values of the target column (age, glucose, or blood_pressure)

    # Select rows requiring imputation
    X_test = null[predictors]

    # Fit regression model
    cleaning_model = LinearRegression()
    cleaning_model.fit(X_train, y_train)
        # Creates a linear regression object and fits it to the complete cases

     # Predict missing values
    preds = cleaning_model.predict(X_test)
        # This generates predicted values for rows where the target variable was missing

    # Fill in missing values in the original dataframe
    df.loc[df[target].isnull(), target] = preds
        # This assigns the predicted values back into the original dataframe

     # Add imputation flag
    if add_flag:
        flag_name = f"{target}_imputed"
        df[flag_name] = 0
        df.loc[null.index, flag_name] = 1
            # This creates a new column indicating which rows were imputed
            # It does this by creating a new column, setting all rows to 0 initially, then setting rows that were imputed to 1
            # This flag allows us to later check the distributions of imputed vs observed values

        return cleaning_model

In [29]:
# create a dictionary to store models if we want to inspect them later
imputation_models = {}

# Start a loop running 3 times, once for each of the three variables that need imputation
for target in ['age', 'glucose', 'blood_pressure']:
    # Run the regression imputation function for each variable
    model = regression_impute(df, target, predictor_cols, add_flag=True)
        # df: dataset
        # target: the variable currently being imputed
        # predictor_cols: the list of complete predictor variables
        # add_flag=True: tells the function to create the *_imputed columns
    # Save the fitted model into the dictionary
    imputation_models[target] = model

## 2. Test H1. Examine how disease risk changes with age 

**Multinomial logistic regression**

In [30]:
from sklearn.preprocessing import LabelEncoder

# adding code labels for each medical condition, e.g. diabetes = 0 , hypertension = 1 , obesity = 2 ...
le = LabelEncoder()

df['condition_label'] = le.fit_transform(df['medical_condition'])
df['condition_label'].head()

0    3
1    4
2    1
3    6
4    5
Name: condition_label, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split
x= df['age']
y=df['condition_label']

# splitting data into training data and test data
x_train , x_test, y_train, y_test = train_test_split(x , y, random_state=42 , test_size =0.2 , stratify = y) 

# reshaping data so that it is compatible with sklearn
x_train = np.array(x_train).reshape(-1,1)
y_train = np.array(y_train).reshape(-1,1)
x_test = np.array(x_test).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, metrics

# creating and training the model
log_model = LogisticRegression(multi_class = 'multinomial', solver = 'lbfgs' ,  max_iter = 10000, random_state = 42)
log_model.fit(x_train , y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,10000


In [33]:
# making predictions 
y_pred = log_model.predict(x_test)

# probability for each health condition
y_proba = log_model.predict_proba(x_test)

# evaluating the model 
print(f"Logistic Regression model accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.2f}%")

Logistic Regression model accuracy: 30.27%


**Simple logistic regression models**

## Test H2. Identify whether gender affects disease risk 

## Test H3. Analyse how lifestyle factors influence medical risk markers 

## Test H4. Identify which medical risk factors predict each disease 

## Compare predictive power of lifestyle only models and medical inclusive models 

## Evaluate the overall pathway from lifestyle to medical markers to disease 

## Build a simple classifier that can diagnose certain conditions 

## Build Shiny app for clinicians 

## Discuss limitations and validity