<a href="https://colab.research.google.com/github/hazelchiang2102/ADALL_github/blob/main/ADALL_Project_Draft_2401.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core libraries
import pandas as pd
import numpy as np
# Visualisation
import matplotlib.pyplot as plt
# Modelling and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [None]:
# Raw GitHub URL of the CSV to load; replace it with the raw URL of your own file if needed.
github_raw_url = 'https://raw.githubusercontent.com/hazelchiang2102/ADALL_github/refs/heads/main/diabetes_binary_health_indicators_BRFSS2015.csv'
try:
    df = pd.read_csv(github_raw_url)
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure the URL is correct and the file format is compatible with `pd.read_csv`.")
    # Re-raise after the hint: swallowing the error would leave `df` undefined (or stale
    # from an earlier run), and every later cell would then fail or silently use old data.
    raise
else:
    # Only report success and preview the frame once the read has actually succeeded.
    print("Successfully loaded data from GitHub!")
    display(df.head())

Successfully loaded data from GitHub!


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [None]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [None]:
from google.colab import userdata
from openai import OpenAI

# Read the OpenAI credential from the Colab Secrets store (never hard-code it in the notebook),
# then build the API client that the later LLM cells reuse.
api_key = userdata.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

In [None]:
# Render the first ten rows as plain text so the preview can be embedded in an LLM prompt later.
data_preview = df.head(10).to_string()
print(data_preview)

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  Income
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0                   0.0           0.0     0.0      1.0                0.0            1.0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0     3.0
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0                   0.0           1.0     0.0      0.0                0.0            0.0          1.0      3.0       0.0       0.0       0.0  0.0   7.0        6.0     1.0
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0                   0.0           0.0     1.0      0.0                0.0            1.0          1.0      5.0      30.0      30.0       1.0  0.0   9.0        4.0     8.0
3              0.0     1.0       0.0        

In [None]:
# Ask the LLM for problem-framing advice (objective, target, metric, key features).
# The only dataset information sent is a plain-text preview of the first ten rows of `df`.
# NOTE(review): the instructions speak of a "dataset profile", but this prompt carries only the ten-row preview.
data_preview = df.head(10).to_string()
response = client.responses.create(
    model="gpt-5-mini",
    instructions="""
You are an expert data scientist with extensive knowledge of tree-based models.
Use ONLY the information inside the dataset profile text.
Do NOT invent correlations, columns, or values.
If something is not in the dataset profile, state 'Not shown in profile'.
Always justify recommendations using reasoning trace based ONLY on the dataset profile.
""",
    input=f"""Dataset info: {data_preview}\n
    Context:
    The business problem is to identify individuals at high risk of diabetes early and to predict the likelihood of diabetes using non-invasive, survey-based indicators (e.g. BMI, blood pressure, lifestyle habits), so that preventive interventions can be prioritised.\n
    Questions
    1. Based on the context and dataset info, how should i approach modelling objective? focus on problem framing aspects.
    2. What would be the most meaningful target?
    3. What would be most important metric for scoring?
    4. What are the top 3 most potentially important features?
    """)
# Show the model's plain-text answer in the cell output.
print(response.output_text)

Below I answer each question and justify recommendations using only the information shown in the dataset profile and the supplied context.

1) How to approach the modelling objective (problem framing)
- Frame this as a supervised binary classification task: the dataset contains a Diabetes_binary column with values 0.0/1.0, and the business goal is to predict the likelihood that an individual has diabetes so preventive interventions can be prioritised.
- Use probability outputs (risk scores) rather than just class labels so you can set operational thresholds for intervention capacity and trade off false negatives vs false positives.
- Treat it as a cross‑sectional risk‑screening model: the features shown are non‑invasive survey indicators (BMI, blood pressure indicators, lifestyle, demographic and access variables). The profile does not show any temporal or longitudinal information, so assume prediction is from current survey features (Not shown in profile: whether labels represent futu

In [None]:
import pandas as pd
import numpy as np
from io import StringIO

# ---------------------------
# Generate a full dataset profile
# ---------------------------
# Builds one plain-text report (`payload_text`) describing `df`: dtypes, summary statistics,
# nulls, cardinality, correlations, value counts, IQR outliers, ID-like columns and duplicates.

buffer = StringIO()

# Computed once and reused by several sections below (nunique is a full pass over the frame).
cat_cols = df.select_dtypes(include='object').columns
unique_counts = df.nunique()

# dtypes
buffer.write("=== DTYPES ===\n")
buffer.write(df.dtypes.to_string())
buffer.write("\n\n")

# numeric describe
buffer.write("=== NUMERIC DESCRIBE ===\n")
buffer.write(df.describe().to_string())
buffer.write("\n\n")

# categorical describe
# An explicit emptiness check replaces the former bare `except:`, which reported
# "No categorical columns" for *any* failure and so could hide real errors.
buffer.write("=== CATEGORICAL DESCRIBE ===\n")
if len(cat_cols) > 0:
    buffer.write(df.describe(include='object').to_string())
else:
    buffer.write("No categorical columns")
buffer.write("\n\n")

# null summary: absolute count and fraction of missing values per column
buffer.write("=== NULL SUMMARY ===\n")
null_summary = (
    df.isna().sum().to_frame("null_count")
    .assign(null_pct=lambda x: x["null_count"]/len(df))
)
buffer.write(null_summary.to_string())
buffer.write("\n\n")

# unique cardinality
buffer.write("=== UNIQUE VALUES PER COLUMN ===\n")
buffer.write(unique_counts.to_frame("unique_count").to_string())
buffer.write("\n\n")

# correlation matrix
buffer.write("=== CORRELATIONS (NUMERIC ONLY) ===\n")
buffer.write(df.corr(numeric_only=True).round(3).to_string())
buffer.write("\n\n")

# value counts for categoricals
buffer.write("=== VALUE COUNTS (TOP 20 PER CATEGORICAL COLUMN) ===\n")
if len(cat_cols) > 0:
    for col in cat_cols:
        buffer.write(f"\nColumn: {col}\n")
        vc = df[col].value_counts().head(20)
        buffer.write(vc.to_string())
        buffer.write("\n")
else:
    buffer.write("No categorical columns\n")
buffer.write("\n")

# outlier summary: count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] per numeric column
buffer.write("=== OUTLIER SUMMARY (IQR METHOD) ===\n")
num_cols = df.select_dtypes(include=['number']).columns  # exclude booleans
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
outliers = ((df[num_cols] < (Q1 - 1.5*IQR)) | (df[num_cols] > (Q3 + 1.5*IQR))).sum()
buffer.write(outliers.to_string())
buffer.write("\n\n")

# leakage scan: columns whose value is different on every row (ID-like columns)
buffer.write("=== POSSIBLE LEAKAGE COLUMNS (UNIQUE FOR EACH ROW) ===\n")
leak_cols = df.columns[unique_counts == len(df)]
buffer.write(str(list(leak_cols)))
buffer.write("\n\n")

# shape, duplicates, constant cols
buffer.write("=== SHAPE / DUPLICATES / CONSTANT COLUMNS ===\n")
dup_count = df.duplicated().sum()
constant_cols = df.columns[unique_counts == 1].tolist()
buffer.write(f"Rows: {len(df)}, Columns: {df.shape[1]}\n")
buffer.write(f"Duplicate rows: {dup_count}\n")
buffer.write(f"Constant columns: {constant_cols}\n\n")

# Final text
payload_text = buffer.getvalue()

print(payload_text)

=== DTYPES ===
Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64

=== NUMERIC DESCRIBE ===
       Diabetes_binary         HighBP       HighChol      CholCheck            BMI         Smoker         Stroke  HeartDiseaseorAttack   PhysActivity         Fruits        Veggies  HvyAlcoholConsump  AnyHealthcare    NoDocbcCost        GenHlth       MentH

In [None]:
# Send the full text profile (`payload_text`) to the LLM and ask for (1) a prioritised list of
# data-quality issues, (2) redundant / correlated / leakage-prone columns, and (3) a Python
# clean-up script built from one helper per issue plus a flag-driven wrapper.
response = client.responses.create(
    model="gpt-5-mini",
    instructions="""
You are an expert data scientist with extensive knowledge of tree-based models.
Always justify recommendations using reasoning trace based ONLY on the dataset profile.
""",
    input=f"""
Dataset info: {payload_text}\n
Questions:\n
1. Based on the dataset profile, what data quality issues should be resolved before modelling?
Provide a priority list and justify each item. \n
2. Which columns appear redundant, correlated, or likely to cause leakage?
Explain why each is problematic. \n
Next: Provide a python script to handle the identified issues.
Define one helper function for each issue.
Then define a wrapper function that calls these helper with true false option as user choice
Provide a single line of code to run the overall wrapper function.
Do not encode categorical columns or model first.
""")

# Show the model's plain-text answer in the cell output.
print(response.output_text)


Below I first summarize the data-quality problems I recommend you fix (priority-ordered with justification based only on the supplied dataset profile). Then I list the columns that look redundant / correlated / potentially leak information (again justified from the profile). Finally I provide a self-contained Python script that implements one helper function per issue and a wrapper that runs them under user control. The script does not encode categoricals or run any model.

1) Priority list of data-quality issues to resolve (highest priority first), with justification from the profile

- 1 — Remove duplicate rows (HIGH priority)
  - Why: Profile shows 24,206 duplicate rows out of 253,680 total rows (~9.5% duplicates). Duplicates inflate training sample size and bias model evaluation if not intentional; removing duplicates is low-risk and high-impact.
  - Evidence: "Duplicate rows: 24206"

- 2 — Handle very low-variance / near-constant binary features (HIGH priority)
  - Why: Near-const

In [None]:
import pandas as pd
import numpy as np

def remove_duplicates(df):
    """
    Drop rows that are exact copies of an earlier row and renumber the index from 0.

    The input frame is not modified.

    Returns:
        (deduplicated_df, info) where info is a dict with the keys
        'duplicates_removed', 'rows_before' and 'rows_after'.
    """
    deduplicated = df.drop_duplicates().reset_index(drop=True)
    rows_before, rows_after = len(df), len(deduplicated)
    info = {
        "duplicates_removed": rows_before - rows_after,
        "rows_before": rows_before,
        "rows_after": rows_after,
    }
    print(f"Removed {info['duplicates_removed']} duplicate rows.")
    return deduplicated, info

def drop_low_variance_features(df, threshold=0.95, target_col='Diabetes_binary'):
    """
    Remove near-constant feature columns.

    A column is dropped when its single most frequent value (NaN counts as a value)
    covers at least `threshold` of the rows, e.g. threshold=0.95 removes columns
    that are >=95% one value. `target_col` is never dropped.

    Returns:
        (df_reduced, dropped_columns_list); the input frame is not modified.
    """
    def _is_near_constant(name):
        shares = df[name].value_counts(normalize=True, dropna=False)
        # value_counts sorts descending, so the first entry is the dominant value's share.
        return len(shares) > 0 and shares.iloc[0] >= threshold

    dropped = [
        name for name in df.columns
        if name != target_col and _is_near_constant(name)
    ]
    df2 = df.drop(columns=dropped)
    print(f"Dropped {len(dropped)} low-variance columns: {dropped}")
    return df2, dropped

def drop_highly_correlated(df, target_col='Diabetes_binary', corr_threshold=0.50):
    """
    Drop features that are highly correlated with each other (absolute corr >= corr_threshold).

    For each pair with |corr| >= corr_threshold, keep the feature with the higher absolute
    correlation to the target (on a tie the first of the pair is kept). Works on numeric
    columns only. Once a feature has been marked for dropping it takes no further part in
    comparisons, so it cannot cause additional features to be removed.

    Parameters:
        df: input DataFrame (not modified).
        target_col: numeric target column; never dropped.
        corr_threshold: absolute Pearson correlation at or above which a pair is redundant.

    Returns:
        (df_reduced, dropped_columns_list) with the dropped names sorted alphabetically.

    Raises:
        ValueError: if target_col is missing or not numeric.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col not in numeric:
        raise ValueError("target_col must be numeric and present in the dataframe")
    features = [c for c in numeric if c != target_col]
    corr = df[features].corr().abs()
    to_drop = set()
    # Precompute feature-target correlations
    target_corr = df[features].corrwith(df[target_col]).abs().to_dict()
    # iterate upper triangle pairs
    for i, fi in enumerate(features):
        if fi in to_drop:
            continue
        for fj in features[i+1:]:
            if fj in to_drop:
                continue
            if corr.at[fi, fj] >= corr_threshold:
                # drop the one with lower abs corr with target; if tie, drop fj
                if target_corr.get(fi, 0) >= target_corr.get(fj, 0):
                    to_drop.add(fj)
                else:
                    to_drop.add(fi)
                    # fi is gone: stop comparing it, otherwise it could still
                    # eliminate later features it happens to correlate with.
                    break
    dropped = sorted(to_drop)
    df2 = df.drop(columns=dropped)
    print(f"Dropped {len(dropped)} highly correlated features (threshold={corr_threshold}): {dropped}")
    return df2, dropped

def winsorize_numeric(df, cols=None, lower_pct=0.01, upper_pct=0.99, exclude_binary=True):
    """
    Clip (winsorize) numeric columns to their [lower_pct, upper_pct] quantile range.

    Parameters:
        df: input DataFrame (not modified; a copy is clipped and returned).
        cols: columns to clip. When None, all numeric columns are candidates.
        lower_pct / upper_pct: quantiles used as the lower / upper clipping bounds.
        exclude_binary: only used when cols is None; if True, candidates with at most
            two distinct values (NaN counted as a value) are left out.

    Returns:
        (df_winsorized, cols_winsorized) where cols_winsorized lists the columns
        that were actually clipped.
    """
    if cols is None:
        candidates = df.select_dtypes(include=[np.number]).columns.tolist()
        if exclude_binary:
            cols = [name for name in candidates if df[name].nunique(dropna=False) > 2]
        else:
            cols = candidates

    result = df.copy()
    clipped_cols = []
    for name in cols:
        if name not in df.columns:
            continue
        low_bound = result[name].quantile(lower_pct)
        high_bound = result[name].quantile(upper_pct)
        # Skip columns without usable bounds or without any spread between them.
        if pd.isna(low_bound) or pd.isna(high_bound) or not (low_bound < high_bound):
            continue
        result[name] = result[name].clip(lower=low_bound, upper=high_bound)
        clipped_cols.append(name)
    print(f"Winsorized {len(clipped_cols)} columns at {int(lower_pct*100)}th and {int(upper_pct*100)}th percentiles: {clipped_cols}")
    return result, clipped_cols

def impute_missing(df, strategy='median'):
    """
    Fill missing values column by column; columns without nulls are left untouched.

    - Numeric columns are filled with their median (default) or mean.
    - Any other column is filled with its most frequent value (NaN if it has none).

    Returns:
        (df_imputed, imputation_map) where imputation_map maps each imputed column
        to the fill value used. The input frame is not modified.

    Raises:
        ValueError: if a numeric column needs imputing and strategy is neither
        'median' nor 'mean'.
    """
    numeric_fillers = {'median': pd.Series.median, 'mean': pd.Series.mean}
    df2 = df.copy()
    imputation_map = {}
    for col in df2.columns:
        series = df2[col]
        if not series.isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(series):
            if strategy not in numeric_fillers:
                raise ValueError("strategy must be 'median' or 'mean' for numeric")
            fill = numeric_fillers[strategy](series)
        else:
            modes = series.mode(dropna=True)
            fill = np.nan if modes.empty else modes.iloc[0]
        df2[col] = series.fillna(fill)
        imputation_map[col] = fill

    if imputation_map:
        print(f"Imputed missing columns: {imputation_map}")
    else:
        print("No missing values detected; no imputation performed.")
    return df2, imputation_map

def preprocess_wrapper(
    df,
    remove_duplicates_flag=True,
    drop_low_variance_flag=True,
    low_variance_threshold=0.95,
    drop_correlated_flag=True,
    corr_threshold=0.50,
    winsorize_flag=False,
    winsorize_lower=0.01,
    winsorize_upper=0.99,
    impute_missing_flag=False,
    target_col='Diabetes_binary'
):
    """
    Run the enabled preprocessing helpers on a copy of `df`, in this fixed order:
      1. remove duplicates
      2. drop low-variance features (>= low_variance_threshold)
      3. drop highly correlated features (abs corr >= corr_threshold)
      4. winsorize numeric non-binary features (optional)
      5. impute missing values with the median (optional)

    Returns:
        (df_processed, summary_dict). summary_dict['steps'] lists the steps that ran;
        each executed step also stores its helper's second return value under its own key
        ('duplicates_info', 'low_variance_dropped', 'correlated_dropped', 'winsorized', 'imputed').

    NOTE(review): a later cell in this notebook defines another `remove_duplicates` that returns
    (df, n_removed); if that cell has been run, 'duplicates_info' holds an int instead of a dict.
    """
    steps_done = []
    summary = {"steps": steps_done}
    current = df.copy()

    if remove_duplicates_flag:
        current, summary['duplicates_info'] = remove_duplicates(current)
        steps_done.append('duplicates_removed')

    if drop_low_variance_flag:
        current, summary['low_variance_dropped'] = drop_low_variance_features(
            current, threshold=low_variance_threshold, target_col=target_col
        )
        steps_done.append('low_variance_dropped')

    if drop_correlated_flag:
        current, summary['correlated_dropped'] = drop_highly_correlated(
            current, target_col=target_col, corr_threshold=corr_threshold
        )
        steps_done.append('correlated_dropped')

    if winsorize_flag:
        current, summary['winsorized'] = winsorize_numeric(
            current, cols=None, lower_pct=winsorize_lower, upper_pct=winsorize_upper, exclude_binary=True
        )
        steps_done.append('winsorized')

    if impute_missing_flag:
        current, summary['imputed'] = impute_missing(current, strategy='median')
        steps_done.append('imputed')

    print("Preprocessing complete. Steps performed:", summary['steps'])
    return current, summary

# Example single-line run (replace `df` with your dataframe variable):
# processed_df, preprocess_summary = preprocess_wrapper(df,
#                                                      remove_duplicates_flag=True,
#                                                      drop_low_variance_flag=True,
#                                                      low_variance_threshold=0.95,
#                                                      drop_correlated_flag=True,
#                                                      corr_threshold=0.50,
#                                                      winsorize_flag=False,
#                                                      impute_missing_flag=False)


In [None]:
# ---------------------------
# One of the possible responses from GPT.
# What are the differences with yours?
# How would you improve earlier prompt?
# ---------------------------

# Helper 1: remove duplicates
def remove_duplicates(df, inplace=False):
    """
    Remove exact duplicate rows, keeping the first occurrence and the original index labels.

    Parameters:
        df: input DataFrame.
        inplace: if True the caller's frame itself is modified and returned;
            otherwise the work is done on a copy.

    Returns:
        (df_clean, n_removed).
    """
    frame = df if inplace else df.copy()
    rows_before = len(frame)
    frame.drop_duplicates(inplace=True)
    return (frame, rows_before - len(frame))

# Helper 2: convert binary-like float columns to integer / category
def convert_binary_floats(df, inplace=False, as_category=False):
    """
    Convert 0/1 indicator columns to a compact integer (or categorical) dtype.

    A column qualifies when it has exactly 2 unique non-null values and both are
    0 or 1 (after rounding to 6 decimals). Columns whose values cannot be read as
    numbers (e.g. 'M'/'F') are skipped instead of raising.

    Parameters:
        df: input DataFrame.
        inplace: if True modify the caller's frame, otherwise work on a copy.
        as_category: convert to 'category' instead of an integer dtype.

    Returns:
        (df_converted, cols_converted). Integer conversion uses 'int8', or the
        nullable 'Int8' when the column contains missing values (plain int8
        cannot hold NaN and the cast would raise).
    """
    if not inplace:
        df = df.copy()
    cols_converted = []
    for col in df.columns:
        series = df[col]
        non_null = series.dropna()
        # consider only non-null unique values
        if non_null.nunique() != 2:
            continue
        try:
            vals = set(pd.Series(non_null.unique()).astype(float).round(6))
        except (TypeError, ValueError):
            # Two-valued but not numeric-like: not a 0/1 indicator.
            continue
        if not vals.issubset({0.0, 1.0}):
            continue
        if as_category:
            df[col] = series.astype('category')
        elif series.isna().any():
            df[col] = series.astype('Int8')
        else:
            df[col] = series.astype('int8')
        cols_converted.append(col)
    return (df, cols_converted)

# Helper 3: cap/winsorize outliers using IQR method (or percentile clipping option)
def cap_outliers_iqr(df, cols=None, factor=1.5, inplace=False):
    """
    Cap outliers by clipping each column to [Q1 - factor*IQR, Q3 + factor*IQR].

    Parameters:
        df: input DataFrame.
        cols: columns to cap; all numeric columns when None. Names that are missing
            from the frame or not numeric are skipped.
        factor: multiple of the IQR that defines the fences.
        inplace: if True modify the caller's frame, otherwise work on a copy.

    Returns:
        (df_capped, caps) where caps maps each capped column to its (lower, upper) bounds.

    Columns with a zero IQR are left untouched: their fences collapse to a single value,
    so clipping would turn the whole column into a constant (e.g. a rare 0/1 flag would
    lose every 1).
    """
    if not inplace:
        df = df.copy()
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    if cols is None:
        cols = numeric
    numeric_names = set(numeric)
    caps = {}
    for col in cols:
        # Covers both unknown names and non-numeric columns (quantile would raise on those).
        if col not in numeric_names:
            continue
        series = df[col].dropna()
        if series.empty:
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            continue
        lower = q1 - factor * iqr
        upper = q3 + factor * iqr
        # only apply if bounds are finite
        if np.isfinite(lower) and np.isfinite(upper):
            df[col] = df[col].clip(lower=lower, upper=upper)
            caps[col] = (lower, upper)
    return (df, caps)

# Helper 4: drop highly correlated features (automated selection using target correlation)
def drop_correlated(df, target, threshold=0.45, inplace=False, exclude=None):
    """
    Drop one column from each pair with absolute correlation > threshold.

    Selection rule: for a correlated pair (a, b), drop the column with the lower
    absolute Pearson correlation with the target; ties are broken by dropping the
    alphabetically smaller name. If the target is not a numeric column, each column's
    mean absolute correlation across the numeric columns is used as its score instead.
    Pairs involving the target or a column in `exclude` are skipped, so those columns
    are never dropped. A column that has already been dropped takes no part in later
    comparisons.

    Parameters:
        df: input DataFrame.
        target: name of the target column.
        threshold: absolute correlation above which a pair counts as redundant.
        inplace: if True modify the caller's frame, otherwise work on a copy.
        exclude: optional iterable of column names to protect.

    Returns:
        (df_reduced, dropped_cols) with dropped_cols in the order they were selected.
    """
    if not inplace:
        df = df.copy()
    protected = set(exclude or []) | {target}
    # compute correlations on numeric columns only
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    corr = df[numeric].corr().abs()
    cols = corr.columns.tolist()
    # Per-column score, taken once from the matrix already computed above
    # (previously the whole matrix was recomputed for every correlated pair).
    scores = corr[target] if target in numeric else corr.mean()
    dropped = set()
    dropped_cols = []
    # iterate upper triangle
    for i, a in enumerate(cols):
        if a in protected or a in dropped:
            continue
        for b in cols[i+1:]:
            if b in protected or b in dropped:
                continue
            if not corr.at[a, b] > threshold:
                continue
            a_score, b_score = scores[a], scores[b]
            # drop the one with smaller score; tie-break by column name
            if a_score < b_score:
                pick = a
            elif b_score < a_score:
                pick = b
            else:
                pick = min(a, b)
            dropped.add(pick)
            dropped_cols.append(pick)
            if pick == a:
                # a is gone: it must not eliminate further columns.
                break
    df.drop(columns=dropped_cols, inplace=True)
    return (df, dropped_cols)

# Helper 5: simple target balancing (undersample or oversample). Use only when desired.
def balance_target_simple(df, target, method='undersample', random_state=42, inplace=False):
    """
    Simple balancing between the two most frequent classes of `target`:
      - 'undersample': randomly downsample the majority class to the minority class size.
      - 'oversample': draw, with replacement, as many minority rows as the majority class has.

    The balanced frame is always a new, shuffled DataFrame with a fresh 0..n-1 index;
    the input frame is never modified. `inplace` only matters when fewer than two
    classes are present: the frame is then returned unchanged (a copy unless inplace=True).

    Parameters:
        df: input DataFrame.
        target: name of the class column.
        method: 'undersample' or 'oversample'.
        random_state: seed used for every sampling / shuffling step.
        inplace: see above.

    Returns:
        (df_balanced, counts) where counts is a dict of the resulting class counts.

    Raises:
        ValueError: if `target` is not a column or `method` is not recognised.
    """
    if target not in df.columns:
        raise ValueError("target not in dataframe")
    counts = df[target].value_counts()
    if len(counts) < 2:
        return (df if inplace else df.copy(), counts.to_dict())
    classes = counts.index.tolist()
    c0, c1 = classes[0], classes[1]
    n0, n1 = counts.iloc[0], counts.iloc[1]
    if method == 'undersample':
        # downsample majority
        if n0 > n1:
            maj, minc = c0, c1
            n_min = n1
        else:
            maj, minc = c1, c0
            n_min = n0
        maj_df = df[df[target] == maj].sample(n=n_min, random_state=random_state)
        min_df = df[df[target] == minc]
        df_bal = pd.concat([maj_df, min_df]).sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    elif method == 'oversample':
        # upsample minority
        if n0 < n1:
            minc = c0
            n_maj = n1
        else:
            minc = c1
            n_maj = n0
        min_df = df[df[target] == minc]
        sampled = min_df.sample(n=n_maj, replace=True, random_state=random_state)
        maj_df = df[df[target] != minc]
        df_bal = pd.concat([maj_df, sampled]).sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    else:
        raise ValueError("method must be 'undersample' or 'oversample'")
    return (df_bal, df_bal[target].value_counts().to_dict())

# Helper 6: simple reporter to summarize key issues (optional)
def report_summary(df, target='Diabetes_binary'):
    """
    Build a small diagnostic dict for `df`.

    Keys: 'rows', 'cols', 'duplicates'; 'target_mean' and 'target_counts' when the
    target column exists; 'iqr_outliers_sample_cols' with 1.5*IQR outlier counts for
    whichever of BMI / MentHlth / PhysHlth are present; and 'top_corrs_with_target'
    (up to 10 largest absolute correlations) when the target is numeric.
    """
    def _count_iqr_outliers(values):
        # Number of values outside the 1.5*IQR fences.
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1
        lower, upper = q1 - 1.5*spread, q3 + 1.5*spread
        return int(((values < lower) | (values > upper)).sum())

    out = {
        'rows': len(df),
        'cols': df.shape[1],
        'duplicates': df.duplicated().sum(),
    }
    if target in df.columns:
        out['target_mean'] = float(df[target].mean())
        out['target_counts'] = df[target].value_counts().to_dict()

    # numeric IQR outlier counts for selected columns that were flagged in profile
    out['iqr_outliers_sample_cols'] = {
        col: _count_iqr_outliers(df[col].dropna())
        for col in ('BMI', 'MentHlth', 'PhysHlth')
        if col in df.columns
    }

    # top absolute correlations with target
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    if target in numeric:
        ranked = df[numeric].corr()[target].abs().sort_values(ascending=False)
        out['top_corrs_with_target'] = ranked.head(10).to_dict()
    return out

# Wrapper function that calls the helpers based on user options
def preprocess(
    df,
    target='Diabetes_binary',
    remove_duplicates_flag=True,
    convert_binaries_flag=True,
    cap_outliers_flag=True,
    outlier_cols=None,
    drop_correlated_flag=True,
    corr_threshold=0.45,
    balance_flag=False,
    balance_method='undersample',
    balance_random_state=42,
    inplace=False
):
    """
    Apply the data-prep helpers selected by the boolean flags, in a fixed order:
    duplicates -> binary conversion -> outlier capping -> correlated-feature drop -> balancing.

    Unless `inplace` is True the input frame is copied first; the helpers then operate
    on that working frame in place.

    Returns:
        (df_processed, report). `report` records what each step did (a neutral default is
        stored for every skipped step), the row/column counts before and after, and a final
        `report_summary` under the key 'summary'.
    """
    work = df if inplace else df.copy()
    report = {
        'initial_rows': len(work),
        'initial_cols': work.shape[1],
    }

    # 1) remove duplicates
    report['duplicates_removed'] = 0
    if remove_duplicates_flag:
        work, n_removed = remove_duplicates(work, inplace=True)
        report['duplicates_removed'] = int(n_removed)

    # 2) convert binary floats
    report['binary_cols_converted'] = []
    if convert_binaries_flag:
        work, report['binary_cols_converted'] = convert_binary_floats(work, inplace=True, as_category=False)

    # 3) cap outliers on specified continuous columns (default to BMI, MentHlth, PhysHlth)
    report['outlier_caps'] = {}
    if cap_outliers_flag:
        if outlier_cols is None:
            # choose sensible defaults based on profile
            outlier_cols = [name for name in ['BMI', 'MentHlth', 'PhysHlth'] if name in work.columns]
        work, report['outlier_caps'] = cap_outliers_iqr(work, cols=outlier_cols, factor=1.5, inplace=True)

    # 4) drop correlated features
    report['dropped_correlated'] = []
    if drop_correlated_flag:
        work, report['dropped_correlated'] = drop_correlated(
            work, target=target, threshold=corr_threshold, inplace=True, exclude=None
        )

    # 5) optional simple balancing
    report['balanced_counts'] = None
    if balance_flag:
        work, report['balanced_counts'] = balance_target_simple(
            work, target=target, method=balance_method, random_state=balance_random_state, inplace=True
        )
        report['balance_method'] = balance_method

    # final summary
    report['final_rows'] = len(work)
    report['final_cols'] = work.shape[1]
    report['summary'] = report_summary(work, target=target)

    return work, report


# Wrapper function that calls the helpers based on user boolean choices
# User can edit the default parameters here as required
def clean_data(
    df,
    drop_id=True,
    drop_constants=True,
    resolve_collinearity=False,  # off by default: near-duplicate columns are not always a problem
    drop_model=False,
    id_col='Unnamed: 0',
    drop_screen_by_default=True,
):
    """
    Apply the optional cleaning steps, in a fixed order, to a copy of ``df``.

    Steps (each one can be switched off through its flag):
      1. drop the identifier column ``id_col``
      2. drop constant columns
      3. resolve the Weight_kg / Screen_Size_inch pair (only if ``resolve_collinearity``)
      4. drop the ``Model`` column

    Parameters:
      - df: input DataFrame (never modified; the work happens on a copy)
      - drop_id: drop the identifier column named by ``id_col`` if True
      - drop_constants: drop columns holding a single unique value if True
      - resolve_collinearity: drop one of Weight_kg / Screen_Size_inch if True
      - drop_model: drop the Model column if True
      - id_col: name of the identifier column (default 'Unnamed: 0')
      - drop_screen_by_default: when resolving collinearity, drop Screen_Size_inch
        (keeping Weight_kg) if True, otherwise drop Weight_kg
    Returns:
      - cleaned_df: DataFrame after the selected steps
      - summary: dict with keys 'unique_id', 'constant_columns', 'collinearity' and
        'model_column', each holding the info dict returned by that step
    """
    cleaned = df.copy()

    # Steps 1 and 2: identifier and constant columns.
    cleaned, id_info = drop_unique_identifier(cleaned, col_name=id_col, do_drop=drop_id)
    cleaned, constant_info = drop_constant_columns(cleaned, do_drop=drop_constants)

    # Step 3: only touch the collinear pair when explicitly requested;
    # otherwise report an empty result for this step.
    collinearity_info = {'dropped': [], 'kept': []}
    if resolve_collinearity:
        cleaned, collinearity_info = resolve_weight_screen_collinearity(
            cleaned, drop_screen=drop_screen_by_default
        )

    # Step 4: optional removal of the Model column.
    cleaned, model_info = drop_model_column(cleaned, do_drop=drop_model)

    report = {
        'unique_id': id_info,
        'constant_columns': constant_info,
        'collinearity': collinearity_info,
        'model_column': model_info,
    }
    return cleaned, report

# Example single-line execution (assuming your DataFrame is named `df`).
# With the default flags this call drops the `Unnamed: 0` column (if present) and any
# constant columns, and keeps `Model`. The Weight_kg / Screen_Size_inch step is skipped
# because resolve_collinearity defaults to False.
# To drop Model as well, set drop_model=True.
# NOTE: clean_data relies on helper functions (drop_unique_identifier, drop_constant_columns,
# resolve_weight_screen_collinearity, drop_model_column) that are defined in a later cell;
# run that cell first, otherwise this call fails with NameError.
cleaned_df, cleaning_summary = clean_data(df)

NameError: name 'drop_unique_identifier' is not defined

In [None]:
# Helper functions for the 'clean_data' wrapper function

def drop_unique_identifier(df, col_name, do_drop=True):
    """
    Remove the identifier column ``col_name`` when asked to and when it exists.

    Returns a ``(frame, info)`` pair. ``info['dropped']`` lists the removed column,
    or is empty when nothing was removed, in which case the input frame is
    returned as-is.
    """
    # Guard clause: nothing to do when dropping is disabled or the column is absent.
    if not do_drop or col_name not in df.columns:
        return df, {'dropped': []}
    return df.drop(columns=[col_name]), {'dropped': [col_name]}

def drop_constant_columns(df, do_drop=True):
    """
    Remove every column that holds at most one distinct value.

    Missing values count as a value of their own (``nunique(dropna=False)``), so an
    all-NaN column is constant while a column mixing one value with NaN is not.
    Returns a ``(frame, info)`` pair where ``info['dropped']`` lists the removed
    columns. With ``do_drop=False`` the input frame is returned untouched.
    """
    if not do_drop:
        return df, {'dropped': []}

    single_valued = [name for name in df.columns if df[name].nunique(dropna=False) <= 1]
    # errors='ignore' tolerates names that are no longer present in the frame.
    trimmed = df.drop(columns=single_valued, errors='ignore')
    return trimmed, {'dropped': single_valued}

def resolve_weight_screen_collinearity(df, drop_screen=True):
    """
    Keep only one of the 'Weight_kg' / 'Screen_Size_inch' pair.

    Specific to a laptop-style dataset. When both columns exist, one is dropped:
    'Screen_Size_inch' if ``drop_screen`` is True, otherwise 'Weight_kg'. When at
    most one of them exists the frame is returned unchanged and the surviving
    column (if any) is reported as kept.
    Returns ``(frame, {'dropped': [...], 'kept': [...]})``.
    """
    has_weight = 'Weight_kg' in df.columns
    has_screen = 'Screen_Size_inch' in df.columns

    if has_weight and has_screen:
        if drop_screen:
            loser, survivor = 'Screen_Size_inch', 'Weight_kg'
        else:
            loser, survivor = 'Weight_kg', 'Screen_Size_inch'
        return df.drop(columns=[loser]), {'dropped': [loser], 'kept': [survivor]}

    # At most one column of the pair is present: nothing to drop, just report it.
    if has_weight:
        present = ['Weight_kg']
    elif has_screen:
        present = ['Screen_Size_inch']
    else:
        present = []
    return df, {'dropped': [], 'kept': present}

def drop_model_column(df, do_drop=True):
    """
    Remove the 'Model' column when requested and present.

    Specific to a hypothetical laptop dataset. Returns ``(frame, info)`` where
    ``info['dropped']`` is ``['Model']`` if the column was removed and ``[]``
    otherwise (in which case the input frame is returned as-is).
    """
    target = 'Model'
    removable = do_drop and target in df.columns
    removed = [target] if removable else []
    result = df.drop(columns=[target]) if removable else df
    return result, {'dropped': removed}


In [None]:
# Column dtypes and non-null counts of `df` (the frame loaded from the GitHub raw URL).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [None]:
# Load the BRFSS 2015 diabetes health-indicators CSV straight from the GitHub raw URL.
github_raw_url = 'https://raw.githubusercontent.com/hazelchiang2102/ADALL_github/refs/heads/main/diabetes_binary_health_indicators_BRFSS2015.csv'
try:
    df = pd.read_csv(github_raw_url)
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure the URL is correct and the file format is compatible with `pd.read_csv`.")
    # Re-raise after the hint: every later cell needs `df`, so carrying on would only
    # defer the failure to a confusing NameError or silently reuse a stale `df`.
    raise
else:
    # Success path kept outside the `try` so only the download/parse step is guarded.
    print("Successfully loaded data from GitHub!")
    display(df.head())

Successfully loaded data from GitHub!


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Read the 2015 BRFSS health-indicators CSV from local storage.
# NOTE(review): `year` is assigned here but the path below hardcodes "2015" instead of using it.
# NOTE(review): the absolute /content/sample_data path presumably only exists on Colab, and the
# file has the same name as the one already loaded into `df` from GitHub — confirm the second read is needed.
year = '2015'
brfss_2015_dataset = pd.read_csv('/content/sample_data/diabetes_binary_health_indicators_BRFSS2015.csv')

In [None]:
# Number of rows and columns in the freshly loaded frame.
brfss_2015_dataset.shape

(253680, 22)

In [None]:
# Check that the data loaded in the expected format.
# Raise the display limit first so every column is shown instead of being elided with "...".
pd.set_option('display.max_columns', 500)
brfss_2015_dataset.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Select the target (Diabetes_binary) plus the indicator columns used in the analysis.
# The list names 22 columns and the shape check that follows reports 22 columns, so for
# this file the selection keeps every column; it mainly fixes the column order.
brfss_df_selected = brfss_2015_dataset[['Diabetes_binary',
                                         'HighBP',
                                         'HighChol', 'CholCheck',
                                         'BMI',
                                         'Smoker',
                                         'Stroke', 'HeartDiseaseorAttack',
                                         'PhysActivity',
                                         'Fruits', 'Veggies',
                                         'HvyAlcoholConsump',
                                         'AnyHealthcare', 'NoDocbcCost',
                                         'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk',
                                         'Sex', 'Age', 'Education', 'Income']]

In [None]:
# Confirm the row and column counts after the column selection.
brfss_df_selected.shape

(253680, 22)

In [None]:
# Preview the first rows of the selected columns.
brfss_df_selected.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Drop rows that contain any missing value.
# On this file nothing is removed: the saved shape below is still (253680, 22).
brfss_df_selected = brfss_df_selected.dropna()
brfss_df_selected.shape

(253680, 22)

In [None]:
# Dtypes and non-null counts of `df` (the GitHub-loaded frame, not brfss_df_selected).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [None]:
# Set up the OpenAI client used by the LLM request further down.
from google.colab import userdata
from openai import OpenAI

# Load key from Google Colab Secrets (kept out of the notebook source)
api_key = userdata.get('OPENAI_API_KEY')

# Client handle reused by the later `client.responses.create(...)` call.
client = OpenAI(
    api_key=api_key
)

In [None]:
# Target: Diabetes_binary (this recode was originally written for the 3-level DIABETE3 variable).
# The intended ordinal scale was 0 = no diabetes (or only during pregnancy),
# 1 = pre-diabetes / borderline diabetes, 2 = diabetes.
# This file's column only holds 0.0 (no diabetes) and 1.0 (diabetes), so the line below maps 1 -> 2;
# the pre-diabetes level (1) cannot be derived from a binary column.
# NOTE(review): after this cell the target is {0, 2}, yet the class-size and describe() outputs
# saved further down show {0, 1} — the saved outputs come from different runs. Confirm whether
# this recode is still wanted before modelling.
brfss_df_selected['Diabetes_binary'] = brfss_df_selected['Diabetes_binary'].replace({1:2})
# Codes 7 (don't know) and 9 (refused) were not seen in Diabetes_binary, so the filters stay disabled:
# brfss_df_selected = brfss_df_selected[brfss_df_selected.Diabetes_binary != 7]
# brfss_df_selected = brfss_df_selected[brfss_df_selected.Diabetes_binary != 9]
brfss_df_selected['Diabetes_binary'].unique()

array([0., 2.])

In [None]:
# Render the first ten rows of `df` as plain text so they can be embedded in the LLM prompt later.
# Note this previews `df` (the GitHub-loaded frame), not the recoded brfss_df_selected.
data_preview = df.head(10).to_string()
print(data_preview)

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  Income
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0                   0.0           0.0     0.0      1.0                0.0            1.0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0     3.0
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0                   0.0           1.0     0.0      0.0                0.0            0.0          1.0      3.0       0.0       0.0       0.0  0.0   7.0        6.0     1.0
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0                   0.0           0.0     1.0      0.0                0.0            1.0          1.0      5.0      30.0      30.0       1.0  0.0   9.0        4.0     8.0
3              0.0     1.0       0.0        

In [None]:
# Ask the LLM for problem-framing advice, using the ten-row text preview as dataset context.
# NOTE(review): the instructions refer to a "dataset profile", but the input only carries
# `data_preview` (ten rows) — confirm whether a fuller profile text should be sent instead.
response = client.responses.create(
    model="gpt-5-mini",
    instructions="""
You are an expert data scientist with extensive knowledge of tree-based models.
Use ONLY the information inside the dataset profile text.
Do NOT invent correlations, columns, or values.
If something is not in the dataset profile, state 'Not shown in profile'.
Always justify recommendations using reasoning trace based ONLY on the dataset profile.
""",
    input=f"""Dataset info: {data_preview}\n
    Context:
    The business problem is that Healthcare system and public health agencies lack a scalable, data-driven way to identify individuals at high risk of diabetes early, using routinely collected, low-cost health data indicators.
    Questions
    1. Based on the context and dataset info, how should i approach modelling objective? focus on problem framing aspects.
    2. What would be the most meaningful target?
    3. What would be most important metric for scoring?
    4. What are the top 3 most potentially important features?
    """)
# Show the model's plain-text answer.
print(response.output_text)

Summary answer (directly tied to the provided dataset profile and business context)

1) Problem framing / modelling objective
- Frame this as a supervised binary classification task: predict the column Diabetes_binary from the other routinely collected indicators in the table. This directly matches the business goal of identifying individuals at high risk of diabetes early.
- Use a model and pipeline designed for early identification / screening: produce well-calibrated probability scores (so stakeholders can set triage thresholds) and optimize for catching as many true high-risk individuals as feasible (see metrics below).
- Modeling approach choices driven by the dataset profile:
  - The predictors are a mix of binary indicators (e.g., HighBP, Smoker, HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, etc.) and numeric fields (BMI, Age, GenHlth/MentHlth/PhysHlth appear numeric-coded). Tree-based ensembles (random forest, gradient-boosted trees) are a natural first choice because th

In [None]:
# List the column names of `df` for reference.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np
from io import StringIO

# ---------------------------
# Generate a full dataset profile
# ---------------------------
# Builds one plain-text report about `df` (dtypes, summary statistics, nulls, cardinality,
# correlations, outliers, duplicates) in `payload_text` and prints it.

buffer = StringIO()

# Object-dtype columns are treated as the categorical columns throughout this cell.
cat_cols = df.select_dtypes(include='object').columns

# dtypes
buffer.write("=== DTYPES ===\n")
buffer.write(df.dtypes.to_string())
buffer.write("\n\n")

# numeric describe
buffer.write("=== NUMERIC DESCRIBE ===\n")
buffer.write(df.describe().to_string())
buffer.write("\n\n")

# categorical describe
# describe(include='object') fails when the frame has no object columns, so branch on the
# column list explicitly instead of hiding every possible error behind a bare `except:`.
buffer.write("=== CATEGORICAL DESCRIBE ===\n")
if len(cat_cols) > 0:
    buffer.write(df.describe(include='object').to_string())
else:
    buffer.write("No categorical columns")
buffer.write("\n\n")

# null summary: absolute count and fraction of missing values per column
buffer.write("=== NULL SUMMARY ===\n")
null_summary = (
    df.isna().sum().to_frame("null_count")
    .assign(null_pct=lambda x: x["null_count"]/len(df))
)
buffer.write(null_summary.to_string())
buffer.write("\n\n")

# unique cardinality
buffer.write("=== UNIQUE VALUES PER COLUMN ===\n")
buffer.write(df.nunique().to_frame("unique_count").to_string())
buffer.write("\n\n")

# correlation matrix
buffer.write("=== CORRELATIONS (NUMERIC ONLY) ===\n")
buffer.write(df.corr(numeric_only=True).round(3).to_string())
buffer.write("\n\n")

# value counts for categoricals
buffer.write("=== VALUE COUNTS (TOP 20 PER CATEGORICAL COLUMN) ===\n")
if len(cat_cols) > 0:
    for col in cat_cols:
        buffer.write(f"\nColumn: {col}\n")
        vc = df[col].value_counts().head(20)
        buffer.write(vc.to_string())
        buffer.write("\n")
else:
    buffer.write("No categorical columns\n")
buffer.write("\n")

# --------- FIXED OUTLIER COMPUTATION (NO BOOLEANS) ---------
# Count, per numeric column, the values lying more than 1.5 * IQR outside the quartiles.
buffer.write("=== OUTLIER SUMMARY (IQR METHOD) ===\n")
num_cols = df.select_dtypes(include=['number']).columns  # exclude booleans
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
outliers = ((df[num_cols] < (Q1 - 1.5*IQR)) | (df[num_cols] > (Q3 + 1.5*IQR))).sum()
buffer.write(outliers.to_string())
buffer.write("\n\n")

# leakage scan: columns with all unique values
buffer.write("=== POSSIBLE LEAKAGE COLUMNS (UNIQUE FOR EACH ROW) ===\n")
leak_cols = df.columns[df.nunique() == len(df)]
buffer.write(str(list(leak_cols)))
buffer.write("\n\n")

# shape, duplicates, constant cols
buffer.write("=== SHAPE / DUPLICATES / CONSTANT COLUMNS ===\n")
dup_count = df.duplicated().sum()
constant_cols = df.columns[df.nunique() == 1].tolist()
buffer.write(f"Rows: {len(df)}, Columns: {df.shape[1]}\n")
buffer.write(f"Duplicate rows: {dup_count}\n")
buffer.write(f"Constant columns: {constant_cols}\n\n")

# Final text
payload_text = buffer.getvalue()

print(payload_text)


=== DTYPES ===
Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64

=== NUMERIC DESCRIBE ===
       Diabetes_binary         HighBP       HighChol      CholCheck            BMI         Smoker         Stroke  HeartDiseaseorAttack   PhysActivity         Fruits        Veggies  HvyAlcoholConsump  AnyHealthcare    NoDocbcCost        GenHlth       MentH

In [None]:
#1 HighBP
# No recoding needed: the values are already 1.0 (high blood pressure) and 0.0 (no).
# This cell only confirms the unique values.
brfss_df_selected.HighBP.unique()

array([1., 0.])

In [None]:
#2 HighChol (original name was TOLDHI2)
# Recode 2 (No) to 0, then drop rows coded 7 (don't know) or 9 (refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['HighChol'] = brfss_df_selected['HighChol'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.HighChol != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.HighChol != 9]
brfss_df_selected.HighChol.unique()

array([1., 0.])

In [None]:
#3 CholCheck
# Recode 3 and 2 to 0 (cholesterol not checked in the past 5 years), then drop rows coded 9.
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['CholCheck'] = brfss_df_selected['CholCheck'].replace({3:0,2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.CholCheck != 9]
brfss_df_selected.CholCheck.unique()

array([1., 0.])

In [None]:
#4 BMI
# Per the original note, the raw BRFSS extract stores BMI multiplied by 100 (e.g. 4018 means 40.18).
# This file's values are already on the natural scale (e.g. 40.0, 25.0), so no rescaling is applied;
# dividing by 100 and rounding would collapse such values to 0.0.
# The old transformation is kept below for reference only:
# brfss_df_selected['BMI'] = brfss_df_selected['BMI'].div(100).round(0)
# NOTE(review): the saved output of this cell (array([0.])) looks like it predates the removal
# of that line — re-run the cell to refresh it.
brfss_df_selected.BMI.unique()

array([0.])

In [None]:
#5 Smoker (original name was SMOKE100)
# Recode 2 (No) to 0, then drop rows coded 7 (don't know) or 9 (refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['Smoker'] = brfss_df_selected['Smoker'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.Smoker != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.Smoker != 9]
brfss_df_selected.Smoker.unique()

array([1., 0.])

In [None]:
#7 HeartDiseaseorAttack (original name was _MICHD)
# Recode 2 to 0: 2 means the respondent did not have MI or CHD.
# NOTE(review): this code comes from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the replace is likely a no-op here — confirm.
brfss_df_selected['HeartDiseaseorAttack'] = brfss_df_selected['HeartDiseaseorAttack'].replace({2: 0})
brfss_df_selected.HeartDiseaseorAttack.unique()

array([0., 1.])

In [None]:
#8 PhysActivity (original name was _TOTINDA)
# 1 means physical activity; recode 2 (no physical activity) to 0.
# Then drop rows coded 9 (don't know / refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['PhysActivity'] = brfss_df_selected['PhysActivity'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.PhysActivity != 9]
brfss_df_selected.PhysActivity.unique()

array([0., 1.])

In [None]:
#9 Fruits (original name was _FRTLT1)
# Recode 2 to 0 (no fruit consumed per day); 1 means one or more pieces of fruit per day.
# Then drop rows coded 9 (don't know / missing).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['Fruits'] = brfss_df_selected['Fruits'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.Fruits != 9]
brfss_df_selected.Fruits.unique()

array([0., 1.])

In [None]:
#10 Veggies (original name was _VEGLT1)
# Recode 2 to 0 (no vegetables consumed per day); 1 means one or more vegetables per day.
# Then drop rows coded 9 (don't know / missing).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['Veggies'] = brfss_df_selected['Veggies'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.Veggies != 9]
brfss_df_selected.Veggies.unique()

array([1., 0.])

In [None]:
#11 HvyAlcoholConsump (original name was _RFDRHV5)
# Raw BRFSS coding: 1 = not a heavy drinker, 2 = heavy drinker, 9 = don't know / missing.
# This file is already encoded as 0 (no) and 1 (yes), so no recoding is applied:
# the old mapping {1:0, 2:1} would wrongly turn every 'yes' (1) into 'no' (0), and
# describe() shows a maximum of 1.0, so there are no 9s to filter out.
# The old steps are kept below for reference only:
# brfss_df_selected['HvyAlcoholConsump'] = brfss_df_selected['HvyAlcoholConsump'].replace({1:0, 2:1})
# brfss_df_selected = brfss_df_selected[brfss_df_selected.HvyAlcoholConsump != 9]
# NOTE(review): the saved output of this cell (array([0.])) looks like it predates the removal
# of the old mapping — re-run the cell to refresh it.
brfss_df_selected.HvyAlcoholConsump.unique()

array([0.])

In [None]:
#12 AnyHealthcare (original name was HLTHPLN1)
# 1 is yes; recode 2 (no health care access) to 0.
# Then drop rows coded 7 (don't know) or 9 (refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['AnyHealthcare'] = brfss_df_selected['AnyHealthcare'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.AnyHealthcare != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.AnyHealthcare != 9]
brfss_df_selected.AnyHealthcare.unique()

array([1., 0.])

In [None]:
#13 NoDocbcCost (original name was MEDCOST)
# 1 is already yes; recode 2 (no) to 0.
# Then drop rows coded 7 (don't know) or 9 (refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['NoDocbcCost'] = brfss_df_selected['NoDocbcCost'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.NoDocbcCost != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.NoDocbcCost != 9]
brfss_df_selected.NoDocbcCost.unique()

array([0., 1.])

In [None]:
#14 GenHlth (original name was GENHLTH)
# Ordinal variable kept as-is: 1 is Excellent -> 5 is Poor.
# Drop rows coded 7 (don't know) or 9 (refused).
# NOTE(review): the saved output shows only the values 1-5, so on this file the filters
# are likely no-ops — confirm.
brfss_df_selected = brfss_df_selected[brfss_df_selected.GenHlth != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.GenHlth != 9]
brfss_df_selected.GenHlth.unique()

array([5., 3., 2., 4., 1.])

In [None]:
#15 MentHlth (original name was MENTHLTH)
# Already measured in days, so keep it; the scale is 0-30.
# Recode 88 to 0 because it means none (no bad mental health days).
# Drop rows coded 77 (don't know / not sure) or 99 (refused).
# NOTE(review): the saved output shows only values within 0-30, so on this file these
# steps are likely no-ops — confirm.
brfss_df_selected['MentHlth'] = brfss_df_selected['MentHlth'].replace({88:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.MentHlth != 77]
brfss_df_selected = brfss_df_selected[brfss_df_selected.MentHlth != 99]
brfss_df_selected.MentHlth.unique()

array([18.,  0., 30.,  3.,  5., 15., 10.,  6., 20.,  2., 25.,  1.,  4.,
        7.,  8., 21., 14., 26., 29., 16., 28., 11., 12., 24., 17., 13.,
       27., 19., 22.,  9., 23.])

In [None]:
#16 PhysHlth (original name was PHYSHLTH)
# Already measured in days, so keep it; the scale is 0-30.
# Recode 88 to 0 because it means none (no bad physical health days).
# Drop rows coded 77 (don't know / not sure) or 99 (refused).
# NOTE(review): the saved output shows only values within 0-30, so on this file these
# steps are likely no-ops — confirm.
brfss_df_selected['PhysHlth'] = brfss_df_selected['PhysHlth'].replace({88:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.PhysHlth != 77]
brfss_df_selected = brfss_df_selected[brfss_df_selected.PhysHlth != 99]
brfss_df_selected.PhysHlth.unique()

array([15.,  0., 30.,  2., 14., 28.,  7., 20.,  3., 10.,  1.,  5., 17.,
        4., 19.,  6., 12., 25., 27., 21., 22.,  8., 29., 24.,  9., 16.,
       18., 23., 13., 26., 11.])

In [None]:
#17 DiffWalk (original name was DIFFWALK)
# 1 is already yes; recode 2 (no) to 0.
# Then drop rows coded 7 (don't know / not sure) or 9 (refused).
# NOTE(review): these codes come from the raw BRFSS coding; this file appears to be recoded
# to 0/1 already, so the steps below are likely no-ops here — confirm.
brfss_df_selected['DiffWalk'] = brfss_df_selected['DiffWalk'].replace({2:0})
brfss_df_selected = brfss_df_selected[brfss_df_selected.DiffWalk != 7]
brfss_df_selected = brfss_df_selected[brfss_df_selected.DiffWalk != 9]
brfss_df_selected.DiffWalk.unique()

array([1., 0.])

In [None]:
#18 Sex (original name was SEX)
# Encoded as "is the respondent male": recode 2 (female) to 0; male stays 1.
# The original note called this choice somewhat arbitrary, made because men are at higher
# risk for heart disease.
# NOTE(review): this file appears to be recoded to 0/1 already, so the replace is likely
# a no-op here — confirm.
brfss_df_selected['Sex'] = brfss_df_selected['Sex'].replace({2:0})
brfss_df_selected.Sex.unique()

array([0., 1.])

In [None]:
#19 Age (original name was _AGEG5YR)
# Already ordinal in 5-year increments: 1 is 18-24, all the way up to 13, which is 80 and older.
# Drop rows coded 14 (don't know or missing).
# NOTE(review): the saved output shows only the values 1-13, so on this file the filter
# is likely a no-op — confirm.
brfss_df_selected = brfss_df_selected[brfss_df_selected.Age != 14]
brfss_df_selected.Age.unique()

array([ 9.,  7., 11., 10.,  8., 13.,  4.,  6.,  2., 12.,  5.,  1.,  3.])

In [None]:
#20 Education (original name was EDUCA)
# Already an ordinal variable on a 1-6 scale: 1 is never attended school or kindergarten only,
# up to 6 for 4 years of college or more.
# Drop rows coded 9 (refused).
# NOTE(review): the saved output shows only the values 1-6, so on this file the filter
# is likely a no-op — confirm.
brfss_df_selected = brfss_df_selected[brfss_df_selected.Education != 9]
brfss_df_selected.Education.unique()

array([4., 6., 3., 5., 2., 1.])

In [None]:
#21 Income (original name was INCOME2)
# Already ordinal: 1 is less than $10,000, all the way up to 8 for $75,000 or more.
# Drop rows coded 77 (don't know) or 99 (refused).
# NOTE(review): the saved output shows only the values 1-8, so on this file the filters
# are likely no-ops — confirm.
brfss_df_selected = brfss_df_selected[brfss_df_selected.Income != 77]
brfss_df_selected = brfss_df_selected[brfss_df_selected.Income != 99]
brfss_df_selected.Income.unique()

array([3., 1., 8., 6., 4., 7., 2., 5.])

In [None]:
# Check the shape after all the recode/filter cells: 253,680 rows and 22 columns
# (one of which is the dependent variable). The row count equals the count at load time,
# i.e. the filters above removed no rows.
brfss_df_selected.shape

(253680, 22)

In [None]:
# Preview the data after the value modifications above.
brfss_df_selected.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Check the class sizes of the target column (Diabetes_binary).
brfss_df_selected.groupby(['Diabetes_binary']).size()

Unnamed: 0_level_0,0
Diabetes_binary,Unnamed: 1_level_1
0.0,218334
1.0,35346


In [None]:
# Rename the target column for the exported dataset.
# Only this one entry is needed: brfss_df_selected was built by selecting the readable column
# names (HighBP, HighChol, ...), so raw BRFSS names such as _RFHYPE5 or TOLDHI2 never occur in
# it, and `rename` silently ignores keys that match no column.
# NOTE(review): the column is binary in the saved outputs although the new name says "012" —
# confirm the name is intended.
brfss = brfss_df_selected.rename(columns={'Diabetes_binary': 'Diabetes_012'})

In [None]:
# Preview the renamed frame (target column is now Diabetes_012).
brfss.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [None]:
# Shape of the renamed frame; renaming does not change the row or column count.
brfss.shape

(253680, 22)

In [None]:
# Count respondents per target class. Note the class imbalance!
# Despite the 'Diabetes_012' name, the counts show only two classes (0.0 and 1.0);
# presumably 0 = no diabetes and 1 = prediabetes/diabetes -- confirm against the dataset description.
brfss.groupby('Diabetes_012').size()

Unnamed: 0_level_0,0
Diabetes_012,Unnamed: 1_level_1
0.0,218334
1.0,35346


In [None]:
# Write the renamed frame to a comma-separated file (the default separator)
# in the current working directory, leaving out the row index.
brfss.to_csv('diabetes_012_health_indicators_BRFSS2015.csv', index=False)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
display(brfss.describe())

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.139333,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,0.81142,0.056197,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.346294,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,0.391175,0.230302,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0


In [None]:
# Pairwise Pearson correlation (the pandas default method) between all columns;
# the 'Diabetes_012' row shows how each feature correlates with the target.
display(brfss.corr(method='pearson'))

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
Diabetes_012,1.0,0.263129,0.200276,0.064761,0.216843,0.060789,0.105816,0.177282,-0.118133,-0.040779,-0.056584,-0.057056,0.016255,0.031433,0.293569,0.069315,0.171337,0.218344,0.03143,0.177442,-0.124456,-0.163919
HighBP,0.263129,1.0,0.298199,0.098508,0.213748,0.096991,0.129575,0.209361,-0.125267,-0.040555,-0.061266,-0.003972,0.038425,0.017358,0.30053,0.056456,0.161212,0.223618,0.052207,0.344452,-0.141358,-0.171235
HighChol,0.200276,0.298199,1.0,0.085642,0.106722,0.091299,0.09262,0.180765,-0.078046,-0.040859,-0.039874,-0.011543,0.04223,0.01331,0.208426,0.062069,0.121751,0.144672,0.031205,0.272318,-0.070802,-0.085459
CholCheck,0.064761,0.098508,0.085642,1.0,0.034495,-0.009929,0.024158,0.044206,0.00419,0.023849,0.006121,-0.02373,0.117626,-0.058255,0.046589,-0.008366,0.031775,0.040585,-0.022115,0.090321,0.00151,0.014259
BMI,0.216843,0.213748,0.106722,0.034495,1.0,0.013804,0.020153,0.052904,-0.147294,-0.087518,-0.062275,-0.048736,-0.018471,0.058206,0.239185,0.08531,0.121141,0.197078,0.04295,-0.036618,-0.103932,-0.100069
Smoker,0.060789,0.096991,0.091299,-0.009929,0.013804,1.0,0.061173,0.114441,-0.087401,-0.077666,-0.030678,0.101619,-0.023251,0.048946,0.163143,0.092196,0.11646,0.122463,0.093662,0.120641,-0.161955,-0.123937
Stroke,0.105816,0.129575,0.09262,0.024158,0.020153,0.061173,1.0,0.203002,-0.069151,-0.013389,-0.041124,-0.01695,0.008776,0.034804,0.177942,0.070172,0.148944,0.176567,0.002978,0.126974,-0.076009,-0.128599
HeartDiseaseorAttack,0.177282,0.209361,0.180765,0.044206,0.052904,0.114441,0.203002,1.0,-0.087299,-0.01979,-0.039167,-0.028991,0.018734,0.031,0.258383,0.064621,0.181698,0.212709,0.086096,0.221618,-0.0996,-0.141011
PhysActivity,-0.118133,-0.125267,-0.078046,0.00419,-0.147294,-0.087401,-0.069151,-0.087299,1.0,0.142756,0.15315,0.012392,0.035505,-0.061638,-0.266186,-0.125587,-0.21923,-0.253174,0.032482,-0.092511,0.199658,0.198539
Fruits,-0.040779,-0.040555,-0.040859,0.023849,-0.087518,-0.077666,-0.013389,-0.01979,0.142756,1.0,0.254342,-0.035288,0.031544,-0.044243,-0.103854,-0.068217,-0.044633,-0.048352,-0.091175,0.064547,0.110187,0.079929
