In [1]:
from framingham_score import *
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np
import os

In [2]:
def preprocess_data(df, handle_missings='median'):
    categorical_columns = ['is_smoking', "BPMeds", 'prevalentStroke', 'prevalentHyp', 'diabetes', 'sex', 'education']
    df.loc[df['is_smoking'] == "YES", 'is_smoking'] = 1
    df.loc[df['is_smoking'] == "NO", 'is_smoking'] = 0
    df.loc[df['sex'] == "M", 'sex'] = 1
    df.loc[df['sex'] == "F", 'sex'] = 0
    df["is_smoking"] = df["is_smoking"].astype(int)
    df["sex"] = df["sex"].astype(int)

    if handle_missings == 'median':
        for col in df.columns:
            if col not in categorical_columns:
                mean_col = df[col].median()
                df[col] = df[col].fillna(mean_col)
            else:
                mean_col = np.argmax(df[col])
                df[col] = df[col].fillna(mean_col)
    elif handle_missings == 'knn':
        for col in df.columns:
            if col not in categorical_columns:
                mean_col = df[col].median()
                df[col] = df[col].fillna(mean_col)

        imputator = KNNImputer(n_neighbors=5)
        df = pd.DataFrame(imputator.fit_transform(df), columns=df.columns)

    # Column to see if patient was in Hypertension during the sampling.
    df.insert(len(df.columns) - 1, 'isHyp', value=0)
    df.loc[(df['sysBP'] >= 130) | (df['diaBP'] >= 80), 'isHyp'] = 1

    # creating categorical column for packs of cigarettes.
    df.insert(len(df.columns) - 1, 'packsOfCigs', value=0)
    df.loc[df['cigsPerDay'] > 0, 'packsOfCigs'] = 1
    df.loc[df['cigsPerDay'] >= 10, 'packsOfCigs'] = 2
    df.loc[df['cigsPerDay'] >= 20, 'packsOfCigs'] = 3
    df.loc[df['cigsPerDay'] >= 30, 'packsOfCigs'] = 4

    # creating categorical column for glucose levels.
    df.insert(len(df.columns) - 1, 'glucose_level', value=0)
    df.loc[df['glucose'] > 2.6 * 18, 'glucose_level'] = 1
    df.loc[df['glucose'] > 4.7 * 18, 'glucose_level'] = 2
    df.loc[df['glucose'] > 6.3 * 18, 'glucose_level'] = 3
    df.loc[df['glucose'] > 8.5 * 18, 'glucose_level'] = 4

    # Score2 diabetes score (Framingham Risk Score)
    df.insert(len(df.columns) - 1, 'diabetes_score2', value=0)
    df['diabetes_score2'] = df.apply(lambda x: calculate_framingham_score(
        x['age'], x['sex'], x['is_smoking'], x['cigsPerDay'], x['BPMeds'], x['prevalentStroke'], x['prevalentHyp'],
        x['diabetes'],
        x['totChol'], x['sysBP'], x['diaBP'], x['BMI'], x['heartRate'], x['glucose'], x['education']), axis=1)

    return df.set_index('id')


def remove_outliers_iqr(df, columns, threshold=1.5):
    # Calculate the IQR for each column
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1

    # Define lower and upper bounds to identify outliers
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    # Filter rows without outliers
    mask = ~((df[columns] < lower_bound) | (df[columns] > upper_bound)).any(axis=1)
    return df[mask]


# Load Data
data_dir = r"C:\Users\soldier109\PycharmProjects\Moshal Medicine Hackaton 2023\Datasets\Cardiovascular Study Dataset"
trainset = preprocess_data(pd.read_csv(data_dir + "\\train.csv"), handle_missings='median')

In [None]:
from run_ce_optimization import *
immutable_features = ['sex', 'age', 'education']
positive_features = ["cigsPerDay","totChol",'sysBP','diaBP','BMI','heartRate','glucose',"packsOfCigs",'glucose_level']
binary_features = ['is_smoking', "BPMeds",'prevalentStroke','prevalentHyp','diabetes','isHyp']
integers_features = []
larger_than_features = []
exclude = []
# exclude = ['rf']
norm = "l2"
target_column = 'TenYearCHD'
res, sol = run_experiment(trainset, target_column, dataset_name="CHD_prediction_"+norm, norm=norm,
                          exclude_models=exclude,
                          immutable_features=immutable_features,
                          positive_features=positive_features,
                          binary_features=binary_features, integers_features=integers_features,
                          larger_than_features=larger_than_features,
                          num_instances=3, time_limit=120, rho=0.05, trees_max_depth=, trees_n_est=)

  0%|          | 0/2 [00:00<?, ?it/s]

#### Model: rf	Specs: 100	Sample number: 1/3 ####
rf tables saved.

### Starting the RANDOM FOREST iterative approach ###
time limit: 1200


------------------------ Iteration: 0 ------------------------
Optimizing the master problem...
solution master [0.8421052631578947, 0.3333333333333333, 0.0, 1.0, 0.04285714285714286, 0.0, 0.0, 0.0, 0.0, 0.1935483870967742, 0.31324876976964333, 0.38624338805675507, 0.23065621939275222, 0.4591836734693877, 0.23306084556338647, 1.0, 0.25, 0.25, 1.0000099105930076] generated in  43.8 s
--> Distance to the factual instance: 0.13365164011409586
--> Distance to the border: 0.0
Optimizing the adversarial problem...
Set parameter Username
Academic license - for non-commercial use only - expires 2024-08-31
Set parameter PoolSearchMode to value 1
Status: optimal
solution adv problem [0.8421052631578947, 0.3333333333333333, 0.0, 1.0, 0.06783666060294316, 0.0, 0.0, 0.0, 0.0, 0.2098328763628813, 0.3049696034261935, 0.35889293308917614, 0.2460830212340823, 0.45

 50%|█████     | 1/2 [55:43<55:43, 3343.22s/it]

1e-05


@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ ERROR @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


#### Model: gbm	Specs: 100	Sample number: 1/3 ####
gbm tables saved.

### Starting the GRADIENT BOOSTING iterative approach ###
time limit: 1200


------------------------ Iteration: 0 ------------------------
Optimizing the master problem...
solution master [0.8421052631578947, 0.3333333333333333, 0.0, 1.0, 0.04285714285714286, 0.0, 0.0, 0.0, 0.0, 0.18590832501649857, 0.3049645390070922, 0.38624338805675507, 0.22869735956192017, 0.45408162474632263, 0.11016949266195297, 1.0, 0.25, 0.25000000000000006, 0.9999990463256836] generated in  37.2 s
--> Distance to the factual instance: 0.02281683173234112


In [None]:
df_output_path = r"C:\Users\soldier109\PycharmProjects\Moshal Medicine Hackaton 2023\results_CHD_prediction_"+ norm + r"\df outputs"
os.makedirs(df_output_path, exist_ok=True)
sol.to_csv(df_output_path+ "\\sol.csv", index=False)
res.to_csv(df_output_path + "\\res.csv", index=False)