In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

pd.options.display.max_columns = 50

import tensorflow as tf
from tensorflow.keras import layers

print(tf.__version__)
print(tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
from scipy.stats import randint
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report


2.15.0
/device:GPU:0
Num GPUs Available:  1


In [3]:
;!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


# Functions

In [9]:
def preprocess_df(file_path):

    file_path = 'data.csv'
    df = pd.read_csv(file_path)
    df = df.drop_duplicates()

    columns_to_exclude_max = ['sight_left', 'sight_right', 'SGOT_AST', 'gamma_GTP']
    for column in columns_to_exclude_max:
        df = df[df[column] != df[column].max()]

    sus_columns = ['waistline', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'serum_creatinine', 'SGOT_AST',
                   'SGOT_ALT', 'gamma_GTP', 'hemoglobin']

    # Initialize a mask to select all rows initially
    mask = pd.Series([True] * df.shape[0])

    for column in sus_columns:
        # Calculate Q1 (25th percentile) and Q3 (75th percentile)
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        # Define the bounds for the outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Update the mask to identify rows that are not outliers in the current column
        mask = mask & (df[column] >= lower_bound) & (df[column] <= upper_bound)

    # Apply the mask to filter out the outliers
    df = df[mask]

    df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)
    df['BMI_Category'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, float('inf')], labels=['0', '1', '2', '3'])
    df['MAP'] = df['DBP'] + (df['SBP'] - df['DBP']) / 3
    df['Liver_Enzyme_Ratio'] = df['SGOT_AST'] / df['SGOT_ALT']
    df['Anemia_Indicator'] = (df['hemoglobin'] < 12).astype(int)

    smoker_type_mapping = {1.0: 'Non-Smoker', 2.0: 'Former Smoker', 3.0: 'Current Smoker'}
    df['Smoker Type'] = df['SMK_stat_type_cd'].map(smoker_type_mapping)

    label_encoder = LabelEncoder()
    categorical_columns = ['sex','DRK_YN']  #1:male, 1:Y

    # Apply label encoding to each categorical column

    for column in categorical_columns:
        df[column] = label_encoder.fit_transform(df[column])
    columns_to_convert = ['sex','DRK_YN', 'SMK_stat_type_cd', 'urine_protein', 'hear_left', 'hear_right', 'Anemia_Indicator']
    df[columns_to_convert] = df[columns_to_convert].astype('int')
    df[columns_to_convert] = df[columns_to_convert].astype('category')

    return df





def encode_and_scale(df):

    df['prev_smoker'] = np.where(df['SMK_stat_type_cd'] == 2, 1, 0) #1 for previous smoker
    df['prev_smoker'] = df['prev_smoker'].astype('category')
    cols_to_drop = ["Smoker Type",
                         "BMI_Category",
                         "DRK_YN"
                         ]
    df.drop(columns=cols_to_drop, inplace=True)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    columns_to_convert = df.select_dtypes(include=['category']).columns
    df[columns_to_convert] = df[columns_to_convert].astype('int')

    return df


def get_data(filepath):
    return encode_and_scale(preprocess_df(filepath))



def split_train_test(df, y, test_size = 0.2):

    X = df.drop(columns=["prev_smoker", "SMK_stat_type_cd"])
    y = df[y]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test

# Get Data

In [10]:
df = get_data('data.csv')

df.head()

  df = df[mask]


Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,BMI,MAP,Liver_Enzyme_Ratio,Anemia_Indicator,prev_smoker
0,1,-0.834105,0.900772,1.168505,1.151795,0.128398,0.13757,1,1,-0.063531,0.512956,0.350544,0.009549,-0.718717,0.413044,-0.320616,2.073138,1,0.870142,-0.232731,1.734657,1.184764,1,0.764943,0.287091,-1.460062,0,0
1,1,-1.181349,1.982749,1.602218,1.040952,-0.164544,0.726518,1,1,0.64148,0.721198,0.971868,1.033334,-0.201872,1.114171,0.222648,1.175638,1,0.34221,-0.40655,1.850353,0.215221,3,0.381293,0.736072,-1.560599,0,0
3,1,0.207626,1.441761,1.602218,1.262639,1.593107,0.726518,1,1,1.698997,1.241802,-0.004498,0.243557,1.348664,-0.288083,-0.058351,2.41833,1,1.398073,1.157817,1.618961,-0.456,1,0.816968,1.537825,-0.887892,0,0
4,1,0.207626,0.359783,-0.132631,0.043365,0.128398,0.726518,1,1,1.205489,0.721198,0.528065,0.185055,0.241138,0.126219,-0.095817,-0.20513,1,-0.185721,-0.580368,-0.926356,0.066061,1,-0.426312,0.992633,0.764306,0,0
5,1,0.207626,0.359783,-0.566343,-0.510851,0.714281,1.60994,1,1,1.487494,1.762406,0.350544,0.740824,1.422499,-0.574908,2.302035,-0.20513,3,-0.185721,1.157817,2.313138,0.961023,3,-0.985423,1.762316,-1.177304,0,0


In [11]:
X_train, X_test, y_train, y_test = split_train_test(df, y = 'prev_smoker', test_size=0.2)

# Modeling

## Neural Network

In [8]:
model = tf.keras.Sequential(
    [
        layers.Dense(units=16, activation="relu", input_shape=(X_train.shape[-1],)),
        layers.Dropout(0.2),
        layers.BatchNormalization(),
        layers.Dense(units=8, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

learning_rate = 0.001

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss="binary_crossentropy",
              metrics=['accuracy', tf.keras.metrics.F1Score()]
             )

y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

model.fit(X_train, y_train,
          epochs=10,
          batch_size=32,
          validation_data=(X_test, y_test),
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b47a079d450>

## Catboost

In [7]:
# Define the CatBoostClassifier
catboost_classifier = CatBoostClassifier(task_type="GPU")

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'iterations': randint(10, 200),
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'depth': randint(1, 10),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 256],
    'thread_count': [2, 4, 8, 16],
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    catboost_classifier,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to try
    cv=5,  # Number of cross-validation folds
    scoring='f1_macro',  # Use F1 score as the evaluation metric
    random_state=42
)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)

# Print the best parameters and corresponding F1 score
print("Best Parameters: ", random_search.best_params_)
print("Best F1 Score: {:.2f}".format(random_search.best_score_))

# Evaluate the model on the test set
y_pred = random_search.predict(X_test)
test_f1_score = f1_score(y_test, y_pred, average='macro')
print("Test F1 Score: {:.2f}".format(test_f1_score))

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
2:	learn: 0.5602585	total: 37.3ms	remaining: 1.7s
3:	learn: 0.5283595	total: 65.6ms	remaining: 2.23s
4:	learn: 0.5010583	total: 101ms	remaining: 2.74s
5:	learn: 0.4779257	total: 121ms	remaining: 2.7s
6:	learn: 0.4582287	total: 150ms	remaining: 2.85s
7:	learn: 0.4415843	total: 171ms	remaining: 2.83s
8:	learn: 0.4274782	total: 220ms	remaining: 3.2s
9:	learn: 0.4151988	total: 251ms	remaining: 3.26s
10:	learn: 0.4066682	total: 272ms	remaining: 3.19s
11:	learn: 0.3973258	total: 296ms	remaining: 3.15s
12:	learn: 0.3894364	total: 323ms	remaining: 3.16s
13:	learn: 0.3824461	total: 378ms	remaining: 3.4s
14:	learn: 0.3765282	total: 430ms	remaining: 3.58s
15:	learn: 0.3714374	total: 461ms	remaining: 3.57s
16:	learn: 0.3669221	total: 492ms	remaining: 3.56s
17:	learn: 0.3629580	total: 521ms	remaining: 3.53s
18:	learn: 0.3595178	total: 553ms	remaining: 3.52s
19:	learn: 0.3564698	total: 602ms	remaining: 3.61s
20:	learn: 0.3538749	total: 656

In [None]:
# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Modeling with Undersampling

In [13]:
rus = RandomUnderSampler(random_state=42)

X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

In [8]:
y_train.value_counts()

0    487732
1     92088
Name: prev_smoker, dtype: int64

In [9]:
y_train_resampled.value_counts()

0    92088
1    92088
Name: prev_smoker, dtype: int64

## Neural Network

In [10]:
model = tf.keras.Sequential(
    [
        layers.Dense(units=16, activation="relu", input_shape=(X_train.shape[-1],)),
        layers.Dropout(0.2),
        layers.BatchNormalization(),
        layers.Dense(units=8, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

learning_rate = 0.001

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss="binary_crossentropy",
              metrics=['accuracy', tf.keras.metrics.F1Score()]
             )

y_train_resampled = y_train_resampled.astype(np.float32)
y_test = y_test.astype(np.float32)

model.fit(X_train_resampled, y_train_resampled,
          epochs=10,
          batch_size=32,
          validation_data=(X_test, y_test),
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b7d745af610>

## CatBoost

In [14]:
# Define the CatBoostClassifier
catboost_classifier = CatBoostClassifier(task_type="GPU")

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'iterations': randint(10, 200),
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
    'depth': randint(1, 10),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 256],
    'thread_count': [2, 4, 8, 16],
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    catboost_classifier,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to try
    cv=5,  # Number of cross-validation folds
    scoring='f1_macro',  # Use F1 score as the evaluation metric
    random_state=42
)

# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters and corresponding F1 score
print("Best Parameters: ", random_search.best_params_)
print("Best F1 Score: {:.2f}".format(random_search.best_score_))

# Evaluate the model on the test set
y_pred = random_search.predict(X_test)
test_f1_score = f1_score(y_test, y_pred, average='macro')
print("Test F1 Score: {:.2f}".format(test_f1_score))

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
3:	learn: 0.6181341	total: 34.6ms	remaining: 1.17s
4:	learn: 0.6043192	total: 42.8ms	remaining: 1.16s
5:	learn: 0.5921815	total: 51.5ms	remaining: 1.15s
6:	learn: 0.5813683	total: 60.2ms	remaining: 1.14s
7:	learn: 0.5718926	total: 68.7ms	remaining: 1.13s
8:	learn: 0.5635852	total: 73.8ms	remaining: 1.07s
9:	learn: 0.5558494	total: 82.5ms	remaining: 1.07s
10:	learn: 0.5490378	total: 91.2ms	remaining: 1.07s
11:	learn: 0.5430138	total: 100ms	remaining: 1.07s
12:	learn: 0.5377009	total: 109ms	remaining: 1.06s
13:	learn: 0.5328027	total: 117ms	remaining: 1.05s
14:	learn: 0.5286200	total: 126ms	remaining: 1.05s
15:	learn: 0.5250547	total: 135ms	remaining: 1.04s
16:	learn: 0.5218214	total: 140ms	remaining: 1.01s
17:	learn: 0.5188665	total: 149ms	remaining: 1.01s
18:	learn: 0.5161426	total: 157ms	remaining: 1s
19:	learn: 0.5139658	total: 161ms	remaining: 969ms
20:	learn: 0.5116621	total: 170ms	remaining: 964ms
21:	learn: 0.5096868	to

In [17]:
# Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[82441 39282]
 [ 2836 20397]]
