# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

pd.options.display.max_columns = 50

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Functions

## Data Read and Preprocessing Function

In [3]:
from sklearn.preprocessing import LabelEncoder

def preprocess_df(file_path):

    file_path = 'data.csv'
    df = pd.read_csv(file_path)
    df = df.drop_duplicates()

    columns_to_exclude_max = ['sight_left', 'sight_right', 'SGOT_AST', 'gamma_GTP']
    for column in columns_to_exclude_max:
        df = df[df[column] != df[column].max()]

    sus_columns = ['waistline', 'BLDS', 'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'serum_creatinine', 'SGOT_AST',
                   'SGOT_ALT', 'gamma_GTP', 'hemoglobin']

    # Initialize a mask to select all rows initially
    mask = pd.Series([True] * df.shape[0])

    for column in sus_columns:
        # Calculate Q1 (25th percentile) and Q3 (75th percentile)
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        # Define the bounds for the outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Update the mask to identify rows that are not outliers in the current column
        mask = mask & (df[column] >= lower_bound) & (df[column] <= upper_bound)

    # Apply the mask to filter out the outliers
    df = df[mask]

    df['BMI'] = df['weight'] / ((df['height'] / 100) ** 2)
    df['BMI_Category'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, float('inf')], labels=['0', '1', '2', '3'])
    df['MAP'] = df['DBP'] + (df['SBP'] - df['DBP']) / 3
    df['Liver_Enzyme_Ratio'] = df['SGOT_AST'] / df['SGOT_ALT']
    df['Anemia_Indicator'] = (df['hemoglobin'] < 12).astype(int)

    smoker_type_mapping = {1.0: 'Non-Smoker', 2.0: 'Former Smoker', 3.0: 'Current Smoker'}
    df['Smoker Type'] = df['SMK_stat_type_cd'].map(smoker_type_mapping)
    
    label_encoder = LabelEncoder()
    categorical_columns = ['sex','DRK_YN']  #1:male, 1:Y

    # Apply label encoding to each categorical column

    for column in categorical_columns:
        df[column] = label_encoder.fit_transform(df[column])
    columns_to_convert = ['sex','DRK_YN', 'SMK_stat_type_cd', 'urine_protein', 'hear_left', 'hear_right', 'Anemia_Indicator']
    df[columns_to_convert] = df[columns_to_convert].astype('int')
    df[columns_to_convert] = df[columns_to_convert].astype('category')

    return df

## Scaling Function

**used after EDA**

In [25]:
def encode_and_scale(df):

    df['prev_smoker'] = np.where(df['SMK_stat_type_cd'] == 2, 1, 0) #1 for previous smoker
    df['prev_smoker'] = df['prev_smoker'].astype('category')
    cols_to_drop = ["Smoker Type",
                         "BMI_Category",
                         "DRK_YN"
                         ]
    df.drop(columns=cols_to_drop, inplace=True)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    columns_to_convert = df.select_dtypes(include=['category']).columns
    df[columns_to_convert] = df[columns_to_convert].astype('int')

    return df

## Get data
**ready for modeling**

In [7]:
def get_data(filepath):
    return encode_and_scale(preprocess_df(filepath))

## Split Function

In [29]:
def split_train_test(df, y, test_size = 0.2):

    X = df.drop(columns=["prev_smoker", "SMK_stat_type_cd"])
    y = df[y]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test

# Example Use

In [5]:
df = preprocess_df("data.csv")
df.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN,BMI,BMI_Category,MAP,Liver_Enzyme_Ratio,Anemia_Indicator,Smoker Type
0,1,35,170,75,90.0,1.0,1.0,1,1,120.0,80.0,99.0,193.0,48.0,126.0,92.0,17.1,1,1.0,21.0,35.0,40.0,1,1,25.951557,2,93.333333,0.6,0,Non-Smoker
1,1,30,180,80,89.0,0.9,1.2,1,1,130.0,82.0,106.0,228.0,55.0,148.0,121.0,15.8,1,0.9,20.0,36.0,27.0,3,0,24.691358,1,98.0,0.555556,0,Current Smoker
3,1,50,175,80,91.0,1.5,1.2,1,1,145.0,87.0,95.0,201.0,76.0,104.0,106.0,17.6,1,1.1,29.0,34.0,18.0,1,0,26.122449,2,106.333333,0.852941,0,Non-Smoker
4,1,50,165,60,80.0,1.0,1.2,1,1,138.0,82.0,101.0,199.0,61.0,117.0,104.0,13.8,1,0.8,19.0,12.0,25.0,1,0,22.038567,1,100.666667,1.583333,0,Non-Smoker
5,1,50,165,55,75.0,1.2,1.5,1,1,142.0,92.0,99.0,218.0,77.0,95.0,232.0,13.8,3,0.8,29.0,40.0,37.0,3,1,20.20202,1,108.666667,0.725,0,Current Smoker


In [6]:
df_enc = encode_and_scale(df)
df_enc.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,BMI,MAP,Liver_Enzyme_Ratio,Anemia_Indicator,prev_smoker
0,1,-0.834105,0.900772,1.168505,1.151795,0.128398,0.13757,1,1,-0.063531,0.512956,0.350544,0.009549,-0.718717,0.413044,-0.320616,2.073138,1,0.870142,-0.232731,1.734657,1.184764,0.764943,0.287091,-1.460062,0,0
1,1,-1.181349,1.982749,1.602218,1.040952,-0.164544,0.726518,1,1,0.64148,0.721198,0.971868,1.033334,-0.201872,1.114171,0.222648,1.175638,1,0.34221,-0.40655,1.850353,0.215221,0.381293,0.736072,-1.560599,0,0
3,1,0.207626,1.441761,1.602218,1.262639,1.593107,0.726518,1,1,1.698997,1.241802,-0.004498,0.243557,1.348664,-0.288083,-0.058351,2.41833,1,1.398073,1.157817,1.618961,-0.456,0.816968,1.537825,-0.887892,0,0
4,1,0.207626,0.359783,-0.132631,0.043365,0.128398,0.726518,1,1,1.205489,0.721198,0.528065,0.185055,0.241138,0.126219,-0.095817,-0.20513,1,-0.185721,-0.580368,-0.926356,0.066061,-0.426312,0.992633,0.764306,0,0
5,1,0.207626,0.359783,-0.566343,-0.510851,0.714281,1.60994,1,1,1.487494,1.762406,0.350544,0.740824,1.422499,-0.574908,2.302035,-0.20513,3,-0.185721,1.157817,2.313138,0.961023,-0.985423,1.762316,-1.177304,0,0


In [26]:
df = get_data('data.csv')

In [28]:
df['SMK_stat_type_cd'].value_counts()

SMK_stat_type_cd
1    476506
3    132949
2    115321
Name: count, dtype: int64

In [22]:
df['prev_smoker'].value_counts()

prev_smoker
0    609455
1    115321
Name: count, dtype: int64

## Binary class prediction

In [30]:
X_train, X_test, y_train, y_test = split_train_test(df, y = 'prev_smoker', test_size=0.2)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.8389097381274317
              precision    recall  f1-score   support

           0       0.84      0.99      0.91    121723
           1       0.47      0.04      0.07     23233

    accuracy                           0.84    144956
   macro avg       0.66      0.51      0.49    144956
weighted avg       0.78      0.84      0.78    144956



## Multiclass prediction

In [32]:
X_train, X_test, y_train, y_test = split_train_test(df, 'SMK_stat_type_cd', test_size=0.2)

In [34]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Overall Accuracy: {accuracy:.4f}\n')
print(classification_report(y_test, y_pred))

Overall Accuracy: 0.7082

              precision    recall  f1-score   support

           1       0.81      0.86      0.84     94926
           2       0.45      0.30      0.36     23233
           3       0.48      0.53      0.50     26797

    accuracy                           0.71    144956
   macro avg       0.58      0.56      0.57    144956
weighted avg       0.69      0.71      0.70    144956

