# Import Libraries

In [21]:
import pandas as pd
import numpy as np
# Bind `plt` to the pyplot interface: plotting helpers such as plt.plot and
# plt.show live in matplotlib.pyplot, not in the top-level matplotlib package.
import matplotlib.pyplot as plt
from scipy.stats import zscore


In [4]:
# Read the CSV into a DataFrame, then preview its first rows.
csv_path = "data/smoking_driking_dataset.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,...,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN
0,Male,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,...,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,Y
1,Male,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,...,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,N
2,Male,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,...,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,N
3,Male,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,...,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,N
4,Male,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,...,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,N


In [7]:
# Dimensions of the frame as a (rows, columns) tuple.
data.shape

(991346, 24)

In [19]:
# Print the index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 991320 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991320 non-null  object 
 1   age               991320 non-null  int64  
 2   height            991320 non-null  int64  
 3   weight            991320 non-null  int64  
 4   waistline         991320 non-null  float64
 5   sight_left        991320 non-null  float64
 6   sight_right       991320 non-null  float64
 7   hear_left         991320 non-null  float64
 8   hear_right        991320 non-null  float64
 9   SBP               991320 non-null  float64
 10  DBP               991320 non-null  float64
 11  BLDS              991320 non-null  float64
 12  tot_chole         991320 non-null  float64
 13  HDL_chole         991320 non-null  float64
 14  LDL_chole         991320 non-null  float64
 15  triglyceride      991320 non-null  float64
 16  hemoglobin        99

# Data Cleaning & Preprocessing

In [None]:
# Count missing values per column (a column of zeros means nothing is missing).
missing_mask = data.isna()
missing_mask.sum()

sex                 0
age                 0
height              0
weight              0
waistline           0
sight_left          0
sight_right         0
hear_left           0
hear_right          0
SBP                 0
DBP                 0
BLDS                0
tot_chole           0
HDL_chole           0
LDL_chole           0
triglyceride        0
hemoglobin          0
urine_protein       0
serum_creatinine    0
SGOT_AST            0
SGOT_ALT            0
gamma_GTP           0
SMK_stat_type_cd    0
DRK_YN              0
dtype: int64

In [None]:
# Count rows that exactly repeat an earlier row; the first occurrence of each
# repeated row is not flagged.
duplicate_mask = data.duplicated()
duplicate_mask.sum()

26

In [None]:
# Remove duplicate rows, keeping the first occurrence of each.
# Rebinding `data` instead of using inplace=True avoids mutating the loaded
# frame in place; the original index labels are kept (no reset).
data = data.drop_duplicates()

# Sanity check: should now display 0.
data.duplicated().sum()

0

In [None]:
# Step 3: Outlier Detection and Removal
# Drop every row in which at least one continuous measurement lies 3 or more
# standard deviations from its column mean (rows are removed, not capped).
Z_THRESHOLD = 3

numerical_cols = data.select_dtypes(include=[np.number]).columns

# Integer-coded categorical columns are kept out of the filter: a z-score is
# not meaningful for category codes, and a rare code gets discarded wholesale
# (hear_left == 2 sits roughly 5.5 SD above the column mean, so filtering on it
# removes every such row).
# NOTE(review): hear_left/hear_right only take the values 1 and 2 and
# SMK_stat_type_cd is the 1/2/3 smoking-status code; urine_protein is assumed
# to be an ordinal 1-6 grade -- confirm against the dataset's data dictionary.
categorical_code_cols = ['hear_left', 'hear_right', 'urine_protein', 'SMK_stat_type_cd']
continuous_cols = [col for col in numerical_cols if col not in categorical_code_cols]

# z_scores still covers all numeric columns; only the continuous ones take
# part in the row filter.
z_scores = data[numerical_cols].apply(zscore)
within_threshold = (z_scores[continuous_cols].abs() < Z_THRESHOLD).all(axis=1)

# .copy() gives df its own data, so later column assignments on df do not
# raise SettingWithCopyWarning.
df = data[within_threshold].copy()

In [None]:
# Step 4: Encode Target Variables
# SMK_stat_type_cd (Smoking Status): 1=never, 2=quit, 3=still smoke (already numeric, left as is)
# DRK_YN (Drinker or Not): Y=1, N=0
drk_yn_codes = {'Y': 1, 'N': 0}

# Values that are already 0/1 pass through unchanged, so re-running this cell
# does not turn the column into NaN the way a plain .map(dict) would.
drk_yn_encoded = df['DRK_YN'].map(lambda label: drk_yn_codes.get(label, label))
assert drk_yn_encoded.isin([0, 1]).all(), "unexpected DRK_YN label"

# assign() builds a new frame instead of writing into what may be a slice of
# `data`, which avoids SettingWithCopyWarning.
df = df.assign(DRK_YN=drk_yn_encoded)

# Exploratory Data Analysis

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): this describes `data`, not the outlier-filtered `df` -- confirm that is intended.
data.describe()

Unnamed: 0,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,...,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd
count,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,...,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0,991320.0
mean,47.614529,162.240563,63.283884,81.233255,0.980833,0.978428,1.031495,1.030476,122.43236,76.052549,...,56.936984,113.037429,132.14003,14.22981,1.094221,0.860467,25.989424,25.755148,37.136152,1.608112
std,14.181346,9.282922,12.514101,11.850296,0.605954,0.604779,0.174652,0.171892,14.543083,9.889334,...,17.238578,35.842938,102.194762,1.584924,0.437719,0.480536,23.493668,26.30891,50.423811,0.818504
min,20.0,130.0,25.0,8.0,0.1,0.1,1.0,1.0,67.0,32.0,...,1.0,1.0,1.0,1.0,1.0,0.1,1.0,1.0,1.0,1.0
25%,35.0,155.0,55.0,74.1,0.7,0.7,1.0,1.0,112.0,70.0,...,46.0,89.0,73.0,13.2,1.0,0.7,19.0,15.0,16.0,1.0
50%,45.0,160.0,60.0,81.0,1.0,1.0,1.0,1.0,120.0,76.0,...,55.0,111.0,106.0,14.3,1.0,0.8,23.0,20.0,23.0,1.0
75%,60.0,170.0,70.0,87.8,1.2,1.2,1.0,1.0,131.0,82.0,...,66.0,135.0,159.0,15.4,1.0,1.0,28.0,29.0,39.0,2.0
max,85.0,190.0,140.0,999.0,9.9,9.9,2.0,2.0,273.0,185.0,...,8110.0,5119.0,9490.0,25.0,6.0,98.0,9999.0,7210.0,999.0,3.0
