# Binary Prediction of Smoker Status Using Bio Signals 

## Exploratory Data Analysis

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

# NOTE(review): with no category/module argument this hides every warning raised
# after this cell runs (deprecation warnings included) — consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Folder that holds the CSV files. It is defined once so the notebook can be
# pointed at a different copy of the data by editing a single line.
DATA_DIR = r'D:/Data Science Project/1.2.3 Challenge/Binary-Prediction-of-Smoker-Status-Using-Bio-Signals/Data'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# Second labelled dataset, read from train_dataset.csv.
train_original = pd.read_csv(f'{DATA_DIR}/train_dataset.csv')

In [23]:
def check_data_quality(data):
    """
    DESC
    Print a quick data-quality report for a DataFrame. The report has four
    sections, each preceded by a "-------" separator: column data types and
    non-null counts, missing values, number of duplicated rows, and the
    transposed output of DataFrame.describe().

    PARAMETER
    data = pandas DataFrame used for the data quality check

    RETURN
    None. The report is only written to standard output.
    """

    print("-------")
    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() (that would add a stray "None" line).
    data.info() # Check data types

    print("-------")
    missing_per_column = data.isnull().sum() # Check missing values
    if missing_per_column.sum() == 0:
        print("Data Was Clean From Missing Values")
    else :
        print(missing_per_column)

    print("-------")
    print(f"Number of Duplicates : {data.duplicated().sum()} rows") # Check Number of Duplicates

    print("-------")
    print(data.describe().T)

In [24]:
# Data-quality report for the training set: dtypes, missing values, duplicates, describe()
check_data_quality(train)

-------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL      

In [25]:
# Data-quality report for the test set: dtypes, missing values, duplicates, describe()
check_data_quality(test)

-------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106171 entries, 0 to 106170
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   106171 non-null  int64  
 1   age                  106171 non-null  int64  
 2   height(cm)           106171 non-null  int64  
 3   weight(kg)           106171 non-null  int64  
 4   waist(cm)            106171 non-null  float64
 5   eyesight(left)       106171 non-null  float64
 6   eyesight(right)      106171 non-null  float64
 7   hearing(left)        106171 non-null  int64  
 8   hearing(right)       106171 non-null  int64  
 9   systolic             106171 non-null  int64  
 10  relaxation           106171 non-null  int64  
 11  fasting blood sugar  106171 non-null  int64  
 12  Cholesterol          106171 non-null  int64  
 13  triglyceride         106171 non-null  int64  
 14  HDL                  106171 non-null  int64  
 15  LDL      

In [26]:
# Data-quality report for train_original (the frame read from train_dataset.csv)
check_data_quality(train_original)

-------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38984 entries, 0 to 38983
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  38984 non-null  int64  
 1   height(cm)           38984 non-null  int64  
 2   weight(kg)           38984 non-null  int64  
 3   waist(cm)            38984 non-null  float64
 4   eyesight(left)       38984 non-null  float64
 5   eyesight(right)      38984 non-null  float64
 6   hearing(left)        38984 non-null  int64  
 7   hearing(right)       38984 non-null  int64  
 8   systolic             38984 non-null  int64  
 9   relaxation           38984 non-null  int64  
 10  fasting blood sugar  38984 non-null  int64  
 11  Cholesterol          38984 non-null  int64  
 12  triglyceride         38984 non-null  int64  
 13  HDL                  38984 non-null  int64  
 14  LDL                  38984 non-null  int64  
 15  hemoglobin           38984 n

In [52]:
# Class balance of the target column 'smoking' in each labelled dataset,
# shown as relative frequencies.
datas = [train, train_original]
for labelled_frame in datas:
    target_share = labelled_frame['smoking'].value_counts(normalize=True)
    print(target_share)

0    0.562635
1    0.437365
Name: smoking, dtype: float64
0    0.632721
1    0.367279
Name: smoking, dtype: float64


In [53]:
# Treat Outlier Values : Removing them on Train
from sklearn.ensemble import IsolationForest

# `id` is only a row identifier, so it is left out of the matrix the detector
# is fitted on; otherwise its value would influence which rows look anomalous.
# NOTE(review): the target column 'smoking' is still part of the fitted matrix —
# confirm that is intended.
clf = IsolationForest(max_samples = 100, random_state=42)
y_pred = clf.fit_predict(train.drop(columns=['id']))

# fit_predict labels inliers +1 and outliers -1; keep the inlier rows
# (all original columns, `id` included).
train_cleaned = train[y_pred == 1]

In [57]:
# Outlier treatment for train_original: fit an IsolationForest on the frame
# and keep only the rows it labels as inliers (+1).
clf = IsolationForest(random_state=42, max_samples=100)
y_pred = clf.fit_predict(train_original)
inlier_mask = (y_pred == 1)
train_original_cleaned = train_original.loc[inlier_mask]

## Feature Engineering

## Data Preprocessing