In [14]:
import pandas as pd
from pathlib import Path
import plotly.express as pxv

In [16]:
# Read the two CSV splits stored under ../Resources into DataFrames.
train_csv = '../Resources/train_dataset.csv'
test_csv = '../Resources/test_dataset.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [17]:
# Report how many rows each split has and its share of the combined total.
n_train, n_test = len(df_train), len(df_test)
n_total = n_train + n_test
train_pct = round(n_train / n_total * 100, 2)
test_pct = round(n_test / n_total * 100, 2)
print(f"Train Dataset has {n_train} rows. And is {train_pct} % of the dataset\n"
      f"Test Dataset has {n_test} rows. And is {test_pct} % of the dataset")
      

Train Dataset has 38984 rows. And is 70.0 % of the dataset
Test Dataset has 16708 rows. And is 30.0 % of the dataset


In [18]:
# Show the column labels of the training frame.
df_train.columns

Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

In [19]:
# Show the column labels of the test frame, for comparison with the training frame.
df_test.columns

Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries'],
      dtype='object')

Useful columns likely exclude Height, Eyesight, and Hearing. The other features will require some amount of machine learning to determine whether or not they are important features for the machine learning algorithm.

In [20]:
# Keep only the columns considered useful: 'height(cm)', 'waist(cm)',
# 'eyesight(left/right)' and 'hearing(left/right)' are dropped.
# The feature list is defined once so the train and test selections
# cannot drift apart.
feature_columns = ['age', 'weight(kg)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries']
target_column = 'smoking'

# df_train keeps the features plus the target; df_test has all the same
# features except for 'smoking', which is the target.
df_train = df_train[feature_columns + [target_column]]
df_test = df_test[feature_columns]

The question is: if we build a supervised machine learning model, we don't have smoking information for the test dataset. So what do we do?

## Recommendation:
#### Disregard the 'test' dataset, use only the train dataset for the machine learning algorithm, and split it ourselves. Because every row of the train dataset has a 'smoking' label, this lets us train and evaluate a supervised machine learning model.

In [21]:
# Preview the first rows of the train and test frames.
display(df_train.head(), df_test.head())

Unnamed: 0,age,weight(kg),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,85,118,78,97,239,153,70,142,19.8,1,1.0,61,115,125,1,1
1,20,110,119,79,88,211,128,71,114,15.9,1,1.1,19,25,30,1,0
2,45,65,110,80,80,193,120,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,80,158,88,249,210,366,46,91,16.9,1,0.9,32,36,36,0,0
4,20,60,109,64,100,179,200,47,92,14.9,1,1.2,26,28,15,0,0


Unnamed: 0,age,weight(kg),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,40,65,120,70,102,225,260,41,132,15.7,1,0.8,24,26,32,0
1,45,75,100,67,96,258,345,49,140,15.7,1,1.1,26,28,138,0
2,30,90,115,72,88,177,103,53,103,13.5,1,1.0,19,29,30,0
3,60,50,118,78,86,187,70,65,108,14.1,1,1.3,31,28,33,0
4,30,65,110,70,87,190,210,45,103,14.7,1,0.8,21,21,19,0


In [22]:
# Summarise the training frame: row count, columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38984 entries, 0 to 38983
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  38984 non-null  int64  
 1   weight(kg)           38984 non-null  int64  
 2   systolic             38984 non-null  int64  
 3   relaxation           38984 non-null  int64  
 4   fasting blood sugar  38984 non-null  int64  
 5   Cholesterol          38984 non-null  int64  
 6   triglyceride         38984 non-null  int64  
 7   HDL                  38984 non-null  int64  
 8   LDL                  38984 non-null  int64  
 9   hemoglobin           38984 non-null  float64
 10  Urine protein        38984 non-null  int64  
 11  serum creatinine     38984 non-null  float64
 12  AST                  38984 non-null  int64  
 13  ALT                  38984 non-null  int64  
 14  Gtp                  38984 non-null  int64  
 15  dental caries        38984 non-null 

In [23]:
# Summarise the test frame: row count, columns, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16708 entries, 0 to 16707
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  16708 non-null  int64  
 1   weight(kg)           16708 non-null  int64  
 2   systolic             16708 non-null  int64  
 3   relaxation           16708 non-null  int64  
 4   fasting blood sugar  16708 non-null  int64  
 5   Cholesterol          16708 non-null  int64  
 6   triglyceride         16708 non-null  int64  
 7   HDL                  16708 non-null  int64  
 8   LDL                  16708 non-null  int64  
 9   hemoglobin           16708 non-null  float64
 10  Urine protein        16708 non-null  int64  
 11  serum creatinine     16708 non-null  float64
 12  AST                  16708 non-null  int64  
 13  ALT                  16708 non-null  int64  
 14  Gtp                  16708 non-null  int64  
 15  dental caries        16708 non-null 

In [24]:
# Count the distinct values in each column of the training frame.
df_train.nunique()

age                     14
weight(kg)              22
systolic               125
relaxation              94
fasting blood sugar    258
Cholesterol            279
triglyceride           389
HDL                    122
LDL                    286
hemoglobin             143
Urine protein            6
serum creatinine        34
AST                    195
ALT                    230
Gtp                    439
dental caries            2
smoking                  2
dtype: int64

In [25]:
# Count the distinct values in each column of the test frame.
df_test.nunique()

age                     14
weight(kg)              20
systolic               114
relaxation              81
fasting blood sugar    221
Cholesterol            251
triglyceride           383
HDL                    113
LDL                    235
hemoglobin             130
Urine protein            6
serum creatinine        25
AST                    161
ALT                    195
Gtp                    358
dental caries            2
dtype: int64

In [26]:
# Report the number of missing (NaN) values in each Train column
train_null_counts = df_train.isnull().sum()
for column, n_missing in train_null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column age has 0 null values
Column weight(kg) has 0 null values
Column systolic has 0 null values
Column relaxation has 0 null values
Column fasting blood sugar has 0 null values
Column Cholesterol has 0 null values
Column triglyceride has 0 null values
Column HDL has 0 null values
Column LDL has 0 null values
Column hemoglobin has 0 null values
Column Urine protein has 0 null values
Column serum creatinine has 0 null values
Column AST has 0 null values
Column ALT has 0 null values
Column Gtp has 0 null values
Column dental caries has 0 null values
Column smoking has 0 null values


In [27]:
# Report the number of missing (NaN) values in each Test column
test_null_counts = df_test.isnull().sum()
for column, n_missing in test_null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column age has 0 null values
Column weight(kg) has 0 null values
Column systolic has 0 null values
Column relaxation has 0 null values
Column fasting blood sugar has 0 null values
Column Cholesterol has 0 null values
Column triglyceride has 0 null values
Column HDL has 0 null values
Column LDL has 0 null values
Column hemoglobin has 0 null values
Column Urine protein has 0 null values
Column serum creatinine has 0 null values
Column AST has 0 null values
Column ALT has 0 null values
Column Gtp has 0 null values
Column dental caries has 0 null values


In [28]:
# Check for appropriate data types: show the dtype of every column
# in the train frame, then in the test frame.
display(df_train.dtypes, df_test.dtypes)

age                      int64
weight(kg)               int64
systolic                 int64
relaxation               int64
fasting blood sugar      int64
Cholesterol              int64
triglyceride             int64
HDL                      int64
LDL                      int64
hemoglobin             float64
Urine protein            int64
serum creatinine       float64
AST                      int64
ALT                      int64
Gtp                      int64
dental caries            int64
smoking                  int64
dtype: object

age                      int64
weight(kg)               int64
systolic                 int64
relaxation               int64
fasting blood sugar      int64
Cholesterol              int64
triglyceride             int64
HDL                      int64
LDL                      int64
hemoglobin             float64
Urine protein            int64
serum creatinine       float64
AST                      int64
ALT                      int64
Gtp                      int64
dental caries            int64
dtype: object

## Potential Algorithms:
 - Random Forest
 - Deep learning model? Compare its performance against Random Forest.

In [30]:
# Persist the reduced training frame (selected features plus 'smoking') for later use.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed first column that the source CSVs did not have. Confirm downstream
# readers expect this (e.g. index_col=0); otherwise consider index=False.
df_train.to_csv("../Resources/smoker_dataset_cleaned.csv")