In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [2]:
def load():
    data = pd.read_csv("/kaggle/input/datasets/redwankarimsony/heart-disease-data/heart_disease_uci.csv")
    return data


In [3]:
df = load()
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    iqr = quartile3 - quartile1
    up_limit = quartile3 + iqr * 1.5
    low_limit = quartile1 - iqr * 1.5
    return low_limit, up_limit

    
    

In [5]:
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False



In [6]:
df.columns = [col.upper() for col in df.columns]

In [7]:
df.columns

Index(['ID', 'AGE', 'SEX', 'DATASET', 'CP', 'TRESTBPS', 'CHOL', 'FBS',
       'RESTECG', 'THALCH', 'EXANG', 'OLDPEAK', 'SLOPE', 'CA', 'THAL', 'NUM'],
      dtype='object')

In [8]:
df.loc[(df["SEX"] == "Male") & (df["AGE"] <= 21), "GENDER_CAT"]= "youngmale"

In [9]:
df.columns

Index(['ID', 'AGE', 'SEX', 'DATASET', 'CP', 'TRESTBPS', 'CHOL', 'FBS',
       'RESTECG', 'THALCH', 'EXANG', 'OLDPEAK', 'SLOPE', 'CA', 'THAL', 'NUM',
       'GENDER_CAT'],
      dtype='object')

In [10]:
df.loc[(df["SEX"] == "Male") & (df["AGE"] > 21) & ((df["AGE"]) <= 50), "GENDER_CAT"] = "maturemale"

In [11]:
df.loc[(df["SEX"] == "Male") & (df["AGE"] > 50), "GENDER_CAT"] = "seniormale"

In [12]:
df.loc[(df["SEX"] == "Female") & (df["AGE"] > 21) & (df["AGE"] <= 50), "GENDER_CAT"] = "maturefemale"

In [13]:
df.loc[(df["SEX"] == "Female") & (df["AGE"] > 50), "GENDER_CAT"] = "seniorfemale"

In [14]:
df["GENDER_CAT"]

0        seniormale
1        seniormale
2        seniormale
3        maturemale
4      maturefemale
           ...     
915    seniorfemale
916      seniormale
917      seniormale
918      seniormale
919      seniormale
Name: GENDER_CAT, Length: 920, dtype: object

In [15]:
df["NEW_RISK_SCORE"] = df["OLDPEAK"] + df["EXANG"] + df["CA"]

In [16]:
df["NEW_HIGH_CHOL"] = (df["CHOL"] > 240).astype(int)

In [17]:
#combination feature
df["NEW_EXANG_SLOPE"] = df["SLOPE"] * df["EXANG"]

In [18]:
df["NEW_AGE_THR_RATIO"] = df["AGE"] / df["THALCH"]


In [19]:
df["NEW_HIGH_RISK"] = ((df["OLDPEAK"] > 2) & (df["EXANG"] == 1)).astype(int)


In [20]:
df["NEW_RISK_COUNT"] = (
    (df["CHOL"] > 240).astype(int) +
    (df["TRESTBPS"] > 140).astype(int) +
    (df["EXANG"] == 1).astype(int)
)


In [21]:
df.shape

(920, 23)

In [22]:
def grab_col_names(dataframe, cat_th = 10, car_th=20):
    cat_cols = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th
              and dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
              dataframe[col].dtypes == "O"]
    
    cat_cols = num_but_cat + cat_cols


    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]

    num_cols = [col for col in num_cols if col not in num_but_cat]

    num_cols = [col for col in num_cols if col not in cat_but_car]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")
    return cat_cols, num_cols, cat_but_car


In [23]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 920
Variables: 23
cat_cols: 15
num_cols: 7
cat_but_car: 1
num_but_cat: 5


In [24]:
cat_cols

['CA',
 'NUM',
 'NEW_HIGH_CHOL',
 'NEW_HIGH_RISK',
 'NEW_RISK_COUNT',
 'SEX',
 'DATASET',
 'CP',
 'FBS',
 'RESTECG',
 'EXANG',
 'SLOPE',
 'THAL',
 'GENDER_CAT',
 'NEW_EXANG_SLOPE']

In [25]:
num_cols

['ID', 'AGE', 'TRESTBPS', 'CHOL', 'THALCH', 'OLDPEAK', 'NEW_AGE_THR_RATIO']

In [26]:
#we dont need to use ID
num_cols = [col for col in num_cols if "ID" not in col]

In [27]:
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe.loc[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit), col_name].any():
        return True
    else:
        return False

In [28]:
for col in num_cols:
    print(col, check_outlier(df, col))

AGE False
TRESTBPS True
CHOL True
THALCH True
OLDPEAK True
NEW_AGE_THR_RATIO True


In [29]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
    
    

In [30]:
for col in num_cols:
    replace_with_thresholds(df, col)

  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit


In [31]:
for col in num_cols:
    print(col, check_outlier(df, col))

AGE False
TRESTBPS False
CHOL False
THALCH False
OLDPEAK False
NEW_AGE_THR_RATIO False


In [32]:
#This function shows how many missing values there are in each column and their percentages.
def missing_values_table(dataframe, na_name=False):
    na_columns = [col for col in dataframe.columns if dataframe[col].isnull().sum() > 0]
    n_miss = dataframe[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (dataframe[na_columns].isnull().sum() / dataframe.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=["n_miss", "ratio"])
    print(missing_df, end="\n")

    if na_name:
        return na_columns

In [33]:
missing_values_table(df)

                   n_miss  ratio
CA                    611  66.41
NEW_RISK_SCORE        611  66.41
THAL                  486  52.83
NEW_EXANG_SLOPE       310  33.70
SLOPE                 309  33.59
FBS                    90   9.78
OLDPEAK                62   6.74
TRESTBPS               59   6.41
THALCH                 55   5.98
EXANG                  55   5.98
NEW_AGE_THR_RATIO      55   5.98
CHOL                   30   3.26
RESTECG                 2   0.22


In [34]:
df.drop(columns=["OLDPEAK","EXANG","CA"],inplace=True,axis=1)

In [35]:
df.head()

Unnamed: 0,ID,AGE,SEX,DATASET,CP,TRESTBPS,CHOL,FBS,RESTECG,THALCH,SLOPE,THAL,NUM,GENDER_CAT,NEW_RISK_SCORE,NEW_HIGH_CHOL,NEW_EXANG_SLOPE,NEW_AGE_THR_RATIO,NEW_HIGH_RISK,NEW_RISK_COUNT
0,1,63.0,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,downsloping,fixed defect,0,seniormale,2.3,0,,0.42,0,1
1,2,67.0,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,flat,normal,2,seniormale,5.5,1,flat,0.62037,0,3
2,3,67.0,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,flat,reversable defect,1,seniormale,5.6,0,flat,0.51938,1,1
3,4,37.0,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,downsloping,normal,0,maturemale,3.5,1,,0.197861,0,1
4,5,41.0,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,upsloping,normal,0,maturefemale,1.4,0,,0.238372,0,0


In [36]:
df.drop("NEW_HIGH_CHOL",inplace=True, axis=1)

In [37]:
df.drop("NEW_RISK_SCORE",inplace=True,axis=1)

In [38]:
df.drop("NEW_EXANG_SLOPE",inplace=True,axis=1)

In [39]:
df.drop("THAL",inplace=True,axis=1)

In [40]:
df["SLOPE"].fillna(df["SLOPE"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["SLOPE"].fillna(df["SLOPE"].mode()[0],inplace=True)


In [41]:
num_cols2 = ["TRESTBPS","THALCH","CHOL"]

for col in num_cols2:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [42]:
df["FBS"].fillna(df["FBS"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["FBS"].fillna(df["FBS"].mode()[0], inplace=True)
  df["FBS"].fillna(df["FBS"].mode()[0], inplace=True)


In [43]:
df.isnull()

Unnamed: 0,ID,AGE,SEX,DATASET,CP,TRESTBPS,CHOL,FBS,RESTECG,THALCH,SLOPE,NUM,GENDER_CAT,NEW_AGE_THR_RATIO,NEW_HIGH_RISK,NEW_RISK_COUNT
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
916,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
917,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
918,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False


In [44]:
df = df.apply(lambda x:x.fillna(x.mode()[0]) if (x.dtype == "O" and len(x.unique()) <= 10) else x, axis=0)

In [45]:
missing_values_table(df)

                   n_miss  ratio
NEW_AGE_THR_RATIO      55   5.98


In [46]:
df["NEW_AGE_THR_RATIO"].fillna(df["NEW_AGE_THR_RATIO"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["NEW_AGE_THR_RATIO"].fillna(df["NEW_AGE_THR_RATIO"].median(), inplace=True)


In [47]:
missing_values_table(df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


In [48]:
#selecting binary categorical variables
binary_cols = [col for col in df.columns if df[col].dtype not in [int, float]
              and df[col].nunique() == 2]

In [49]:
binary_cols


['SEX', 'FBS']

In [50]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

In [51]:
for col in binary_cols:
    df = label_encoder(df, col)

In [52]:
def cat_summary(dataframe, col_name,plot=False):
    print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                       "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe)}))
    print("#######################################")
    if plot:
        sns.countplot(x=dataframe[col_name], data=dataframe)
        plt.show()

In [53]:
def rare_analyser(dataframe, target, cat_cols):
    for col in cat_cols:
        print(col, ":", len(dataframe[col].value_counts()))#How many classes does the relevant categorical variable have?
        print(pd.DataFrame({"COUNT": dataframe[col].value_counts(),
                           "RATIO": dataframe[col].value_counts() / len(dataframe),
                           "TARGET_MEAN": dataframe.groupby(col)[target].mean()}),end="\n\n\n")

In [54]:
df.columns

Index(['ID', 'AGE', 'SEX', 'DATASET', 'CP', 'TRESTBPS', 'CHOL', 'FBS',
       'RESTECG', 'THALCH', 'SLOPE', 'NUM', 'GENDER_CAT', 'NEW_AGE_THR_RATIO',
       'NEW_HIGH_RISK', 'NEW_RISK_COUNT'],
      dtype='object')

In [55]:
cat_cols = [col for col in cat_cols if col in df.columns]


In [56]:
rare_analyser(df, "NUM", cat_cols)

NUM : 5
     COUNT     RATIO  TARGET_MEAN
NUM                              
0      411  0.446739          0.0
1      265  0.288043          1.0
2      109  0.118478          2.0
3      107  0.116304          3.0
4       28  0.030435          4.0


NEW_HIGH_RISK : 2
               COUNT     RATIO  TARGET_MEAN
NEW_HIGH_RISK                              
0                855  0.929348     0.907602
1                 65  0.070652     2.153846


NEW_RISK_COUNT : 4
                COUNT     RATIO  TARGET_MEAN
NEW_RISK_COUNT                              
0                 303  0.329348     0.699670
1                 386  0.419565     0.989637
2                 182  0.197826     1.318681
3                  49  0.053261     1.673469


SEX : 2
     COUNT    RATIO  TARGET_MEAN
SEX                             
0      194  0.21087      0.42268
1      726  0.78913      1.14876


DATASET : 4
               COUNT     RATIO  TARGET_MEAN
DATASET                                    
Cleveland        304  0

In [57]:
def rare_encoder(dataframe, rare_perc):
    temp_df = dataframe.copy()

    rare_columns = [col for col in temp_df.columns if temp_df[col].dtypes == "O"
                   and (temp_df[col].value_counts() / len(temp_df) < rare_perc).any(axis=None)]
    for var in rare_columns:
        tmp = temp_df[var].value_counts() / len(temp_df)#tmp = threshold
        rare_labels = tmp[tmp < rare_perc].index
        temp_df[var] = np.where(temp_df[var].isin(rare_labels), "Rare", temp_df[var])#Replace the rare labels you see with "Rare".

    return temp_df

In [58]:
df = rare_encoder(df, 0.01)

In [59]:
df["CP"].value_counts()
df["RESTECG"].value_counts()
## Rare encoding was not applied to any of them because none of them are below 0.01


RESTECG
normal              553
lv hypertrophy      188
st-t abnormality    179
Name: count, dtype: int64

In [60]:
ohe_cols = [col for col in df.columns if 10 >= df[col].nunique() > 2]

In [61]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

In [62]:
df = one_hot_encoder(df, ohe_cols)

In [63]:
df.head()

Unnamed: 0,ID,AGE,SEX,TRESTBPS,CHOL,FBS,THALCH,NEW_AGE_THR_RATIO,NEW_HIGH_RISK,DATASET_Hungary,...,NUM_1,NUM_2,NUM_3,NUM_4,GENDER_CAT_maturemale,GENDER_CAT_seniorfemale,GENDER_CAT_seniormale,NEW_RISK_COUNT_1,NEW_RISK_COUNT_2,NEW_RISK_COUNT_3
0,1,63.0,1,145.0,233.0,1,150.0,0.42,0,False,...,False,False,False,False,False,False,True,True,False,False
1,2,67.0,1,160.0,286.0,0,108.0,0.62037,0,False,...,False,True,False,False,False,False,True,False,False,True
2,3,67.0,1,120.0,229.0,0,129.0,0.51938,1,False,...,True,False,False,False,False,False,True,True,False,False
3,4,37.0,1,130.0,250.0,0,187.0,0.197861,0,False,...,False,False,False,False,True,False,False,True,False,False
4,5,41.0,0,130.0,204.0,0,172.0,0.238372,0,False,...,False,False,False,False,False,False,False,False,False,False


In [64]:
df.shape

(920, 29)

In [65]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 920
Variables: 29
cat_cols: 23
num_cols: 6
cat_but_car: 0
num_but_cat: 23


In [66]:
num_cols = [col for col in num_cols if "ID" not in col]

In [67]:
df.columns

Index(['ID', 'AGE', 'SEX', 'TRESTBPS', 'CHOL', 'FBS', 'THALCH',
       'NEW_AGE_THR_RATIO', 'NEW_HIGH_RISK', 'DATASET_Hungary',
       'DATASET_Switzerland', 'DATASET_VA Long Beach', 'CP_atypical angina',
       'CP_non-anginal', 'CP_typical angina', 'RESTECG_normal',
       'RESTECG_st-t abnormality', 'SLOPE_flat', 'SLOPE_upsloping', 'NUM_1',
       'NUM_2', 'NUM_3', 'NUM_4', 'GENDER_CAT_maturemale',
       'GENDER_CAT_seniorfemale', 'GENDER_CAT_seniormale', 'NEW_RISK_COUNT_1',
       'NEW_RISK_COUNT_2', 'NEW_RISK_COUNT_3'],
      dtype='object')

In [68]:
rare_analyser(df, "NUM_0", cat_cols)

SEX : 2


KeyError: 'Column not found: NUM_0'

In [None]:
useless_cols = [col for col in df.columns if df[col].nunique() == 2 and
               (df[col].value_counts() / len(df) < 0.01).any(axis=None)]
#sınıf sayısını gozlem sayısına boluyoruz herhangi bir tanesinde 0.01 den dusuk iki sınıflı bir kategorik değişken varsa onu getir 

In [None]:
useless_cols

#There aren't any useless cols

In [None]:
#df.drop("useless_cols",axis=1,inplace=True)

In [None]:
#bu problemde gerekli değil ama standartlaştırmaya ihtiyacımız olursa ne yapacağız?

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

df[num_cols].head()

In [None]:
df["NUM"] = df[["NUM_0","NUM_1","NUM_2","NUM_3","NUM_4"]].idxmax(axis=1)
df["NUM"] = df["NUM"].str.split("_").str[1].astype(int)


In [None]:
df.drop(["NUM_0","NUM_1","NUM_2","NUM_3","NUM_4"], axis=1, inplace=True)
