In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [43]:
data = pd.read_csv("./data/diabetes.csv")
display(data)

# データを特徴量とターゲットに分割
X = data.drop('Outcome', axis=1)
y = data['Outcome']

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [37]:
data.info

<bound method DataFrame.info of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   5

In [38]:

# Insulin は欠損率が高いので今回は除外する
data = data.drop(columns=['Insulin'])

# Highly skewed
data["BMI"].replace(to_replace=np.nan, value=data["BMI"].median(), inplace=True)
data["Pregnancies"].replace(to_replace=np.nan, value=data["Pregnancies"].median(), inplace=True)

# Normal
data["Glucose"].replace(to_replace=np.nan, value=data["Glucose"].mean(), inplace=True)
data["BloodPressure"].replace(to_replace=np.nan, value=data["BloodPressure"].mean(), inplace=True)
data["SkinThickness"].replace(to_replace=np.nan, value=data["SkinThickness"].mean(), inplace=True)

Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
data = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

In [39]:
data.to_csv('./data/diabetes_cleaned.csv')

In [40]:
bins = 3
labels = [0, 1, 2]

data_tmp_1 = data.drop(['Outcome'], axis=1)
data_tmp_2 = data['Outcome']

for column in range(data_tmp_1.shape[1]):
    data_tmp_1.iloc[:, column] = pd.cut(data_tmp_1.iloc[:, column], bins=bins, labels=labels)

data = pd.concat([data_tmp_1, data_tmp_2], axis=1)
data.to_csv('./data/diabetes_discretized_old.csv')

# 離散化したデータを one-hot encoding する

In [81]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,2,1,1,1,1,1,1
1,0,0,1,1,0,0,0,0
2,1,2,1,0,0,1,0,1
3,0,0,1,1,0,0,0,0
5,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
763,2,1,1,2,1,0,2,0
764,0,1,1,1,1,0,0,0
765,1,1,1,1,0,0,0,0
766,0,1,0,0,1,0,1,1


In [87]:
df = data.copy()

target_col_names = df.columns.tolist()[:-1]

df_encoded = pd.get_dummies(df, columns=target_col_names)
df_encoded = df_encoded.replace({True: 1, False:0})

# column 名を rename する
columns = df_encoded.columns.to_list()
new_columns = []
mapping_dict = {'0': 'Low', '1': 'Medium', '2': 'High'}

for col in columns:
    if '_' in col:
        tmp = col.split('_')
        tmp1, tmp2 = tmp[0], tmp[1]
        new_col = tmp[0] + '_' + mapping_dict[tmp2]
        new_columns.append(new_col)
    else:
        new_columns.append(col)

df_encoded.columns = new_columns
df_encoded.to_csv('./data/diabetes_discretized.csv')

In [86]:
df_encoded

Unnamed: 0,Outcome,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,...,SkinThickness_High,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High
0,1,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,1,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
5,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0,0,0,1,0,1,0,0,1,0,...,1,0,1,0,1,0,0,0,0,1
764,0,1,0,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,0
765,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
766,1,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
