In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
data = pd.read_csv("./data/diabetes.csv")
display(data)

# データを特徴量とターゲットに分割
X = data.drop('Outcome', axis=1)
y = data['Outcome']


# Insulin は欠損率が高いので今回は除外する
data = data.drop(columns=['Insulin'])

# Highly skewed
data["BMI"].replace(to_replace=np.nan, value=data["BMI"].median(), inplace=True)
data["Pregnancies"].replace(to_replace=np.nan, value=data["Pregnancies"].median(), inplace=True)

# Normal
data["Glucose"].replace(to_replace=np.nan, value=data["Glucose"].mean(), inplace=True)
data["BloodPressure"].replace(to_replace=np.nan, value=data["BloodPressure"].mean(), inplace=True)
data["SkinThickness"].replace(to_replace=np.nan, value=data["SkinThickness"].mean(), inplace=True)

Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
data = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

data.to_csv('./data/diabetes_cleaned.csv')


bins = 3
labels = [0, 1, 2]

data_tmp_1 = data.drop(['Outcome'], axis=1)
data_tmp_2 = data['Outcome']

for column in range(data_tmp_1.shape[1]):
    data_tmp_1.iloc[:, column] = pd.cut(data_tmp_1.iloc[:, column], bins=bins, labels=labels)

data = pd.concat([data_tmp_1, data_tmp_2], axis=1)
data.to_csv('./data/diabetes_discretized_old.csv')



target_col_names = data.columns.tolist()[:-1]

data = pd.get_dummies(data, columns=target_col_names)
data = data.replace({True: 1, False:0})

# column 名を rename する
columns = data.columns.to_list()
new_columns = []
mapping_dict = {'0': 'Low', '1': 'Medium', '2': 'High'}

for col in columns:
    if '_' in col:
        tmp = col.split('_')
        tmp1, tmp2 = tmp[0], tmp[1]
        new_col = tmp[0] + '_' + mapping_dict[tmp2]
        new_columns.append(new_col)
    else:
        new_columns.append(col)

data.columns = new_columns
data.to_csv('./data/diabetes_discretized.csv')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


- diabetes.csv                 ... オリジナル
- diabetes_cleaned.csv         ... 前処理済み
- diabetes_discretized_old.csv ... 前処理 + 離散化（3 段階）
- diabetes_discretized.csv     ... 前処理 + 離散化 + one-hot エンコーディング

# オリジナルデータ

In [4]:
data = pd.read_csv("./data/diabetes.csv", index_col=0)
display(data.head())
display(data.info())

# データを特徴量とターゲットに分割
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# SVM（線形カーネル）のモデルを作成
svm_linear = SVC(kernel='linear')

# モデルを訓練
svm_linear.fit(X_train, y_train)

# テストデータで予測
y_pred_linear = svm_linear.predict(X_test)

# Accuracyを計算
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# Confusion Matrixを計算
cm_linear = confusion_matrix(y_test, y_pred_linear)

# Classification Reportを出力
report_linear = classification_report(y_test, y_pred_linear)

# 結果を出力
print("SVM (Linear Kernel) Accuracy:", accuracy_linear)
print()
print()
print("SVM (Linear Kernel) Confusion Matrix:\n", cm_linear)
print()
print()
print("SVM (Linear Kernel) Classification Report:\n", report_linear)

Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1


<class 'pandas.core.frame.DataFrame'>
Index: 768 entries, 6 to 1
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Glucose                   768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   Insulin                   768 non-null    int64  
 4   BMI                       768 non-null    float64
 5   DiabetesPedigreeFunction  768 non-null    float64
 6   Age                       768 non-null    int64  
 7   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 54.0 KB


None

SVM (Linear Kernel) Accuracy: 0.7857142857142857


SVM (Linear Kernel) Confusion Matrix:
 [[84 15]
 [18 37]]


SVM (Linear Kernel) Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.84        99
           1       0.71      0.67      0.69        55

    accuracy                           0.79       154
   macro avg       0.77      0.76      0.76       154
weighted avg       0.78      0.79      0.78       154



# 前処理済みデータ

In [5]:
data = pd.read_csv("./data/diabetes_cleaned.csv", index_col=0)
display(data.head())
display(data.info())

# データを特徴量とターゲットに分割
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# SVM（線形カーネル）のモデルを作成
svm_linear = SVC(kernel='linear')

# モデルを訓練
svm_linear.fit(X_train, y_train)

# テストデータで予測
y_pred_linear = svm_linear.predict(X_test)

# Accuracyを計算
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# Confusion Matrixを計算
cm_linear = confusion_matrix(y_test, y_pred_linear)

# Classification Reportを出力
report_linear = classification_report(y_test, y_pred_linear)

# 結果を出力
print("SVM (Linear Kernel) Accuracy:", accuracy_linear)
print()
print()
print("SVM (Linear Kernel) Confusion Matrix:\n", cm_linear)
print()
print()
print("SVM (Linear Kernel) Classification Report:\n", report_linear)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,33.6,0.627,50,1
1,1,85,66,29,26.6,0.351,31,0
2,8,183,64,0,23.3,0.672,32,1
3,1,89,66,23,28.1,0.167,21,0
5,5,116,74,0,25.6,0.201,30,0


<class 'pandas.core.frame.DataFrame'>
Index: 670 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               670 non-null    int64  
 1   Glucose                   670 non-null    int64  
 2   BloodPressure             670 non-null    int64  
 3   SkinThickness             670 non-null    int64  
 4   BMI                       670 non-null    float64
 5   DiabetesPedigreeFunction  670 non-null    float64
 6   Age                       670 non-null    int64  
 7   Outcome                   670 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 47.1 KB


None

SVM (Linear Kernel) Accuracy: 0.7835820895522388


SVM (Linear Kernel) Confusion Matrix:
 [[77 13]
 [16 28]]


SVM (Linear Kernel) Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84        90
           1       0.68      0.64      0.66        44

    accuracy                           0.78       134
   macro avg       0.76      0.75      0.75       134
weighted avg       0.78      0.78      0.78       134



# 離散化済みデータ

In [4]:
data = pd.read_csv("./data/diabetes_discretized.csv", index_col=0)
display(data.head())
display(data.info())

# データを特徴量とターゲットに分割
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# SVM（線形カーネル）のモデルを作成
svm_linear = SVC(kernel='linear')

# モデルを訓練
svm_linear.fit(X_train, y_train)

# テストデータで予測
y_pred_linear = svm_linear.predict(X_test)

# Accuracyを計算
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# Confusion Matrixを計算
cm_linear = confusion_matrix(y_test, y_pred_linear)

# Classification Reportを出力
report_linear = classification_report(y_test, y_pred_linear)

# 結果を出力
print("SVM (Linear Kernel) Accuracy:", accuracy_linear)
print()
print()
print("SVM (Linear Kernel) Confusion Matrix:\n", cm_linear)
print()
print()
print("SVM (Linear Kernel) Classification Report:\n", report_linear)

Unnamed: 0,Outcome,Pregnancies_Low,Pregnancies_Medium,Pregnancies_High,Glucose_Low,Glucose_Medium,Glucose_High,BloodPressure_Low,BloodPressure_Medium,BloodPressure_High,...,SkinThickness_High,BMI_Low,BMI_Medium,BMI_High,DiabetesPedigreeFunction_Low,DiabetesPedigreeFunction_Medium,DiabetesPedigreeFunction_High,Age_Low,Age_Medium,Age_High
0,1,0,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
1,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,1,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
5,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0


<class 'pandas.core.frame.DataFrame'>
Index: 670 entries, 0 to 767
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Outcome                          670 non-null    int64
 1   Pregnancies_Low                  670 non-null    int64
 2   Pregnancies_Medium               670 non-null    int64
 3   Pregnancies_High                 670 non-null    int64
 4   Glucose_Low                      670 non-null    int64
 5   Glucose_Medium                   670 non-null    int64
 6   Glucose_High                     670 non-null    int64
 7   BloodPressure_Low                670 non-null    int64
 8   BloodPressure_Medium             670 non-null    int64
 9   BloodPressure_High               670 non-null    int64
 10  SkinThickness_Low                670 non-null    int64
 11  SkinThickness_Medium             670 non-null    int64
 12  SkinThickness_High               670 non-null    int64


None

SVM (Linear Kernel) Accuracy: 0.7761194029850746


SVM (Linear Kernel) Confusion Matrix:
 [[82  8]
 [22 22]]


SVM (Linear Kernel) Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.91      0.85        90
           1       0.73      0.50      0.59        44

    accuracy                           0.78       134
   macro avg       0.76      0.71      0.72       134
weighted avg       0.77      0.78      0.76       134

