In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./../../data/pima_indian_diabetes/diabetes_cleaned.csv', index_col=0)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,33.6,0.627,50,1
1,1,85,66,29,26.6,0.351,31,0
2,8,183,64,0,23.3,0.672,32,1
3,1,89,66,23,28.1,0.167,21,0
5,5,116,74,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...
763,10,101,76,48,32.9,0.171,63,0
764,2,122,70,27,36.8,0.340,27,0
765,5,121,72,23,26.2,0.245,30,0
766,1,126,60,0,30.1,0.349,47,1


In [3]:
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness   BMI  \
0              6      148             72             35  33.6   
1              1       85             66             29  26.6   
2              8      183             64              0  23.3   
3              1       89             66             23  28.1   
5              5      116             74              0  25.6   
..           ...      ...            ...            ...   ...   
763           10      101             76             48  32.9   
764            2      122             70             27  36.8   
765            5      121             72             23  26.2   
766            1      126             60              0  30.1   
767            1       93             70             31  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                       0.351   31        0  
2                       0.672   32        1  
3                       0.167   21 

In [5]:
import pandas as pd

# データフレームの読み込み（例としてdfという変数に代入）
# ここではデータを手動で与えた部分を読み込んでいないので、実際のデータを読み込む必要があります。
# df = pd.read_csv("your_data.csv")

# 'Outcome' 列以外の列を平均0、標準偏差1に標準化
df_standardized = df.drop(columns=['Outcome']).apply(lambda x: (x - x.mean()) / x.std())

# 'Outcome' 列をそのまま追加
df_standardized['Outcome'] = df['Outcome']

# 標準化されたデータを表示
df_standardized


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.670823,0.908191,-0.004845,0.908365,0.231905,0.779053,1.534905,1
1,-0.865238,-1.196379,-0.531253,0.516854,-0.857582,-0.320191,-0.163725,0
2,1.285247,2.077397,-0.706722,-1.375453,-1.371197,0.958278,-0.074323,1
3,-0.865238,-1.062756,-0.531253,0.125342,-0.624120,-1.053020,-1.057741,0
5,0.363611,-0.160797,0.170624,-1.375453,-1.013223,-0.917606,-0.253127,0
...,...,...,...,...,...,...,...,...
763,1.899671,-0.661885,0.346093,1.756641,0.122956,-1.037089,2.697126,0
764,-0.558026,0.039638,-0.180314,0.386350,0.729956,-0.364001,-0.521331,0
765,0.363611,0.006232,-0.004845,0.125342,-0.919838,-0.742364,-0.253127,0
766,-0.865238,0.173262,-1.057660,-1.375453,-0.312838,-0.328156,1.266701,1


In [8]:
df_standardized.to_csv('./../../data/pima_indian_diabetes/diabetes_cleaned_standardized.csv')

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 特徴量とラベルに分割
X = df_standardized.drop(columns=['Outcome'])
y = df_standardized['Outcome']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear SVM モデルの学習
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# テストデータを用いて予測
y_pred = svm_model.predict(X_test)

# 評価指標の計算
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# 結果の表示
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {roc_auc:.4f}")


Accuracy: 0.7836
Precision: 0.6829
Recall: 0.6364
F1-Score: 0.6588
AUC: 0.7460


In [7]:
import pandas as pd

# データフレームの読み込み（例としてdfという変数に代入）
# ここではデータを手動で与えた部分を読み込んでいないので、実際のデータを読み込む必要があります。
# df = pd.read_csv("your_data.csv")

# 'Outcome' 列以外の列をmin-max正規化
df_normalized = df.drop(columns=['Outcome']).apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# 'Outcome' 列をそのまま追加
df_normalized['Outcome'] = df['Outcome']

df_normalized


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.461538,0.675325,0.500000,0.583333,0.484277,0.493261,0.644444,1
1,0.076923,0.266234,0.411765,0.483333,0.264151,0.245283,0.222222,0
2,0.615385,0.902597,0.382353,0.000000,0.160377,0.533693,0.244444,1
3,0.076923,0.292208,0.411765,0.383333,0.311321,0.079964,0.000000,0
5,0.384615,0.467532,0.529412,0.000000,0.232704,0.110512,0.200000,0
...,...,...,...,...,...,...,...,...
763,0.769231,0.370130,0.558824,0.800000,0.462264,0.083558,0.933333,0
764,0.153846,0.506494,0.470588,0.450000,0.584906,0.235400,0.133333,0
765,0.384615,0.500000,0.500000,0.383333,0.251572,0.150045,0.200000,0
766,0.076923,0.532468,0.323529,0.000000,0.374214,0.243486,0.577778,1


In [9]:
df_normalized.to_csv('./../../data/pima_indian_diabetes/diabetes_cleaned_normalized.csv')

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 特徴量とラベルに分割
X = df_normalized.drop(columns=['Outcome'])
y = df_normalized['Outcome']

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear SVM モデルの学習
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# テストデータを用いて予測
y_pred = svm_model.predict(X_test)

# 評価指標の計算
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# 結果の表示
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC: {roc_auc:.4f}")


Accuracy: 0.7836
Precision: 0.6744
Recall: 0.6591
F1-Score: 0.6667
AUC: 0.7518
