讀取資料


In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('diabetes_binary_1.csv')

# df = pd.read_csv("diabetes_ver2.csv")

# standardize "BMI", "GenHlth", "MentHlth", "PhysHlth", "Age", "Education", "Income"
df["BMI"] = (df["BMI"] - df["BMI"].mean()) / df["BMI"].std()
df["GenHlth"] = (df["GenHlth"] - df["GenHlth"].mean()) / df["GenHlth"].std()
df["MentHlth"] = (df["MentHlth"] - df["MentHlth"].mean()) / df["MentHlth"].std()
df["PhysHlth"] = (df["PhysHlth"] - df["PhysHlth"].mean()) / df["PhysHlth"].std()
df["Age"] = (df["Age"] - df["Age"].mean()) / df["Age"].std()
df["Education"] = (df["Education"] - df["Education"].mean()) / df["Education"].std()
df["Income"] = (df["Income"] - df["Income"].mean()) / df["Income"].std()


train = df[df['fold'].isin(range(1, 8))]
test = df[df['fold'] == 10]
tune = df[df['fold'] == 8]

train = train.drop("fold", axis=1)
test = test.drop("fold", axis=1)
tune = tune.drop("fold", axis=1)


x_train = train.drop("Diabetes_binary", axis=1)
y_train = train["Diabetes_binary"]
x_test = test.drop("Diabetes_binary", axis=1)
y_test = test["Diabetes_binary"]
x_tune = tune.drop("Diabetes_binary", axis=1)
y_tune = tune["Diabetes_binary"]

# standardize the data
x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()
x_tune = (x_tune - x_tune.mean()) / x_tune.std()


x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)
x_tune = np.array(x_tune)
y_tune = np.array(y_tune)


將最佳化的參數放入訓練

In [2]:
from sklearn import svm
import joblib

# Assuming you have already trained your SVM model
clf = svm.SVC(kernel='linear', C=1, gamma='auto')
clf.fit(x_train, y_train)

# Save the trained model to a file
joblib.dump(clf, 'svm_model1.pkl')


['svm_model1.pkl']

看表現

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


clf = joblib.load('svm_model1.pkl')
y_pred = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1:", f1_score(y_test, y_pred, average='macro'))


Accuracy: 0.7527287993282955
Precision: 0.7542630130574616
Recall: 0.7518382940656148
F1: 0.7518427524704868


取得各項weight

In [4]:
# get parameterns of best SVM model
print(clf.get_params())

# get weight of each feature of best SVM model, with the same order as in the dataset, and with feature names
import pandas as pd
import numpy as np

df = pd.read_csv('diabetes_binary_1.csv')
df = df.drop("fold", axis=1)
x_train = df.drop("Diabetes_binary", axis=1)
feature_names = list(x_train.columns.values)
print(feature_names)
print(clf.coef_)
print(clf.intercept_)

# map feature names to weights
feature_weight = {}
for i in range(len(feature_names)):
    feature_weight[feature_names[i]] = clf.coef_[0][i]
print(feature_weight)

# save feature weights into csv
import csv
with open('feature_weight/SVM_b1.csv', 'w') as f:
    writer = csv.writer(f)
    for key, value in feature_weight.items():
        writer.writerow([key, value])


{'C': 1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
[[ 0.37600989  0.23360685  0.16440517  0.42047905  0.00693593  0.02225315
   0.07594148 -0.01113205 -0.0006653  -0.02435445 -0.1453853   0.0102078
   0.01260952  0.52453581 -0.03865735 -0.07761941  0.03586539  0.08747957
   0.33052137 -0.02912478 -0.10850282]]
[0.03919867]
{'HighBP': 0.3760098949898172, 'HighChol': 0.23360684705362567, 'CholCheck': 0.164405174981042, 'BMI': 0.42047904800384583, 'Smoker': 0.006935928570898753, 'Stroke': 0.02225315307944342

讀取b2資料

In [5]:
import pandas as pd
import numpy as np
import io

df = pd.read_csv('diabetes_binary_2.csv')

# standardize "BMI", "GenHlth", "MentHlth", "PhysHlth", "Age", "Education", "Income"
df["BMI"] = (df["BMI"] - df["BMI"].mean()) / df["BMI"].std()
df["GenHlth"] = (df["GenHlth"] - df["GenHlth"].mean()) / df["GenHlth"].std()
df["MentHlth"] = (df["MentHlth"] - df["MentHlth"].mean()) / df["MentHlth"].std()
df["PhysHlth"] = (df["PhysHlth"] - df["PhysHlth"].mean()) / df["PhysHlth"].std()
df["Age"] = (df["Age"] - df["Age"].mean()) / df["Age"].std()
df["Education"] = (df["Education"] - df["Education"].mean()) / df["Education"].std()
df["Income"] = (df["Income"] - df["Income"].mean()) / df["Income"].std()

train = df[df['fold'].isin(range(1, 9))]
test = df[df['fold'] == 10]

train = train.drop("fold", axis=1)
test = test.drop("fold", axis=1)

x_train = train.drop("Diabetes_binary", axis=1)
y_train = train["Diabetes_binary"]
x_test = test.drop("Diabetes_binary", axis=1)
y_test = test["Diabetes_binary"]

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

將最佳化參數放入

In [7]:
from sklearn import svm
import joblib

# Assuming you have already trained your SVM model
clf = svm.SVC(kernel='linear', C=0.1, gamma='auto')
clf.fit(x_train, y_train)

# Save the trained model to a file
joblib.dump(clf, 'svm_model2.pkl')


['svm_model2.pkl']

看表現

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


clf = joblib.load('svm_model2.pkl')
y_pred = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1:", f1_score(y_test, y_pred, average='macro'))

Accuracy: 0.8639001510453931
Precision: 0.43195007552269654
Recall: 0.5
F1: 0.4634905740851318


  _warn_prf(average, modifier, msg_start, len(result))
