### 지도학습 (Classification)
### SVM (Support Vector Machine)

In [2]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# Train and evaluate an SVM classifier on the customer table.
# Reads ../dataset/customer.csv, predicts `label` from `deposit` and `stock`,
# writes the per-row test results to a CSV and prints accuracy + a report.

# Load the customer data (columns used below: deposit, stock, label) --- (※1)
tbl = pd.read_csv("../dataset/customer.csv")
print(tbl.head())

# Separate the target column from the two feature columns --- (※2)
# NOTE(review): the features are used at their raw scale here (nothing is
# divided or normalized). SVMs are sensitive to feature scale, so consider
# scaling deposit/stock before fitting — confirm whether raw values are intended.
label = tbl["label"]
deposit = tbl["deposit"]
stock = tbl["stock"]
features = pd.concat([deposit, stock], axis=1)
print(features.head())

# Split into training data and test data (30% held out) --- (※3)
# A fixed random_state makes the split, and therefore every metric printed
# below, reproducible across "Restart & Run All".
data_train, data_test, label_train, label_test = train_test_split(
    features, label, test_size=0.3, random_state=42
)

print(len(features))
print(len(data_train))
print(len(data_test))

# Fit the classifier on the training split --- (※4)
model = svm.SVC()
model.fit(data_train, label_train)

# Predict labels for the test split --- (※5)
# Wrapped in a DataFrame (single column named 0) so it can be concatenated below.
predict = pd.DataFrame(model.predict(data_test))
print(predict.size)

# Put features, true label and predicted label side by side. The indexes are
# reset because data_test/label_test keep their shuffled original row labels
# while `predict` is numbered 0..n-1; without the reset concat would misalign rows.
r = pd.concat([data_test.reset_index(drop=True), label_test.reset_index(drop=True), predict.reset_index(drop=True)], axis=1)
# NOTE(review): hard-coded absolute Windows path — only works where a D: drive exists.
r.to_csv("d:/result_customer.csv")
print(len(r))
print(r.head())

# Score the predictions --- (※6)
ac_score = metrics.accuracy_score(label_test, predict)
cl_report = metrics.classification_report(label_test, predict)

print("정답률 =", ac_score)
print("리포트 =\n", cl_report)

      deposit     stock    label
0  1400000000  45000000   normal
1  1450000000  72000000  diamond
2  1500000000  61000000  diamond
3  1370000000  56000000   normal
4  1920000000  48000000  diamond
      deposit     stock
0  1400000000  45000000
1  1450000000  72000000
2  1500000000  61000000
3  1370000000  56000000
4  1920000000  48000000
20000
14000
6000
6000
6000
      deposit     stock    label        0
0  1450000000  49000000   normal  diamond
1  1500000000  51000000  diamond  diamond
2  1740000000  49000000  diamond  diamond
3  1930000000  50000000  diamond  diamond
4  1580000000  64000000  diamond  diamond
정답률 = 0.9886666666666667
리포트 =
              precision    recall  f1-score   support

    diamond       0.98      1.00      0.99      3520
     normal       1.00      0.98      0.99      1721
        vip       1.00      0.96      0.98       759

avg / total       0.99      0.99      0.99      6000



In [3]:
# Show the classification report as the cell's output value. Because this is a
# bare expression (not print), the notebook displays the string's repr, with
# newlines shown as escaped "\n".
cl_report

'             precision    recall  f1-score   support\n\n    diamond       0.98      1.00      0.99      3520\n     normal       1.00      0.98      0.99      1721\n        vip       1.00      0.96      0.98       759\n\navg / total       0.99      0.99      0.99      6000\n'

In [3]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# Train an SVM classifier on the BMI table and predict the held-out rows.
# Reads ../dataset/bmi.csv and predicts `label` from scaled weight and height.

# Load the height/weight data --- (※1)
tbl = pd.read_csv("../dataset/bmi.csv")
print(tbl.head())

# Slice out the columns and normalize them --- (※2)
label = tbl["label"]
w = tbl["weight"] / 100 # assumes a maximum of 100 kg
h = tbl["height"] / 200 # assumes a maximum of 200 cm
wh = pd.concat([w, h], axis=1)
print(wh.head())

# Split into training data and test data (30% held out) --- (※3)
# A fixed random_state makes the split, and therefore the saved test file and
# every downstream metric, reproducible across "Restart & Run All".
data_train, data_test, label_train, label_test = train_test_split(
    wh, label, test_size=0.3, random_state=42
)

print(len(wh))
print(len(data_train))
print(len(data_test))
# NOTE(review): hard-coded absolute Windows path — only works where a D: drive exists.
data_test.to_csv("d:/test.csv")

# Fit the classifier on the training split --- (※4)
model = svm.SVC()
model.fit(data_train, label_train)

# Predict labels for the test split --- (※5)
# Wrapped in a DataFrame (single column named 0); later cells read `predict`.
predict = pd.DataFrame(model.predict(data_test))
print(predict.size)
# Last expression of the cell: rendered as the cell's output table.
predict.head()

   height  weight   label
0     140      45  normal
1     145      72     fat
2     150      61     fat
3     137      56     fat
4     192      48    thin
   weight  height
0    0.45   0.700
1    0.72   0.725
2    0.61   0.750
3    0.56   0.685
4    0.48   0.960
20000
14000
6000
6000


Unnamed: 0,0
0,thin
1,normal
2,normal
3,thin
4,fat


In [4]:
# Peek at the first two rows of the test features, the true labels and the
# predictions (one print call; each item lands on its own line block).
print(data_test.head(2), label_test.head(2), predict.head(2), sep="\n")

# Line the three pieces up positionally: every index is reset to 0..n-1 first,
# then the pieces are joined column-wise into one result table.
r = pd.concat(
    [part.reset_index(drop=True) for part in (data_test, label_test, predict)],
    axis=1,
)
r.to_csv("d:/result.csv")
print(r.shape[0])
print(r.head())

# Evaluate the predictions --- (※6)
cl_report = metrics.classification_report(label_test, predict)
ac_score = metrics.accuracy_score(label_test, predict)

print("정답률 =", ac_score)
print("리포트 =\n", cl_report)

       weight  height
4764     0.40    1.00
14353    0.64    0.88
4764       thin
14353    normal
Name: label, dtype: object
        0
0    thin
1  normal
6000
   weight  height   label       0
0    0.40   1.000    thin    thin
1    0.64   0.880  normal  normal
2    0.79   0.960  normal  normal
3    0.45   0.870    thin    thin
4    0.66   0.775     fat     fat
정답률 = 0.9881666666666666
리포트 =
              precision    recall  f1-score   support

        fat       1.00      0.99      1.00      2253
     normal       0.97      0.99      0.98      1846
       thin       0.99      0.98      0.99      1901

avg / total       0.99      0.99      0.99      6000



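## PRECISION (정밀도): 해당 클래스라고 예측한 것 중 실제로 그 클래스인 비율 — TP / (TP + FP)
## RECALL (재현율): 실제로 그 클래스인 것 중 모델이 그 클래스라고 맞게 예측한 비율 — TP / (TP + FN)
## F1-SCORE: 정밀도와 재현율의 조화 평균 — 2 · (precision · recall) / (precision + recall)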