In [5]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

X = pd.DataFrame(cancer['data'])
y = pd.Series(cancer['target'])


In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X.values, y.values,
                                                random_state = 700,
                                                stratify=y.values)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

knn_model = KNeighborsClassifier(n_neighbors = 5,n_jobs = -1).fit(X_train,y_train)
lr_model = LogisticRegression(solver = 'lbfgs',max_iter=5000).fit(X_train,y_train)

In [12]:
print('학습평가 knn:',knn_model.score(X_train,y_train))
print('       LR:',lr_model.score(X_train,y_train))

print('테스트 knn:',knn_model.score(X_test,y_test))
print('       LR:',lr_model.score(X_test,y_test))

학습평가 knn: 0.9483568075117371
       LR: 0.9647887323943662
테스트 knn: 0.916083916083916
       LR: 0.958041958041958


#### 1. accuracy : 맞춘비율을 계산하여 얼마나 잘 예측하는지를 계산
#### 2. presicion(정밀도) : 맞춘 것들 중 진자 정답인 비율 
#### 3. recall(재현율) : 정답 맞춘 갯수 / (오답데이터 빼고) 정답 데이터의 갯수

100개의 스팸메일 중 2개가 스팸이라고 예측
정밀도는 100%
재현율은 2%



In [10]:
print(y.value_counts()/len(y))
# 암인 사람의 데이터가 더 많기 때문에 암을 잘 맞출것..

1    0.627417
0    0.372583
dtype: float64


In [16]:
from sklearn.metrics import classification_report
pred_train = lr_model.predict(X_train)
pred_test = lr_model.predict(X_test)

# (정답 데이터,예측데이터)
print(classification_report(y_train,pred_train))
# support 는 데이터 갯수 -> 편향 확인 가능

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       159
           1       0.97      0.98      0.97       267

    accuracy                           0.96       426
   macro avg       0.96      0.96      0.96       426
weighted avg       0.96      0.96      0.96       426



In [19]:
# 다중분류
from sklearn.datasets import load_iris
iris = load_iris()
X = iris['data']
y = iris['target']

X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                random_state = 1,
                                                stratify=y)

In [21]:
X = pd.DataFrame(X)
y = pd.Series(y)
print(y.value_counts()/len(y))

0    0.333333
1    0.333333
2    0.333333
dtype: float64


In [22]:
lr_model = LogisticRegression(C=1,
                              solver = 'lbfgs',
                              class_weight='balanced',
                             random_state =1,
                             multi_class='ovr').fit(X_train,y_train)
# ovr (one vs rest) 방식 사용
# multinomial - softmax 방식 기반

In [23]:
print('학습결과:',lr_model.score(X_train,y_train))

학습결과: 0.9285714285714286


In [28]:
print('예측값',lr_model.predict(X))

예측값 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [30]:
pred_train = lr_model.predict(X_train)
print(classification_report(y_train,pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       0.91      0.86      0.89        37
           2       0.87      0.92      0.89        37

    accuracy                           0.93       112
   macro avg       0.93      0.93      0.93       112
weighted avg       0.93      0.93      0.93       112

