<a href="https://colab.research.google.com/github/leehj112/Data-Analyses/blob/master/colab_(%EB%A1%9C%EC%A7%80%EC%8A%A4%ED%8B%B1%ED%9A%8C%EA%B7%80%EB%AA%A8%EB%8D%B8).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
from sklearn.datasets import load_breast_cancer

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

plt.rc('font', family='Malgun Gothic')

In [39]:
breast_cancer = load_breast_cancer()  # data 설명
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [40]:
print(breast_cancer. feature_names)  # x, y check
print(breast_cancer. target_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


In [41]:
breast_cancer.target = np.where(breast_cancer.target==0,1,0)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(breast_cancer.data,
                                                    breast_cancer.target,
                                                    test_size=0.3,
                                                    random_state=2021)

normalizer = StandardScaler()
x_train = normalizer.fit_transform(x_train)
x_test = normalizer.transform(x_test)
# data columns 정규화

In [43]:
model = LogisticRegression()

model.fit(x_train, y_train)

In [44]:
# 데이터 컬럼에 따른 beta, exp(beta) 체크
column_name = ["const"] + breast_cancer.feature_names.tolist()
beta = np.concatenate([model.intercept_,model.coef_.reshape(-1)]).round(2) # round(2) 소수점 둘째짜리까지 반올림
odds = np.exp(beta).round(2) # round(2) 소수점 둘째짜리까지 반올림
interpret = np.where(beta>0,'risky','protective') # np.where() 조건문

# beta 해석 정리
beta_analysis = pd.DataFrame(np.c_[beta,odds,interpret],index=column_name,columns=['beta','exp(beta)','interpret'])
beta_analysis

Unnamed: 0,beta,exp(beta),interpret
const,-0.42,0.66,protective
mean radius,0.54,1.72,risky
mean texture,0.92,2.51,risky
mean perimeter,0.55,1.73,risky
mean area,0.56,1.75,risky
mean smoothness,0.13,1.14,risky
mean compactness,-0.48,0.62,protective
mean concavity,0.93,2.53,risky
mean concave points,0.72,2.05,risky
mean symmetry,0.23,1.26,risky


In [45]:
model.predict_proba(x_test)



array([[9.99587046e-01, 4.12953680e-04],
       [9.99985165e-01, 1.48350838e-05],
       [9.92817867e-01, 7.18213277e-03],
       [2.53352894e-13, 1.00000000e+00],
       [9.99986549e-01, 1.34505123e-05],
       [9.81298434e-02, 9.01870157e-01],
       [9.80024204e-07, 9.99999020e-01],
       [9.99932732e-01, 6.72677815e-05],
       [8.15117335e-05, 9.99918488e-01],
       [9.99995214e-01, 4.78559272e-06],
       [9.99999649e-01, 3.51231679e-07],
       [6.98821879e-06, 9.99993012e-01],
       [9.89324333e-01, 1.06756670e-02],
       [5.05948740e-04, 9.99494051e-01],
       [9.99274489e-01, 7.25511212e-04],
       [9.99218536e-01, 7.81464092e-04],
       [9.37761744e-01, 6.22382562e-02],
       [9.99971984e-01, 2.80156564e-05],
       [9.99964918e-01, 3.50820533e-05],
       [1.03711710e-03, 9.98962883e-01],
       [9.24921333e-01, 7.50786674e-02],
       [9.78028441e-01, 2.19715590e-02],
       [2.99663721e-04, 9.99700336e-01],
       [9.94848393e-01, 5.15160717e-03],
       [9.998572

In [46]:
xbeta = np.matmul(np.c_[np.ones(x_test.shape[0]), x_test], beta.reshape(-1,1))
P_1 = 1 / (1+np.exp(-xbeta))

pd.DataFrame(np.concatenate([P_1,model.predict_proba(x_test)[:,1].reshape(-1,1)],axis=1),columns=['직접','패키지'])

Unnamed: 0,직접,패키지
0,0.000409,0.000413
1,0.000014,0.000015
2,0.007251,0.007182
3,1.000000,1.000000
4,0.000013,0.000013
...,...,...
166,0.000223,0.000225
167,0.000090,0.000089
168,0.008879,0.008804
169,0.168902,0.169033
