In [1]:
import pandas as pd
import numpy as np

# Data

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
cancer = load_breast_cancer()
print(cancer.DESCR) # class 0: malignant, class 1: benign

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [4]:
data = pd.DataFrame(cancer.data, columns = cancer.feature_names)
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# min max 값이 들쭉날쭉 하니 scaling 진행할게요 ! 
from sklearn.preprocessing import StandardScaler

In [6]:
scaler = StandardScaler() # Scaling
scaler.fit_transform(data)

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [7]:
data = pd.DataFrame(scaler.fit_transform(data), columns = cancer.feature_names)

In [8]:
# target 변수 설정 
target = pd.Series(cancer.target, dtype = "category")
data['class'] = target # class 0: malignant, class 1: benign
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,0
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,0
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,0
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,0
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,0


In [9]:
target.value_counts()

1    357
0    212
dtype: int64

# Model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [11]:
X = data.drop(["class"], axis=1)
y = data["class"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [13]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(426, 30) (143, 30) (426,) (143,)


In [14]:
# Logistic Regression 
classifier = LogisticRegression()
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
# class 예측
y_pred = classifier.predict(X_test) 
y_pred

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0], dtype=int64)

In [16]:
# [0으로 분류될 확률, 1로 분류될 확률]
classifier.predict_proba(X_test)

array([[9.98643831e-01, 1.35616905e-03],
       [3.82939361e-02, 9.61706064e-01],
       [1.27346089e-03, 9.98726539e-01],
       [1.25224048e-02, 9.87477595e-01],
       [2.17694860e-04, 9.99782305e-01],
       [4.55239833e-03, 9.95447602e-01],
       [1.08848351e-04, 9.99891152e-01],
       [1.80300227e-03, 9.98196998e-01],
       [1.00269932e-04, 9.99899730e-01],
       [1.89641678e-06, 9.99998104e-01],
       [1.94716966e-01, 8.05283034e-01],
       [8.33655551e-02, 9.16634445e-01],
       [9.60301963e-06, 9.99990397e-01],
       [5.50790043e-01, 4.49209957e-01],
       [4.17018030e-01, 5.82981970e-01],
       [9.95756882e-01, 4.24311817e-03],
       [2.92975773e-03, 9.97070242e-01],
       [9.99997194e-01, 2.80606050e-06],
       [9.99923401e-01, 7.65988691e-05],
       [9.99999998e-01, 2.36955875e-09],
       [9.98793232e-01, 1.20676759e-03],
       [9.81196639e-01, 1.88033614e-02],
       [1.77915082e-02, 9.82208492e-01],
       [9.76880298e-04, 9.99023120e-01],
       [9.994534

In [17]:
# mean accuracy 
classifier.score(X_test, y_test)

0.965034965034965

#### 입력값과 출력값 비교

In [18]:
prob = [prob[1] for prob in classifier.predict_proba(X_test)] # class 1에 속할 확률 

In [19]:
ce_df = {
    'y_test' : y_test, # 실제
    'y_pred' : y_pred, # 예측
    'prob' : prob      # 1로 분류할 확률 
}

ce_df = pd.DataFrame(ce_df)

In [20]:
ce_df 

Unnamed: 0,y_test,y_pred,prob
512,0,0,1.356169e-03
457,1,1,9.617061e-01
439,1,1,9.987265e-01
298,1,1,9.874776e-01
37,1,1,9.997823e-01
...,...,...,...
236,0,0,1.125707e-12
113,1,1,9.999123e-01
527,1,1,9.984054e-01
76,1,1,9.986494e-01


In [21]:
yi = ce_df["y_test"].to_numpy()
pi = ce_df["prob"].to_numpy()

In [22]:
# cross entropy 
-np.sum((yi * np.log(pi) + (1-yi) * np.log(1-pi))) / ce_df.shape[0]

0.09744178345724831

# Metrics Evaluation
* https://scikit-learn.org/stable/modules/classes.html?highlight=metrics#module-sklearn.metrics

In [23]:
from sklearn.metrics import *

In [24]:
# f1 score
f1_score(y_pred, y_test) 

0.9723756906077348

In [25]:
confusion_matrix(y_pred, y_test)

array([[50,  2],
       [ 3, 88]], dtype=int64)