In [19]:
import tensorflow as tf
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as sk_metrics
import tempfile
import os
import numpy as np
from sklearn.preprocessing import OneHotEncoder


# Default figure size, [width, height], applied to every matplotlib figure.
matplotlib.rcParams['figure.figsize'] = [9, 6]

# Seed both random sources used by this notebook. tf.random.set_seed only
# controls TensorFlow's generators; the train/test split is drawn with
# np.random, so NumPy needs its own seed for Restart & Run All to reproduce
# the same split.
tf.random.set_seed(22)
np.random.seed(22)

In [20]:
# Download location of the UCI breast-cancer-wisconsin `wdbc.data` file.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# The ten base measurements. Each one appears three times in the file, once
# per suffix below. NOTE(review): 'concave_poinits' and the 'ste' suffix look
# like misspellings, but they are kept as-is because the outputs recorded in
# this notebook use these exact labels.
features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_poinits', 'symmetry',
    'fractal_dimension',
]

# Two leading columns, followed by every base measurement for the first
# suffix, then every measurement for the second suffix, and so on.
column_names = ['id', 'diagnosis'] + [
    f"{feature}_{attr}"
    for attr in ['mean', 'ste', 'largest']
    for feature in features
]

In [21]:
# Read the file and label its columns with column_names. header=None states
# explicitly what pandas already does when `names` is supplied: the first
# line of the file is treated as data, not as a header.
dataset = pd.read_csv(url, header=None, names=column_names)

# Number of missing values in each column.
dataset.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave_poinits_mean         0
symmetry_mean                0
fractal_dimension_mean       0
radius_ste                   0
texture_ste                  0
perimeter_ste                0
area_ste                     0
smoothness_ste               0
compactness_ste              0
concavity_ste                0
concave_poinits_ste          0
symmetry_ste                 0
fractal_dimension_ste        0
radius_largest               0
texture_largest              0
perimeter_largest            0
area_largest                 0
smoothness_largest           0
compactness_largest          0
concavity_largest            0
concave_poinits_largest      0
symmetry_largest             0
fractal_dimension_largest    0
dtype: i

In [22]:
# Preview the first five rows (five is the default for head()).
dataset.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_poinits_mean,...,radius_largest,texture_largest,perimeter_largest,area_largest,smoothness_largest,compactness_largest,concavity_largest,concave_poinits_largest,symmetry_largest,fractal_dimension_largest
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [23]:
def split_dataset(dataset, test_ratio=0.25, seed=None):
    """Randomly partition ``dataset`` into a training part and a testing part.

    One uniform number in [0, 1) is drawn per row; rows whose number is below
    ``test_ratio`` go to the test part, all others to the training part. The
    split sizes are therefore only approximately ``test_ratio`` of the data.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (the notebook passes a pandas DataFrame).
        test_ratio: Probability that any given row lands in the test part.
        seed: Optional seed for a private random generator, making the split
            reproducible regardless of NumPy's global state. When ``None``
            (the default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        A ``(train, test)`` tuple of the two disjoint subsets.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy path: follows np.random.seed(...) if the caller set one.
        draws = np.random.rand(n_rows)
    else:
        # Private generator, so the result depends only on `seed`.
        draws = np.random.default_rng(seed).random(n_rows)
    test = draws < test_ratio
    return dataset[~test], dataset[test]

In [24]:
# Hold out roughly a quarter of the rows for testing, then report how many
# rows ended up on each side of the split.
train_data, test_data = split_dataset(dataset, test_ratio=0.25)
print("{} training, {} testing".format(len(train_data), len(test_data)))

424 training, 145 testing


In [25]:
# Preview the first five training rows (five is the default for head()).
train_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_poinits_mean,...,radius_largest,texture_largest,perimeter_largest,area_largest,smoothness_largest,compactness_largest,concavity_largest,concave_poinits_largest,symmetry_largest,fractal_dimension_largest
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
6,844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072


### 데이터 분류
여러 특성(feature)을 이용해서 diagnosis를 판별해야 하기 때문에, 데이터를 레이블(diagnosis)과 나머지 특성 데이터로 나눔

In [26]:
# Column 0 is the sample id and column 1 the diagnosis label; every column
# from position 2 onwards is a feature.
x_train = train_data.iloc[:, 2:]
y_train = train_data.iloc[:, 1]
x_test = test_data.iloc[:, 2:]
y_test = test_data.iloc[:, 1]

In [27]:
# Class balance of the training labels.
train_data.loc[:, 'diagnosis'].value_counts()

diagnosis
B    272
M    152
Name: count, dtype: int64

In [28]:
# Map the diagnosis letters to numbers: 'B' -> 0, 'M' -> 1.
y_train = y_train.map({'B': 0, 'M': 1})
y_test = y_test.map({'B': 0, 'M': 1})
print(y_train)

# Convert features and labels to float32 tensors for TensorFlow.
x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)
y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

1      1
3      1
6      1
7      1
8      1
      ..
562    1
563    1
564    1
565    1
566    1
Name: diagnosis, Length: 424, dtype: int64


In [29]:
class Normalize(tf.Module):
    """Standardise data using statistics captured from a reference tensor.

    The mean and standard deviation of the tensor given to the constructor
    are computed along axis 0 and stored. ``norm`` and ``unnorm`` then apply
    or undo the z-score transform with those stored statistics.
    """

    def __init__(self, x):
        """Store the axis-0 mean and standard deviation of ``x``."""
        # Mean along axis 0.
        self.mean = tf.Variable(tf.math.reduce_mean(x, axis=0))
        # Standard deviation along axis 0. A constant feature has a standard
        # deviation of 0, which would make norm() compute 0/0 = NaN. Such
        # entries are replaced by 1 so a constant feature normalises to 0
        # and unnorm() still restores the original value.
        std = tf.math.reduce_std(x, axis=0)
        safe_std = tf.where(tf.equal(std, 0.0), tf.ones_like(std), std)
        self.std = tf.Variable(safe_std)

    def norm(self, x):
        """Return the z-score of ``x``: z = (x - mean) / std."""
        return (x - self.mean)/self.std

    def unnorm(self, x):
        """Invert ``norm``: x = z * std + mean."""
        return (x * self.std) + self.mean

# Take the statistics from the training features only, then apply those same
# statistics to both the training and the testing features.
norm_x = Normalize(x_train)
x_train_norm = norm_x.norm(x_train)
x_test_norm = norm_x.norm(x_test)

### tf.math.reduce_mean 함수 예시

In [30]:
# Averaging along axis 0 collapses the rows and leaves one mean per column.
example = [
    [1., 1.],
    [2., 2.],
]
tf.math.reduce_mean(example, axis=0)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.5, 1.5], dtype=float32)>

### tf.Variable 함수
역전파 과정에서 변수에 대한 기울기를 계산하고 업데이트할 수 있는 변수 생성

In [31]:
# TensorFlow is already imported as `tf` in the notebook's first cell, so no
# import is repeated here.
li = [1, 2, 3, 4]
# Every input is an integer, so reduce_mean returns an integer result: the
# recorded output is the int32 value 2, not 2.5.
mean = tf.Variable(tf.math.reduce_mean(li))
mean

<tf.Variable 'Variable:0' shape=() dtype=int32, numpy=2>

In [32]:
class LogisticRegression(tf.Module):
    """Logistic-regression model whose parameters are created on first use."""

    def __init__(self):
        # No parameters exist yet: the weight shape depends on the width of
        # the first input, so creation is deferred to the first call.
        self.built = False

    def __call__(self, x, train=True):
        """Apply the model to ``x``.

        Args:
            x: Input whose last dimension is the number of features.
            train: When true, return the raw linear output (logits).
                When false, return the sigmoid of that output.

        Returns:
            The model output with the trailing size-1 axis squeezed away.
        """
        if not self.built:
            # First call: draw the initial weights, then the initial bias,
            # from tf.random.uniform and wrap them in variables.
            n_features = x.shape[-1]
            initial_w = tf.random.uniform(shape=[n_features, 1], seed=22)
            initial_b = tf.random.uniform(shape=[], seed=22)
            self.w, self.b = tf.Variable(initial_w), tf.Variable(initial_b)
            self.built = True

        # Linear part x @ w + b, then drop axis 1 (size 1, from the weight
        # matrix having a single column).
        logits = tf.squeeze(tf.add(tf.matmul(x, self.w), self.b), axis=1)
        return logits if train else tf.sigmoid(logits)

In [33]:
# Smoke-test the untrained model on the first five normalised training rows.
# train=False makes the call return sigmoid outputs instead of raw logits.
log_reg = LogisticRegression()
y_pred = log_reg(x_train_norm[:5], train=False)
y_pred.numpy()

array([0.81977856, 1.        , 0.92273366, 0.999811  , 0.999987  ],
      dtype=float32)