In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load the breast cancer dataset that ships with scikit-learn.
data = load_breast_cancer()
# Show the type of the returned object (a cell displays its last expression).
type(data)

sklearn.utils.Bunch

In [3]:
# `data` is a Bunch object: it behaves like a dictionary whose keys can also
# be read as attributes (data['target'] and data.target are the same entry).
# List the keys that are available.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
# The `data` attribute (not to be confused with the `data` variable itself)
# holds the input feature matrix.
data.data.shape
# Shape is (n_samples, n_features): 569 samples with 30 features each.

(569, 30)

In [5]:
# The `target` attribute holds the class label of each sample.
data.target
# Note that the labels are just 0s and 1s (two classes).
# By convention, when there are K classes they are labeled with the integers 0..K-1.

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [6]:
# The meaning of the integer labels is not lost: `target_names` maps each
# label to its class name by position (index 0 -> 'malignant', index 1 -> 'benign').
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [7]:
# There is one target per sample: 569 targets for the 569 rows of data.data.
data.target.shape

(569,)

In [8]:
# `feature_names` gives a human-readable name for each of the 30 input features.
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [9]:
# NOTE: this one-off split is disabled; the next cell performs the split itself, inside a loop.
# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33)
# N, D = X_train.shape

In [10]:
# Standardize the features without leaking test-set statistics:
#   * fit_transform learns the scaling parameters from the training split and applies them;
#   * transform then reuses those same training parameters on the test split.

scaler = StandardScaler()

# Repeat the split + scale several times to compare the summary statistics of
# the training and test splits before and after scaling.
# Note: np.min/np.max/np.mean/np.std are called without an axis, so each
# statistic is computed over the whole matrix (all features pooled together).
for i in range(5):
    # Seed each split with the loop index: the five splits still differ from
    # one another, but every re-run of the notebook reproduces the same output.
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.33, random_state=i
    )
    print('Before scaling')
    print(f'X_train: min {np.min(X_train)}, max {np.max(X_train)}, mean {np.mean(X_train)}, std {np.std(X_train)}')
    print(f'X_test: min = {np.min(X_test)}, max {np.max(X_test)}, mean {np.mean(X_test)}, std {np.std(X_test)}')

    print('After scaling')
    # fit_transform re-fits the scaler on this iteration's training split,
    # so nothing carries over from the previous iteration.
    X_train = scaler.fit_transform(X_train)
    # The test split is only transformed, never fitted on.
    X_test = scaler.transform(X_test)
    print(f'X_train: min {np.min(X_train)}, max {np.max(X_train)}, mean {np.mean(X_train)}, std {np.std(X_train)}')
    print(f'X_test: min = {np.min(X_test)}, max {np.max(X_test)}, mean {np.mean(X_test)}, std {np.std(X_test)}')
    print()

# After the loop, X_train / X_test hold the scaled arrays of the last split
# (random_state=4) and `scaler` is fitted on that split's training data.

Before scaling
X_train: min 0.0, max 4254.0, mean 62.519100407287844, std 230.67458699046028
X_test: min = 0.0, max 3216.0, mean 60.617223755372336, std 223.39683472910164
After scaling
X_train: min -3.035435035610804, max 10.10418865051444, mean -8.703060630482416e-17, std 0.9999999999999999
X_test: min = -2.6729908889091494, max 14.938577087111412, mean -0.060951691894148445, std 0.9770635861161499

Before scaling
X_train: min 0.0, max 3432.0, mean 60.93568072609799, std 221.9971169984755
X_test: min = 0.0, max 4254.0, mean 63.826175343315604, std 240.5484002862403
After scaling
X_train: min -3.0775186660488862, max 12.48515096025033, mean -1.7406121260964835e-17, std 1.0
X_test: min = -2.3114815137730287, max 11.88654533009795, mean 0.03383365395372047, std 1.0379765351714945

Before scaling
X_train: min 0.0, max 3234.0, mean 61.70971523923885, std 228.15566207912755
X_test: min = 0.0, max 4254.0, mean 62.257520292748225, std 228.58395176530632
After scaling
X_train: min -3.01632763