In [1]:
import sklearn
import pandas as pd 

# Data Standardization
$$z = \frac{x - \mu}{\sigma}$$

For each value $x$, we subtract the overall mean of the data, $\mu$, 
then divide by the overall standard deviation, $\sigma$.

The new value, $z$, is the standardized data value. After this transformation the data has mean 0 and standard deviation 1.

In [None]:
from sklearn.preprocessing import scale

# Load the raw table and keep only the rows that describe pizza.
data = pd.read_csv('data.csv')
pizza_data = data[data['food_type'] == 'pizza']

# scale() only accepts numeric input, and pizza_data still carries the string
# 'food_type' column used by the filter above. Standardize just the numeric
# feature columns so the call does not fail on the text column.
pizza_features = pizza_data.select_dtypes(include='number')
col_standardized = scale(pizza_features)

# Sanity check: after standardization each column should have mean ~0 and
# standard deviation ~1. Rounding hides floating-point noise in the means.
col_means = col_standardized.mean(axis=0).round(decimals=3)

col_std = col_standardized.std(axis=0)

# Range Scaling 

Range scaling compresses the data into a fixed range. 
This gives a better view of the data in terms of proportions or percentages, based on the minimum and maximum values in the data.

x_prop = (x - d_min) / (d_max - d_min)

- For a given value x, we compute its proportion with respect to the minimum and maximum of the data (d_min, d_max).
- The formula above computes this proportion, x_prop.
- We then use the proportion to scale the value into the specified range [r_min, r_max].

The formula below calculates the new scaled value, x_scale:

x_scale = x_prop * (r_max - r_min) + r_min


In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): MinMaxScaler expects numeric features; `data` is loaded in an
# earlier cell and is filtered there on a string column -- confirm the input
# is numeric-only before running.

# Default settings: every feature is rescaled into the range [0, 1].
default_scaler = MinMaxScaler()
transformed = default_scaler.fit(data).transform(data)

# Custom range: every feature is compressed into [-2, 3] instead.
custom_scaler = MinMaxScaler(feature_range=(-2, 3))
transformed = custom_scaler.fit(data).transform(data)

In [None]:
from sklearn.preprocessing import MinMaxScaler

new_data = data

# One-step form: learn the per-feature min/max from new_data and rescale it.
default_scaler = MinMaxScaler()
transformed = default_scaler.fit_transform(new_data)

# Two-step form: learn the min/max from `data`, then apply that fitted scaling
# to `new_data`. The scaler is re-created here so the fit starts from a fresh
# object (the original assigned it to the misspelled name `default_Scaler`,
# which left the previous scaler in use).
default_scaler = MinMaxScaler()
default_scaler.fit(data)
transformed = default_scaler.transform(new_data)

# Robust Scaling

Data standardization uses each feature's mean and standard deviation, while range scaling uses each feature's maximum and minimum values, so both are susceptible to being skewed by outlier values.

We can scale the data robustly by using its median and interquartile range (IQR).
Since the median and IQR are percentile measurements of the data, they are not affected by outliers. To scale, subtract the median from each data value, then divide by the IQR.

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the data, then apply the learned median/IQR scaling to
# the same data.
robust_scaler = RobustScaler()
robust_scaler.fit(data)
transformed = robust_scaler.transform(data)

# L2 Normalization

- Use L2 normalization when we wish to scale individual data observations (rows) rather than features,
  e.g., when clustering data or calculating cosine similarity scores.

L2 normalization applied to a particular row of data array will:
- divide each value in that row by the row's L2 norm
- L2 Norm: square root of the sum of squared values for the row

![](L2Norm.png)

In [None]:
from sklearn.preprocessing import Normalizer

# Normalizer applies L2 normalization by default: each row is divided by its
# own L2 norm.
normalizer = Normalizer()
normalizer.fit(data)
transformed = normalizer.transform(data)

# Data imputation Methods

- Some datasets contain missing values.
- When only a small amount of data is missing, we can perform data imputation: substituting the missing values with other values.

4 methods of imputing, each replacing missing values with:
- the feature's mean value
- the feature's median value
- the feature's most frequent value
- a constant

In [None]:
from sklearn.impute import SimpleImputer

# Replace each missing entry with the mean of its column.
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit(data)
transformed = imp_mean.transform(data)

In [None]:
# Replace each missing entry with the median of its column.
imp_median = SimpleImputer(strategy='median')
imp_median.fit(data)
transformed = imp_median.transform(data)

# Replace each missing entry with the most frequent value of its column.
imp_frequent = SimpleImputer(strategy='most_frequent')
imp_frequent.fit(data)
transformed = imp_frequent.transform(data)

In [None]:
# Replace every missing entry with the fixed placeholder value -1.
imp_constant = SimpleImputer(fill_value=-1, strategy='constant')
imp_constant.fit(data)
transformed = imp_constant.transform(data)

# Dimensionality Reduction

- When data contains correlated numeric features, we can perform principal component analysis (PCA) for dimensionality reduction.
- PCA extracts the principal components of the dataset: an uncorrelated set of latent variables that capture most of the information in the original dataset.

In [None]:
from sklearn.decomposition import PCA 

# With no arguments, PCA keeps every component, i.e.
# min(n_samples, n_features) of them.
pca_obj = PCA()
pc = pca_obj.fit_transform(data).round(3)

# Keep only the first 3 principal components. The transform must come from
# pca_obj2; the original called pca_obj again, so the reduced projection was
# never computed.
pca_obj2 = PCA(n_components=3)
pc = pca_obj2.fit_transform(data).round(3)

def pca_data(data, n_components):
    """Reduce `data` with PCA and return the component scores.

    Args:
        data: Input passed unchanged to `PCA.fit_transform`.
        n_components: Forwarded to the `PCA` constructor as `n_components`.

    Returns:
        The result of fitting a new `PCA` on `data` and transforming it.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(data)

In [2]:
from sklearn.datasets import load_breast_cancer

bc = load_breast_cancer()

# Dataset overview, printed in order: feature-matrix shape, label vector,
# label-vector shape, and the label names.
for summary in (bc.data.shape, bc.target, bc.target.shape, bc.target_names):
    print(summary)

# Split the feature rows by class label: 0 is treated as malignant and 1 as
# benign.
malignant = bc.data[bc.target == 0]
benign = bc.data[bc.target == 1]

print(malignant.shape)
print(benign.shape)

(569, 30)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 