**Quantitative data** is the measurement of something—whether class size, monthly sales, or student scores. The natural way to represent these quantities is numerically (e.g., 29 students, $529,392 in sales).

In [49]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn import preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [38]:
# Variables 
features = np.array([[0.5, 0.5],[1.1, 3.4],[1.5, 20.2],[1.63, 34.4],[10.9, 3.3]])
matrix = np.array([[2, 3],[2, 3],[2, 3]])
age = np.array([[6],[12],[20],[36],[65]])


In [11]:
# Rescaling a feature
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))
scaled_feature = minmax_scale.fit_transform(features)
scaled_feature

array([[0.        , 0.        ],
       [0.05769231, 0.08554572],
       [0.09615385, 0.58112094],
       [0.10865385, 1.        ],
       [1.        , 0.08259587]])

In [12]:
# Standardizing a Feature
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(features)
print("Mean:", round(standardized.mean()))
print("Standard deviation:", standardized.std())
standardized

Mean: 0
Standard deviation: 1.0


array([[-0.67215216, -0.90948567],
       [-0.51857589, -0.68709879],
       [-0.4161917 ,  0.60121144],
       [-0.38291684,  1.69014032],
       [ 1.9898366 , -0.6947673 ]])

In [13]:
# If our data has significant outliers.
robust_scaler = preprocessing.RobustScaler()
# Transform feature
robust_scaler.fit_transform(features)

array([[-1.88679245e+00, -1.71597633e-01],
       [-7.54716981e-01,  0.00000000e+00],
       [ 0.00000000e+00,  9.94082840e-01],
       [ 2.45283019e-01,  1.83431953e+00],
       [ 1.77358491e+01, -5.91715976e-03]])

In [None]:
# Normalizing Observations
features_l2_norm = preprocessing.Normalizer(norm="l2").transform(features)
features_l1_norm = preprocessing.Normalizer(norm="l1").transform(features)
print("Sum of the first observation\'s values:",features_l1_norm[0, 0] + features_l1_norm[0, 1])
features_l1_norm

Sum of the first observation's values: 1.414213562373095


array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [22]:
# Generating Polynomial and Interaction Features
polynomial_interaction = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
polynomial_interaction.fit_transform(features)
# Restrict only interaction features
interaction = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction.fit_transform(features)

array([[ 0.5  ,  0.5  ,  0.25 ],
       [ 1.1  ,  3.4  ,  3.74 ],
       [ 1.5  , 20.2  , 30.3  ],
       [ 1.63 , 34.4  , 56.072],
       [10.9  ,  3.3  , 35.97 ]])

In [24]:
# Transforming Features
def add_ten(x: int) -> int:
    return x+10

ten_transformer = preprocessing.FunctionTransformer(add_ten)
ten_transformer.transform(matrix)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [27]:
# Detecting Outliers
# 1. assume the data is normally distributed
simulated_features, _ = make_blobs(
    n_samples=10, n_features=2, centers=1, random_state=1
)
simulated_features[0,0], simulated_features[0,1] = 1000, 1000
outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector.fit(simulated_features)
outlier_detector.predict(simulated_features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [29]:
# Using IQR
feature = simulated_features[:,0]
def indices_of_outliers(x: int) -> np.array(int):
    q1,q3 = np.percentile(x, [25,75])
    iqr = q3-q1
    lower_bound = q1 - (iqr*1.5)
    upper_bound = q3 + (iqr*1.5)
    return np.where((x>upper_bound) | (x<lower_bound))

indices_of_outliers(feature)

(array([0]),)

In [37]:
# Handling Outliers
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, "Not Outlier", "Outlier")
# Log Feature
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,Not Outlier,7.31322
1,392333,3.5,2500,Not Outlier,7.824046
2,293222,2.0,1500,Not Outlier,7.31322
3,4322032,116.0,48000,Outlier,10.778956


In [None]:
# Discretizating Features
binarizer = preprocessing.Binarizer(threshold=18)
# Two bins
binarizer.fit_transform(age)
# Multiple bins
np.digitize(age, bins=[20,30,64])
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [45]:
# Grouping Observations Using Clustering
blob_features, _ = make_blobs(
    n_samples = 50,n_features = 2,centers = 3,random_state = 1
)
df = pd.DataFrame(blob_features, columns=["Feature 1", "Features 2"])
clusterer = KMeans(3, random_state=0)
clusterer.fit(blob_features)
df['Group'] = clusterer.predict(blob_features)

df.head(5)

Unnamed: 0,Feature 1,Features 2,Group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [None]:
# Deleting Observations with Missing Values
# Keep only observations that are not (denoted by ~) missing
features[~np.isnan(features).any(axis=1)]
df.dropna()

In [48]:
# Imputing Missing Values
impute_features, _ = make_blobs(
    n_samples = 1000,n_features = 2,random_state = 1
)
scaler = preprocessing.StandardScaler()
standardized_features = scaler.fit_transform(impute_features)

# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Predict the missing values in the feature matrix
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardized_features)

print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: 1.0959262913919632


In [50]:
# Using SimpleImputer
# Create imputer using the "mean" strategy
mean_imputer = SimpleImputer(strategy="mean")
# Impute values
features_mean_imputed = mean_imputer.fit_transform(features)
# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: 0.5
