<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

#Rescaling a Feature

In [2]:
#rescale the values of a numerical feature to be between two values
#x = (x - min(X) / (max(X) - min(X)))

from sklearn import preprocessing

feature = np.array([[-500.5],
                    [-100.1],
                    [0],
                    [100.1],
                    [900.9]])

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

scaled_feature = minmax_scale.fit_transform(feature)

scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

* fit to calculate the minimum and maximum values of the feature
* transform to rescale the feature
* fit_transform to do both operations at once


#Standardizing a Feature

In [3]:
#transform a feature to have a mean of 0 and a standard deviation of 1.
#x = (x - mean(X)) / std(X)

x = np.array([[-1000.1],
              [-200.2],
              [500.5],
              [600.6],
              [9000.9]])

scaler = preprocessing.StandardScaler()

standardized = scaler.fit_transform(x)

standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [4]:
standardized.mean()

4.4408920985006264e-17

In [5]:
standardized.std()

1.0

In [6]:
# rescale the feature using the median and quartile range

robust_scale = preprocessing.RobustScaler()

robust_scale.fit_transform(x)


array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

#Normalizing Observations

In [7]:
#rescale the feature values of observations to have unit norm (a total length of 1).
#Normalizer rescales the values on individual observations to have unit norm (the sum of their lengths is 1).
#This type of rescaling is often used when we have many equivalent features

from sklearn.preprocessing import Normalizer

features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

normalizer = Normalizer(norm="l2")

normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [9]:
#Manhattan norm (L1)

features_l1_norm = Normalizer(norm="l1").transform(features)

features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [11]:
#create polynomial and interaction features

from sklearn.preprocessing import PolynomialFeatures

features = np.array([[2, 3],
                     [2, 3],
                     [2, 3]])

polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [12]:
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

# a coffee would only be sweet if the coffee had sugar and was stirred.

#Transforming Features

In [13]:
#make a custom transformation to one or more features.

from sklearn.preprocessing import FunctionTransformer

features = np.array([[2, 3],
                     [2, 3],
                     [2, 3]])

def add_ten(x:int) -> int:
    return x + 10

ten_transformer = FunctionTransformer(add_ten)

ten_transformer.transform(features)


array([[12, 13],
       [12, 13],
       [12, 13]])

In [14]:
df = pd.DataFrame(features, columns=["features_1", "features_2"])

df.apply(add_ten)

Unnamed: 0,features_1,features_2
0,12,13
1,12,13
2,12,13


#Detecting Outliers

In [19]:
#identify extreme observations.

from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=42)

features[0, 0] = 10000
features[0, 1] = 10000

outlier_detector = EllipticEnvelope(contamination=.1)

outlier_detector.fit(features)

outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [24]:
# individual features and identify extreme values in those features using interquartile range

features, _ = make_blobs(n_samples = 10,
                         n_features = 2,
                         centers = 1,
                         random_state = 1)
features[0, 0] = 10000
features[0, 1] = 10000

features = features[:, 0]

def indicies_of_outliers(x:int) -> np.array(int):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

indicies_of_outliers(features)

(array([0]),)

#Handling Outliers

In [25]:
#identify and then reduce their impact on the data distribution.

houses = pd.DataFrame()
houses["Price"] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

houses[houses["Bathrooms"] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [26]:
houses["Outlier"] = np.where(houses["Bathrooms"] < 20, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [28]:
houses["Log_of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


# Discretizating Features

In [29]:
#You have a numerical feature and want to break it up into discrete bins.

from sklearn.preprocessing import Binarizer

age = np.array([[6],
               [12],
               [20],
               [36],
               [65]])

binarizer = Binarizer(threshold=18)

binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [31]:
np.digitize(age, bins=[20, 34, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [33]:
np.digitize(age, bins=[20, 30, 64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [34]:
np.digitize(age, bins=[18])


array([[0],
       [0],
       [1],
       [1],
       [1]])

#Grouping Observations Using Clustering

In [35]:
#cluster observations so that similar observations are grouped together.

from sklearn.cluster import KMeans

features, _ = make_blobs(n_samples = 50,
                         n_features = 2,
                         centers = 3,
                         random_state = 42)

df = pd.DataFrame(features, columns=["f_1", "f_2"])

clusterer = KMeans(3, random_state=0)

clusterer.fit(features)

df["group"] = clusterer.predict(features)

df.head(5)

Unnamed: 0,f_1,f_2,group
0,-3.110904,10.866564,2
1,-2.300334,7.054616,2
2,3.161357,1.253325,0
3,-9.499372,-6.058207,1
4,-3.417222,7.601982,2


#Deleting Observations with Missing Values

In [36]:
#delete observations containing missing values.

features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])

features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [38]:
df = pd.DataFrame(features, columns=["f_1", "f_2"])

df.dropna()

Unnamed: 0,f_1,f_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


#There are three types of missing data:

* Missing completely at random (MCAR)

* Missing at random (MAR)

* Missing not at random (MNAR)

# Imputing Missing Values

In [39]:
#You have missing values in your data and want to impute them via a generic method or prediction.

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples = 1000,
                         n_features = 2,
                         random_state = 42)

scaler = StandardScaler()

standardized_features = scaler.fit_transform(features)

true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardized_features)

print(true_value)
print(features_knn_imputed[0, 0])

-1.0337492807585151
-0.9524567720467048


In [41]:
from sklearn.impute import SimpleImputer

features, _ = make_blobs(n_samples = 1000,
                         n_features = 2,
                         random_state = 1)

scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

mean_imputer = SimpleImputer(strategy="mean")

features_mean_imputed = mean_imputer.fit_transform(features)

print(true_value)
print(features_mean_imputed[0, 0])

0.8730186113995938
-3.058372724614996
