# Normalization

## Min Max Normalization

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# One feature column with five samples.
X = np.array([[10], [20], [30], [40], [100]])

# Learn the column's min and max first, then rescale every value with
# (x - min) / (max - min), which maps the column onto the default [0, 1] range.
scaler = MinMaxScaler()
scaler.fit(X)
X_transformed = scaler.transform(X)

X_transformed


array([[0.        ],
       [0.11111111],
       [0.22222222],
       [0.33333333],
       [1.        ]])

## Z-Score Normalization

In [5]:
from sklearn.preprocessing import StandardScaler

# One feature column with five samples.
X = np.array([[10], [20], [30], [40], [100]])

# Learn the column's mean and (population) standard deviation, then apply
# z = (x - mean) / std so the column ends up with mean 0 and unit variance.
scaler = StandardScaler()
scaler.fit(X)
X_transformed = scaler.transform(X)

X_transformed

array([[-0.9486833 ],
       [-0.63245553],
       [-0.31622777],
       [ 0.        ],
       [ 1.8973666 ]])

## Normalization by decimal scaling

$v' = \frac{v}{10^j}$, where $j$ is the smallest integer such that $\max(|v'|) < 1$.

In [13]:
def decimal_scaling_exponent(values):
    """Return the smallest integer j >= 0 such that max(|values|) / 10**j < 1.

    The exponent is derived numerically from the largest absolute value, so it
    is correct for float input as well as integer input (counting the
    characters of ``str(max)`` would also count the decimal point and the
    fractional digits of a float).

    Parameters
    ----------
    values : array-like of numbers
        Data to be scaled. Must be non-empty.

    Returns
    -------
    int
        Exponent j to use in ``values / 10**j``.

    Raises
    ------
    ValueError
        If the largest absolute value is NaN or infinite (no finite j exists).
    """
    max_abs = float(np.max(np.abs(values)))
    if not np.isfinite(max_abs):
        raise ValueError("decimal scaling requires finite values")

    # Apply the definition directly: keep increasing j until the largest
    # magnitude drops strictly below 1.
    j = 0
    while max_abs / 10**j >= 1:
        j += 1
    return j


X = np.array([[10], [20], [30], [40], [100]])

j = decimal_scaling_exponent(X)

# Dividing by 10**j shifts the decimal point j places: v' = v / 10**j.
X_transformed = X/10**j

X_transformed

array([[0.01],
       [0.02],
       [0.03],
       [0.04],
       [0.1 ]])

# Feature Encoding
Feature encoding converts categorical data into numeric format so that machine learning algorithms can process it.

Types of categorical features:
- Nominal: Categories with no inherent order.
- Examples: color, city, water source type
- Ordinal: Categories with a defined order.
- Examples: water quality classes (poor < fair < good)

## Label Encoding
Assigns each category a unique integer.

Pros:
- Simple and fast
- Works for ordinal data

Cons:
- Imposes an arbitrary order for nominal data
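- Not suitable for linear or distance-based models if nominal, because they treat the integer codes as magnitudes (tree-based models are more tolerant)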

In [15]:
from sklearn.preprocessing import LabelEncoder
cities = ["Kathmandu", "Pokhara", "Biratnagar"]

# The encoder numbers the distinct labels in sorted order starting at 0, so
# the codes follow alphabetical order rather than the order of the list.
le = LabelEncoder()
le.fit(cities)
encoded = le.transform(cities)

encoded

array([1, 2, 0])

## One Hot Encoding
Creates one binary column for each category.

Pros:
- No ordinal assumption
- Works well for nominal data

Cons:
- Increases dimensionality
- May slow down algorithms if many categories

In [16]:
import pandas as pd

# A single nominal column, one row per water source.
df = pd.DataFrame(data=['river', 'lake', 'well'], columns=['source'])

# Replace 'source' with one indicator column per distinct value; the new
# columns are named '<column>_<value>'.
df_encoded = pd.get_dummies(data=df, columns=['source'])

df_encoded

Unnamed: 0,source_lake,source_river,source_well
0,False,True,False
1,True,False,False
2,False,False,True


## Ordinal Encoding

In [20]:
from sklearn.preprocessing import OrdinalEncoder

# Category order is given explicitly, lowest to highest, so the codes are
# poor -> 0, fair -> 1, good -> 2 instead of an alphabetical assignment.
quality_levels = ['poor', 'fair', 'good']
quality = [['poor'], ['good'], ['fair']]

encoder = OrdinalEncoder(categories=[quality_levels])
encoder.fit(quality)
encoded = encoder.transform(quality)
print(encoded)

[[0.]
 [2.]
 [1.]]


# Optimization Strategies

## Vectorization

In [34]:
import time

# Baseline for the vectorization comparison: add 2 to every element of a
# 10,000,000-element array using an explicit Python-level loop.
# NOTE(review): no random seed is set, so the printed values differ on every run.
x = np.random.random(10000000)

print(x[:5])  # first five values before the update
# NOTE(review): time.perf_counter() is the recommended clock for measuring
# elapsed intervals; time.time() is wall-clock time and can be adjusted.
start = time.time()
# One interpreter-level index, add and store per element; the array is
# updated in place.
for i in range(x.shape[0]):
    x[i]+=2
end = time.time()
print(x[:5])  # same five values, each increased by 2
print(f"Time taken is {end - start} seconds")

[0.08360072 0.5548284  0.02672216 0.12480026 0.71360527]
[2.08360072 2.5548284  2.02672216 2.12480026 2.71360527]
Time taken is 1.4343130588531494 seconds


In [35]:
import time

# Vectorized counterpart of the explicit loop: add 2 to every element of a
# 10,000,000-element array with a single NumPy expression.
# NOTE(review): no random seed is set, so the printed values differ on every run.
x = np.random.random(10000000)

print(x[:5])  # first five values before the update
# NOTE(review): time.perf_counter() is the recommended clock for measuring
# elapsed intervals; time.time() is wall-clock time and can be adjusted.
start = time.time()
# The scalar 2 is broadcast across the whole array in one call; the result is
# a new array that is rebound to the name x.
x = x+2
end = time.time()
print(x[:5])  # the five values, each increased by 2
print(f"Time taken is {end - start} seconds")

[0.06982598 0.039798   0.94269605 0.49690145 0.30510371]
[2.06982598 2.039798   2.94269605 2.49690145 2.30510371]
Time taken is 0.0042078495025634766 seconds


In [None]:
# isinstance() requires two arguments (object, classinfo); calling it with none
# raises TypeError and breaks Restart & Run All.
# Check that the array produced by the vectorized update is a NumPy ndarray.
isinstance(x, np.ndarray)