# Rescaling a feature

In [5]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn import preprocessing

In [6]:
dataset = load_diabetes()
import pandas as pd
df = pd.DataFrame(dataset.data, columns = dataset.feature_names)
df["Y"] = dataset.target

In [7]:
df.head(3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0


In [8]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))

In [56]:
scaled_df = min_max_scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns = df.columns)
scaled_df.head(3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.666667,1.0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394,0.392523
1,0.483333,0.0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222443,0.166667,0.155763
2,0.883333,1.0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496584,0.409091,0.361371


In [13]:
var_names = {}
count = 0
for var in dataset.feature_names:
    var_names[count] = var
    count += 1

In [14]:
var_names

{0: 'age',
 1: 'sex',
 2: 'bmi',
 3: 'bp',
 4: 's1',
 5: 's2',
 6: 's3',
 7: 's4',
 8: 's5',
 9: 's6'}

In [15]:
var_names[10] = "Y"

In [16]:
var_names

{0: 'age',
 1: 'sex',
 2: 'bmi',
 3: 'bp',
 4: 's1',
 5: 's2',
 6: 's3',
 7: 's4',
 8: 's5',
 9: 's6',
 10: 'Y'}

In [19]:
scaled_df.rename(columns = var_names, inplace = True)
scaled_df.head(3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.666667,1.0,0.582645,0.549296,0.294118,0.256972,0.207792,0.282087,0.562217,0.439394,0.392523
1,0.483333,0.0,0.14876,0.352113,0.421569,0.306773,0.623377,0.141044,0.222443,0.166667,0.155763
2,0.883333,1.0,0.516529,0.43662,0.289216,0.258964,0.246753,0.282087,0.496584,0.409091,0.361371


In [29]:
scaled_df.min(), scaled_df.max()

(age    0.0
 sex    0.0
 bmi    0.0
 bp     0.0
 s1     0.0
 s2     0.0
 s3     0.0
 s4     0.0
 s5     0.0
 s6     0.0
 Y      0.0
 dtype: float64,
 age    1.0
 sex    1.0
 bmi    1.0
 bp     1.0
 s1     1.0
 s2     1.0
 s3     1.0
 s4     1.0
 s5     1.0
 s6     1.0
 Y      1.0
 dtype: float64)

# Standardizing a feature

**Mean 0, std 1**

In [55]:
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(df)
standardized = pd.DataFrame(standardized, columns = df.columns)
standardized.head(3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.8005,1.065488,1.297088,0.45984,-0.929746,-0.732065,-0.912451,-0.054499,0.418551,-0.370989,-0.014719
1,-0.039567,-0.938537,-1.08218,-0.553511,-0.177624,-0.402886,1.564414,-0.830301,-1.436551,-1.938479,-1.001659
2,1.793307,1.065488,0.934533,-0.119218,-0.958674,-0.718897,-0.680245,-0.054499,0.060207,-0.545154,-0.14458


In [32]:
standardized.min(), standardized.max()

(0    -2.254290
 1    -0.938537
 2    -1.897929
 3    -2.363066
 4    -2.665411
 5    -2.430626
 6    -2.150883
 7    -1.606102
 8    -2.651046
 9    -2.896390
 10   -1.650961
 dtype: float64,
 0     2.327895
 1     1.065488
 2     3.585718
 3     2.776071
 4     3.235851
 5     4.179278
 6     3.809072
 7     3.894331
 8     2.808758
 9     2.851075
 10    2.517559
 dtype: float64)

In [36]:
round(standardized.mean()), round(standardized.std())

(0    -0.0
 1    -0.0
 2    -0.0
 3    -0.0
 4     0.0
 5    -0.0
 6    -0.0
 7    -0.0
 8    -0.0
 9    -0.0
 10   -0.0
 dtype: float64,
 0     1.0
 1     1.0
 2     1.0
 3     1.0
 4     1.0
 5     1.0
 6     1.0
 7     1.0
 8     1.0
 9     1.0
 10    1.0
 dtype: float64)

**If the dataset has significant outliers, it can negatively impact the standardization by affecting the feature's mean and variance. It is often helpful to instead rescale the feature using the median and quartile range. In scikit-learn use RobustScaler to do this.**

In [52]:
robust_scaler = preprocessing.RobustScaler()
scaled = robust_scaler.fit_transform(df)
scaled = pd.DataFrame(scaled)
scaled.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.433735,1.0,1.053498,0.380952,-0.637363,-0.514954,-0.571429,0.0,0.332756,-0.271186,0.084337
1,-0.096386,0.0,-0.674897,-0.285714,-0.065934,-0.254876,1.257143,-0.5,-1.010659,-1.491525,-0.526104
2,1.060241,1.0,0.790123,0.0,-0.659341,-0.504551,-0.4,0.0,0.073253,-0.40678,0.004016


In [40]:
scaled.min(), scaled.max()

(0    -1.493976
 1     0.000000
 2    -1.267490
 3    -1.476190
 4    -1.956044
 5    -1.856957
 6    -1.485714
 7    -1.000000
 8    -1.890164
 9    -2.237288
 10   -0.927711
 dtype: float64,
 0     1.397590
 1     1.000000
 2     2.716049
 3     1.904762
 4     2.527473
 5     3.365410
 6     2.914286
 7     2.545000
 8     2.063679
 9     2.237288
 10    1.650602
 dtype: float64)

In [42]:
round(scaled.mean()), round(scaled.std())

(0    -0.0
 1     0.0
 2     0.0
 3     0.0
 4     0.0
 5     0.0
 6     0.0
 7     0.0
 8     0.0
 9     0.0
 10    0.0
 dtype: float64,
 0     1.0
 1     0.0
 2     1.0
 3     1.0
 4     1.0
 5     1.0
 6     1.0
 7     1.0
 8     1.0
 9     1.0
 10    1.0
 dtype: float64)

# Normalizing observations

**Rescale the feature values of observations to have unit norm (a total length of 1).**

In [43]:
from sklearn.preprocessing import Normalizer

In [76]:
normalizer = Normalizer(norm = "l2")

In [79]:
normalized_df = normalizer.transform(df)
normalized_df = pd.DataFrame(normalized_df, columns = df.columns)
normalized_df.head(3)



Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
0,0.000252,0.000336,0.000409,0.000145,-0.000293,-0.000231,-0.000287,-1.7e-05,0.000132,-0.000117,1.0
1,-2.5e-05,-0.000595,-0.000686,-0.000351,-0.000113,-0.000256,0.000992,-0.000527,-0.000911,-0.001229,0.999998
2,0.000605,0.000359,0.000315,-4e-05,-0.000323,-0.000243,-0.000229,-1.8e-05,2e-05,-0.000184,1.0


In [80]:
#if the features and target names are rescaled then when converting the rescaled data to a dataframe use columns = df.columns to get the feature names

In [82]:
l1 = Normalizer(norm = "l1").transform(df)
l1 = pd.DataFrame(l1)
l1.head(3)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.000252,0.000335,0.000408,0.000145,-0.000292,-0.00023,-0.000287,-1.7e-05,0.000132,-0.000117,0.997787
1,-2.5e-05,-0.000592,-0.000682,-0.000349,-0.000112,-0.000254,0.000987,-0.000524,-0.000906,-0.001222,0.994347
2,0.000604,0.000359,0.000315,-4e-05,-0.000323,-0.000242,-0.000229,-1.8e-05,2e-05,-0.000183,0.997668


# Generating polynomial and interaction features

In [83]:
from sklearn.preprocessing import PolynomialFeatures

In [91]:
polynomial_interaction = PolynomialFeatures(degree = 2, include_bias = False)
pd.DataFrame(polynomial_interaction.fit_transform(df)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,...,7e-06,-5.2e-05,4.6e-05,-0.391432,0.000396,-0.000351,3.006172,0.000311,-2.664565,22801.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,...,0.00156,0.002699,0.003641,-2.962004,0.004669,0.0063,-5.124731,0.008502,-6.915304,5625.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,...,7e-06,-7e-06,6.7e-05,-0.365509,8e-06,-7.4e-05,0.403792,0.000672,-3.656178,19881.0


**The degree parameter determines the maximum degree of the polynomial. For example. degree = 2 will create new features raised to the second power: x1, x2, x1^2, x2^2**

**with degree 3: x1, x2, x1^2, x2^2, x1^3, x2^3**

In [92]:
polynomial_interaction = PolynomialFeatures(degree = 3, include_bias = False)
pd.DataFrame(polynomial_interaction.fit_transform(df)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,353,354,355,356,357,358,359,360,361,362
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,...,7.890607e-06,-6.993957e-06,0.059848,6e-06,-0.053047,453.931904,-5e-06,0.047019,-402.3493,3442951.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,...,-0.0003190284,-0.0004304965,0.350172,-0.000581,0.472521,-384.354808,-0.000784,0.637619,-518.647779,421875.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,...,2.34863e-08,-2.126594e-07,0.001156,2e-06,-0.01047,56.934622,-1.7e-05,0.094806,-515.521069,2803221.0


**by default PolynomialFeatures includes interaction features: x1x2.**

In [101]:
interaction = PolynomialFeatures(degree = 2, include_bias = False, interaction_only = True)
pd.DataFrame(interaction.fit_transform(df)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,...,0.000113,-0.000864,0.000766,-6.553528,-5.2e-05,4.6e-05,-0.391432,-0.000351,3.006172,-2.664565
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,...,-0.002939,-0.005085,-0.006861,5.580867,0.002699,0.003641,-2.962004,0.0063,-5.124731,-6.915304
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,...,8.4e-05,-9.3e-05,0.000839,-4.562186,-7e-06,6.7e-05,-0.365509,-7.4e-05,0.403792,-3.656178


**Polynomial features often created when we want to include the notion that there exists a nonlinear relationship between the features and the target. For example, we might suspect that the effect of age on the probability of having a major medical condition is not constant over time but increases as age increases. We can encode that nonconstant effect in a feature, x, by generating that feature's higher order forms (x^2, x^3 etc.). Additionally, often we run into situations where the effect of one feature is dependent on another feature. A simple example would be if we were trying to predict whether or not our coffee was sweet and we had two features: 1. whether or not the coffee was stirred and 2. if we added sugar. Individually, each feature does not predict coffee sweetness, but the combination of their effects does. That is a coffee would only be sweet if the coffee has sugar and was stirred. The effects of each feature on the target are dependent on each other. We can encode that relationship by including an interaction feature that is the product of the individual features.**

# Transforming features

In [1]:
from sklearn.preprocessing import FunctionTransformer

In [3]:
import numpy as np
features = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

In [4]:
def add_ten(x):
    return x+10

In [5]:
ten_transformer = FunctionTransformer(add_ten)
ten_transformer.transform(features)

array([[11, 12, 13],
       [14, 15, 16],
       [17, 18, 19]])

In [6]:
#same transformation using apply
import pandas as pd
df = pd.DataFrame(features, columns = ["feature_1", "feature_2", "feature_3"])
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2,feature_3
0,11,12,13
1,14,15,16
2,17,18,19


# Detecting outliers

In [7]:
from sklearn.covariance import EllipticEnvelope

In [8]:
from sklearn.datasets import make_blobs

In [9]:
features, _ = make_blobs(n_samples = 10, n_features = 2, centers = 1, random_state = 0)
#replace the first observation values with extreme values
features[0, 0] = 10000
features[0, 1] = 10000

In [10]:
#create detector
outlier_detector = EllipticEnvelope(contamination = 0.1)
outlier_detector.fit(features)
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

# Handling outliers

In [14]:
houses = pd.DataFrame()
houses["Price"] = [292902, 202020, 10292, 209102, 10292992]
houses["Bathrooms"] = [2, 3, 5, 2, 50]
houses["Square_Feet"] = [1000, 2002, 1002, 12022, 59403]

In [17]:
houses[houses["Bathrooms"] > 5] #first technique: drop the outliers

Unnamed: 0,Price,Bathrooms,Square_Feet
4,10292992,50,59403


In [21]:
houses["Outlier"] = np.where(houses["Bathrooms"] > 5, 1, 0) #mark them as outliers and include it as a feature

In [22]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,292902,2,1000,0
1,202020,3,2002,0
2,10292,5,1002,0
3,209102,2,12022,0
4,10292992,50,59403,1


In [25]:
houses["Log_of_sq_feet"] = [np.log(x) for x in houses["Square_Feet"]] #transform the feature to dampen the effect of the outlier

In [26]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_sq_feet
0,292902,2,1000,0,6.907755
1,202020,3,2002,0,7.601902
2,10292,5,1002,0,6.909753
3,209102,2,12022,0,9.394494
4,10292992,50,59403,1,10.9921


# Discretizing features

In [29]:
#break a numerical feature into discrete bins
from sklearn.preprocessing import Binarizer

In [30]:
age = np.array([
    [9],
    [3],
    [20],
    [59],
    [20]
])

In [37]:
binarizer = Binarizer(threshold = 10) #first way, threshold 10 means upto 9 it will be 0 and the rest will be 1

In [38]:
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [41]:
np.digitize(age, bins = [5, 10, 100]) #break up numerical features according to multiple thresholds

array([[1],
       [0],
       [2],
       [2],
       [2]], dtype=int64)

In [54]:
np.digitize(age, bins = [5, 20, 100]) #without right = True 20 is added to 2

array([[1],
       [0],
       [2],
       [2],
       [2]], dtype=int64)

In [55]:
np.digitize(age, bins = [5, 20, 100], right = True) #with right = True 20 is added to 1

array([[1],
       [0],
       [1],
       [2],
       [1]], dtype=int64)

**Discretization can be a fruitful strategy when we have reason to believe that a numerical feature should behave more like a categorical feature.**

# Grouping observations using clustering

In [56]:
from sklearn.cluster import KMeans

In [57]:
features, _ = make_blobs(n_samples = 50, n_features = 2, centers = 3, random_state = 0)

In [58]:
dataframe = pd.DataFrame(features, columns = ["feature_1", "feature_2"])

In [59]:
dataframe.head(3)

Unnamed: 0,feature_1,feature_2
0,1.926358,4.15243
1,1.420945,0.534922
2,1.420133,4.637462


In [60]:
clusterer = KMeans(3, random_state = 0)
clusterer.fit(features)

KMeans(n_clusters=3, random_state=0)

In [61]:
dataframe["group"] = clusterer.predict(features)

In [63]:
dataframe.head() #clustering can be used in preprocessing

Unnamed: 0,feature_1,feature_2,group
0,1.926358,4.15243,2
1,1.420945,0.534922,0
2,1.420133,4.637462,2
3,1.289338,3.449692,2
4,-1.930081,4.140327,1


# Deleting observations with missing values

In [67]:
features = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, np.nan],
    [11, 12, np.nan, 14, np.nan],
    [16, 17, 18, 19, 100]
])

In [68]:
features[~np.isnan(features).any(axis = 1)] #keep only the observations that are not (denoted by ~) missing

array([[  1.,   2.,   3.,   4.,   5.],
       [ 16.,  17.,  18.,  19., 100.]])

In [72]:
dataframe = pd.DataFrame(features, columns = ["feature_1", "feature_2", "feature_3", "feature_4", "feature_5"])

In [73]:
dataframe

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5
0,1.0,2.0,3.0,4.0,5.0
1,6.0,7.0,8.0,9.0,
2,11.0,12.0,,14.0,
3,16.0,17.0,18.0,19.0,100.0


In [74]:
dataframe.dropna()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5
0,1.0,2.0,3.0,4.0,5.0
3,16.0,17.0,18.0,19.0,100.0


# Imputing missing values

In [80]:
#from fancyimpute import KNN
#from sklearn.preprocessing import StandardScaler
#features, _ = make_blobs(n_samples = 1000, n_features = 2, random_state = 10)
#scaler = StandardScaler()
#standardized_features = scaler.fit_transform(features)
#true_value = standardized_features[0, 0]
#standardized_features[0, 0] = np.nan
#features_knn_imputed = KNN(k = 5, verbose = 0).complete(standardized_features)
#print("True value: ", true_value)
#print("Imputed value: ", features_knn_imputed[0, 0])
#sklearn Imputer can also be used but it will give worse results than knn
#if we use imputation it's good idea to make a binary feature indicating whether or not the observation contains an imputed value