In [39]:
# Handling Numerical Data
import pandas as pd 
import numpy as np 
from sklearn import preprocessing 
from sklearn.preprocessing import Normalizer, PolynomialFeatures, FunctionTransformer, Binarizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from fancyimpute import KNN


In [3]:
# 4.1 rescaling a feature 
feature = np.array([[-500.5], 
                    [-100.1], 
                    [0], 
                    [100.1], 
                    [900.9]])
# create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

# scale feature 
scaled_feature = minmax_scale.fit_transform(feature)

scaled_feature


array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [5]:
# 4.2 Standarding a feature 
# mean of zero and standard deviation of one 
x = np.array([[-1000.1], 
              [-200.2], 
              [500.5], 
              [600.6], 
              [900.9], 
              [100.1]])

scalar = preprocessing.StandardScaler() 
standardized = scalar.fit_transform(x)
print("Mean: ", standardized.mean())
print("Standard deviation: ", standardized.std())


Mean:  2.3129646346357427e-18
Standard deviation:  0.9999999999999998


In [7]:
# 4.3 Normalizing Observations 
# rescale the feature values of oservations to have unit norm 
features = np.array([[0.5, 0.5], 
                     [1.1, 3.4], 
                     [1.5, 20.2], 
                     [1.62, 34.4], 
                     [10.9, 3.3]])
normalizer = Normalizer(norm="l2")
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04704089, 0.99889296],
       [0.95709822, 0.28976368]])

In [9]:
# 4.4 Generating polynomial and interation features
features = np.array([[2, 3], 
                     [2, 3], 
                     [2, 3]])
polynomial_interaction = PolynomialFeatures(degree = 2, include_bias=False)
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [10]:
interaction = PolynomialFeatures(degree=2, 
                                  interaction_only = True, include_bias = False)
interaction.fit_transform(features)


array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [12]:
# 4.5 Transforming Features
features = np.array([[2, 3], 
                     [2, 3], 
                     [2, 3]])
def add_ten(x):
    return x + 10

ten_transformer = FunctionTransformer(add_ten)
ten_transformer.transform(features)


array([[12, 13],
       [12, 13],
       [12, 13]])

In [13]:
df = pd.DataFrame(features, columns=['feature_1', 'feature_2'])
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [15]:
# 4.6 Detecting Outliers 
features, _ = make_blobs(n_samples = 10, n_features = 2, 
                         centers = 1, random_state = 1)
features[0, 0] = 10000
features[0, 1] = 10000

# create detector 
outlier_detector = EllipticEnvelope(contamination = 0.1)
outlier_detector.fit(features)
outlier_detector.predict(features)


array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [16]:
feature - features[:, 0]
def indices_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

indices_of_outliers(feature)


(array([0, 4]), array([0, 0]))

In [17]:
# 4.7 Handling Outliers 
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [18]:
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [19]:
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [22]:
# 4.8 Discretizating Features
age = np.array([[6], 
                [12], 
                [20], 
                [36], 
                [65]])
age_discretizer = Binarizer(threshold=18)
age_discretizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [23]:
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [24]:
np.digitize(age, bins=[20, 30, 64], right = True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [25]:
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [27]:
# 4.9 Grouping Observations Using Clustering 
features, _ = make_blobs(n_samples = 50, n_features = 2, 
                         centers= 3, random_state = 1)

dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])
clusterer = KMeans(3, random_state = 0)

clusterer.fit(features)

dataframe['group'] = clusterer.predict(features)

dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


In [29]:
# 4.10 Deleteing ovservations with missing values 

features = np.array([[1.1, 11.1], 
                     [2.2, 22.2], 
                     [3.3, 33.3], 
                     [4.4, 44.4], 
                     [np.nan, 55]])
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [30]:
dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [31]:
# 4.11 Imputing missing values 


Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4
4,,55.0


In [36]:
features, _  = make_blobs(n_samples = 1000, n_features = 2, random_state = 1)
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)
print("Standardized features: ", standardized_features)

# replace the first features'd first value with a missing value
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Predict the missing values in the feature matrix
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(standardized_features)

# Compare true and imputed values 
print("True value: ", true_value)
print("Imputed value: ", features_knn_imputed[0, 0])


Standardized features:  [[ 0.87301861  1.31426523]
 [-0.67073178 -0.22369263]
 [ 2.1048424   1.45332359]
 ...
 [ 1.18998798  1.33439442]
 [ 1.22406396  1.27667052]
 [-0.21664919 -1.19113343]]
True value:  0.8730186113995938
Imputed value:  1.0955332713113226




In [42]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
features_mean_imputed = mean_imputer.fit_transform(features)
print("Mean imputed features: ", features_mean_imputed)

print("True Value: ", true_value)
print("Imputed Value: ", features_mean_imputed[0, 0])


Mean imputed features:  [[-3.05837272  4.48825769]
 [-8.60973869 -3.72714879]
 [ 1.37129721  5.23107449]
 ...
 [-1.91854276  4.59578307]
 [-1.79600465  4.28743568]
 [-6.97684609 -8.89498834]]
True Value:  0.8730186113995938
Imputed Value:  -3.0583727246149963
