In this exercise we will practice the most common methodologies for feature processing and feature engineering.
# Features Processing

In [None]:
import pandas as pd
import numpy as np

## Handling Outliers
Common ways to identify the presence of outliers are to visualize the data or to calculate summary statistics.

In [None]:
# Box plot of a single feature: points drawn beyond the whiskers are outlier candidates
import seaborn as sns

feature_values = data['column_name']
sns.boxplot(x=feature_values)

In [None]:
# Discovering outliers with the z-score
from scipy import stats
import numpy as np

# Absolute z-score of every value (the double brackets select a one-column DataFrame)
z = np.abs(stats.zscore(data[["column_name"]]))
# Samples whose |z-score| exceeds this many standard deviations are flagged
threshold = 3
# Tuple of (row positions, column positions) of the flagged samples
outliers_indexes = np.where(z > threshold)

In [None]:
# Discovering outliers with the IQR score (Tukey's fences)
# Q1 and Q3 are the first and third quartiles, i.e. the 25th and 75th percentiles
Q1 = data["column_name"].quantile(0.25)
Q3 = data["column_name"].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# Boolean mask: True for the samples lying more than 1.5 * IQR below Q1 or above Q3
logical_index_outliers = (data["column_name"] < (Q1 - 1.5 * IQR)) | (data["column_name"] > (Q3 + 1.5 * IQR))

### Outlier detection with standard deviation

In [None]:
# Remove the rows lying k standard deviations or more away from the column mean
k = 3
column_mean = data['column'].mean()
column_std = data['column'].std()
upper_lim = column_mean + column_std * k
lower_lim = column_mean - column_std * k

# Keep only the rows strictly inside (lower_lim, upper_lim)
below_upper = data['column'] < upper_lim
above_lower = data['column'] > lower_lim
data = data[below_upper & above_lower]

### Outlier detection with percentiles

In [None]:
# Remove the rows falling outside the 5th-95th percentile band
# (a list of quantiles returns both limits in the requested order)
lower_lim, upper_lim = data['column'].quantile([.05, .95])

# Strict inequalities: rows equal to either limit are dropped as well
inside_band = (data['column'] < upper_lim) & (data['column'] > lower_lim)
data = data[inside_band]

### Cap instead of Drop

In [None]:
# Capping the outliers at the 5th and 95th percentiles instead of dropping the rows
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)

# Values beyond a limit are replaced by the limit itself; .loc with a boolean row mask
# and the column label writes into `data` directly
data.loc[data['column'] > upper_lim, 'column'] = upper_lim
data.loc[data['column'] < lower_lim, 'column'] = lower_lim

## Handling Missing Data


In [None]:
# NOTE: in a notebook only the last expression of a cell is rendered (data.info() prints by itself);
# move the alternative you want to inspect to the last line or to its own cell.

# shows the type of data and the existence of missing values
data.info()
# or show the summary statistics of the complete dataset for all features in the dataset
data.describe(include="all")
# or count the missing values per feature ...
data.isnull().sum()
# ... and over the whole (flattened) dataset: sum the per-feature counts
data.isnull().sum().sum()

### Eliminate Data

In [None]:
threshold = 0.7

# Keep only the columns whose fraction of missing values is below the threshold
column_missing_rate = data.isnull().mean()
kept_columns = data.columns[column_missing_rate < threshold]
data = data[kept_columns]

# Keep only the rows whose fraction of missing values is below the threshold
row_missing_rate = data.isnull().mean(axis=1)
rows_to_keep = row_missing_rate < threshold
data = data.loc[rows_to_keep]

### Numerical Imputation

In [None]:
# NOTE: the strategies below are alternatives - pick one. Once the first has run
# there are no missing values left for the following ones to fill.

# Filling all missing values with 0
data = data.fillna(0)

# Filling missing values with the mean or the median of each numeric column
# (numeric_only=True skips non-numeric columns instead of raising on them)
data = data.fillna(data.mean(numeric_only=True))
data = data.fillna(data.median(numeric_only=True))

# Filling missing values with the mode of each column.
# DataFrame.mode() returns a DataFrame (one row per tied mode), so take its first row to get
# one fill value per column; passing the DataFrame itself to fillna aligns on the row index
# and would only fill the rows whose labels appear in the mode table.
data = data.fillna(data.mode().iloc[0])

# Filling with linear interpolation (at most 2 consecutive missing values are filled).
# The result is assigned back: inplace=True on a column selection is chained assignment
# and may update a temporary copy instead of `data`.
data['column_name'] = data['column_name'].interpolate(method='linear', limit=2)

### Categorical Imputation

In [None]:
# Filling missing values on a categorical column with its most frequent value.
# value_counts() excludes missing values by default, so idxmax() is the most frequent observed category.
most_frequent_value = data['column_name'].value_counts().idxmax()
# Assign the result back: fillna(..., inplace=True) on a column selection is chained
# assignment and may update a temporary copy instead of `data`.
data['column_name'] = data['column_name'].fillna(most_frequent_value)

### Imputation using kNN

In [None]:
from sklearn.impute import KNNImputer

#...
# columns to impute; X is indexed by column label below, so it is expected to be a DataFrame
knn_columns = ["col1", "col2", "col3", "col4"]

# initial total missing values (counted on the columns that will be imputed)
print('Missing: %d' % X[knn_columns].isnull().sum().sum())

# define imputer
imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')
# NaN-aware Euclidean Distance: does not include NaN values when calculating the distance between members
# of the training dataset.
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

# fit the imputer on the dataset
imputer.fit(X[knn_columns])

# transform the dataset (by default the result is a NumPy array, without labels)
Xtrans = imputer.transform(X[knn_columns])

# or fit and transform in one step, restoring the column names and the row index.
# NOTE(review): a column that is entirely NaN is dropped by the imputer by default, in which
# case the number of output columns would not match knn_columns - confirm for your data.
Xtrans = pd.DataFrame(imputer.fit_transform(X[knn_columns]), columns=knn_columns, index=X.index)

# final total missing values
print('Missing: %d' % Xtrans.isnull().sum().sum())

### Regression based Imputation

In [None]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()

# Rows where "column_name" is missing are the ones to impute; all the other rows are used for training
missing_mask = data['column_name'].isnull()

# features are all other columns except "column_name"; drop() returns new frames, so `data` is untouched
# NOTE(review): LinearRegression needs numeric, NaN-free features - presumably the other
# columns have already been cleaned; confirm before running.
train_data = data.loc[~missing_mask].drop(columns="column_name")
test_data = data.loc[missing_mask].drop(columns="column_name")
y = data.loc[~missing_mask, 'column_name'] #target is "column_name"

lr_model.fit(train_data, y)

# infer the missing values with the learned model
if not test_data.empty:  # predict() rejects an input with zero rows
    pred = lr_model.predict(test_data)
    # write the predictions back into the original frame ...
    data.loc[missing_mask, 'column_name'] = pred
    # ... and keep them next to the features they were predicted from
    test_data = test_data.assign(column_name=pred)

## Processing and Scaling
We will use the `cancer` dataset available on `sklearn`.

In [None]:
## LOAD the dataset
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
# Split features and targets into training and test sets; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=1,
)
# Shapes of the training and test feature matrices, one per line
print(X_train.shape, X_test.shape, sep="\n")

*   The dataset contains 569 data points, each represented by 30 measurements
*   We split the dataset into training set and test set.

### Normalization (MinMax Scaler)

In [None]:
# Import the preprocessing class, then instantiate it
from sklearn.preprocessing import MinMaxScaler

# fit() returns the scaler itself, so creation and fitting can be chained
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
# X_train_scaled = scaler.fit_transform(X_train)  # alternative: fit and transform in a single call
X_test_scaled = scaler.transform(X_test)

# The test set is transformed with the scaler fitted on the training set:
# DO NOT fit a new scaler on the test data

> *   We **fit** the scaler on the **training data**. 
> *   For the *MinMaxScaler*, the `fit` method computes the ***minimum*** and ***maximum*** value of each feature on the `training` set. 
> *   We call fit on the `training` set, and then call transform on the `training` and `test` sets
> *   For the test set, after scaling, the minimum and maximum values are not 0 and 1. Some of the features will even fall outside the 0–1 range! The explanation is that the `MinMaxScaler` (and all the other scalers) always applies exactly the same transformation to the training and the test set. This means the transform method always subtracts the training set minimum and divides by the training set range, which might be different from the minimum and range for the test set.



### Standardization

In [None]:
from sklearn import preprocessing
import numpy as np

# Small toy matrix: 3 samples (rows) x 3 features (columns)
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
print(scaler.mean_)   # the mean value of each feature in the training set
print(scaler.scale_)  # per-feature scaling applied to achieve zero mean and unit variance

X_scaled = scaler.transform(X_train)
print(X_scaled.mean(axis=0))  # zero mean
print(X_scaled.std(axis=0))   # unit variance


### Robust Standardization

In [None]:
from sklearn.preprocessing import RobustScaler

# Small toy matrix: 3 samples (rows) x 3 features (columns)
X = [
    [1., -2., 2.],
    [-2., 1., 3.],
    [4., 1., -2.],
]

# Learn the scaling statistics from X, then apply them to the same data
transformer = RobustScaler()
transformer.fit(X)
transformer.transform(X)

# PRACTICING on Wine dataset
The dataset is available on iCorsi

#### 1. Check the presence of outliers and identify the best way to handle them
#### 2. Verify if the dataset presents missing values and handle them
#### 3. Verify the range and location of the features values and if necessary scale them. 

In [None]:
# from google.colab import files
# uploaded = files.upload()



In [None]:
import os
import pandas as pd
import seaborn as sns

# Both files are read with ';' as the field separator
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')

red_df['wine_type'] = 1 # let's encode red with the value 1
white_df['wine_type'] = 0 # let's encode white with the value 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each file
# contributes its own 0, 1, 2, ... labels and the resulting index contains duplicates
df = pd.concat([red_df, white_df], ignore_index=True)

In [None]:
# Preview the first rows of the combined wine DataFrame
df.head()

In [None]:
# (number of rows, number of columns) of the combined wine DataFrame
df.shape

In [None]:
# Box plot of the pH feature to look for outliers
sns.boxplot(x=df['pH'])

In [None]:
# Discovering outliers with the IQR score (Tukey's fences)
# Q1 and Q3 are the first and third quartiles, i.e. the 25th and 75th percentiles
Q1 = df["pH"].quantile(0.25)
Q3 = df["pH"].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

# Boolean mask: True for the samples lying more than 1.5 * IQR below Q1 or above Q3
logical_index_outliers = (df["pH"] < (Q1 - 1.5 * IQR)) | (df["pH"] > (Q3 + 1.5 * IQR))

In [None]:
# Drop the rows whose pH falls outside the 5th-95th percentile band
ph_values = df["pH"]
upper_lim = ph_values.quantile(.95)
lower_lim = ph_values.quantile(.05)

# Strict inequalities: rows equal to either limit are dropped as well
within_band = (ph_values < upper_lim) & (ph_values > lower_lim)
df = df[within_band]
df.shape

In [None]:
# Box plot of the pH feature again, to inspect its distribution at this point in the notebook
sns.boxplot(x=df['pH'])