### Feature Engineering notebook 

This is a demo notebook to play with feature engineering toolkit. In this notebook we will see some capabilities of the toolkit like filling missing values, PCA, Random Projections, Normalizing values, and etc.

In [1]:
%load_ext autoreload
%autoreload 1
%matplotlib inline

In [2]:
from Pipeline import Pipeline
from Compare import Compare
from StructuredData.LoadCSV import LoadCSV
from StructuredData.MissingValues import MissingValues
from StructuredData.Normalize import Normalize
from StructuredData.Factorize import Factorize
from StructuredData.PCAFeatures import PCAFeatures
from StructuredData.RandomProjection import RandomProjection

In [3]:
csv_path = './DemoData/synthetic_classification.csv'
df = LoadCSV(csv_path)()

In [4]:
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,6,2.0,0.367227,6.0,4,0.258202,6.0,,4,4,-1.144969,3,0.744971,1.198658,8,1.450775,-0.20348,7,-0.720539,0.455101
1,5,,-0.957909,5.0,4,-0.622693,,1.253042,5,7,-0.35195,6,0.155066,0.53749,7,0.124916,0.596243,4,-0.748488,1.701948
2,5,3.0,-0.025323,5.0,3,-0.205105,5.0,1.935712,3,5,-1.125377,4,1.339869,0.257055,5,0.963987,-0.119324,6,0.134284,-0.019586
3,7,2.0,-0.276148,,5,0.709411,8.0,2.815076,5,5,-1.703041,3,0.852819,0.357297,6,-1.687569,-0.292334,5,-0.68894,-0.628006
4,4,5.0,1.246114,4.0,2,0.292027,3.0,0.671939,4,4,-0.445409,4,-1.03512,-0.225838,6,-1.119412,-0.139365,6,0.172082,-0.91662


### Filling missing values

By default, median of the values of the column is applied for filling out the missing values

In [5]:
pipelineObj = Pipeline([MissingValues()])
new_df = pipelineObj(df, '0')
new_df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,0
0,2.0,0.367227,6.0,4,0.258202,6.0,0.18021,4,4,-1.144969,3,0.744971,1.198658,8,1.450775,-0.20348,7,-0.720539,0.455101,6
1,4.0,-0.957909,5.0,4,-0.622693,5.0,1.253042,5,7,-0.35195,6,0.155066,0.53749,7,0.124916,0.596243,4,-0.748488,1.701948,5
2,3.0,-0.025323,5.0,3,-0.205105,5.0,1.935712,3,5,-1.125377,4,1.339869,0.257055,5,0.963987,-0.119324,6,0.134284,-0.019586,5
3,2.0,-0.276148,5.0,5,0.709411,8.0,2.815076,5,5,-1.703041,3,0.852819,0.357297,6,-1.687569,-0.292334,5,-0.68894,-0.628006,7
4,5.0,1.246114,4.0,2,0.292027,3.0,0.671939,4,4,-0.445409,4,-1.03512,-0.225838,6,-1.119412,-0.139365,6,0.172082,-0.91662,4


However, the imputation type is a configurable parameter to customize it as per needs.

In [6]:
pipelineObj = Pipeline([MissingValues(imputation_type = 'mean')])
new_df = pipelineObj(df, '0')
new_df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,0
0,2.0,0.367227,6.0,4,0.258202,6.0,0.026382,4,4,-1.144969,3,0.744971,1.198658,8,1.450775,-0.20348,7,-0.720539,0.455101,6
1,4.472472,-0.957909,5.0,4,-0.622693,5.051051,1.253042,5,7,-0.35195,6,0.155066,0.53749,7,0.124916,0.596243,4,-0.748488,1.701948,5
2,3.0,-0.025323,5.0,3,-0.205105,5.0,1.935712,3,5,-1.125377,4,1.339869,0.257055,5,0.963987,-0.119324,6,0.134284,-0.019586,5
3,2.0,-0.276148,5.208208,5,0.709411,8.0,2.815076,5,5,-1.703041,3,0.852819,0.357297,6,-1.687569,-0.292334,5,-0.68894,-0.628006,7
4,5.0,1.246114,4.0,2,0.292027,3.0,0.671939,4,4,-0.445409,4,-1.03512,-0.225838,6,-1.119412,-0.139365,6,0.172082,-0.91662,4


### Normalize data

By default, Min max normalization is applied. Please note that assertion has been set such that normlization cant be applied if there rae missing values in that column. This is part of validation phase

In [10]:
pipelineObj = Pipeline([MissingValues(), Normalize(['1','2', '3'])])
new_df = pipelineObj(df, '0')
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,6,2.0,0.367227,6.0,4,0.258202,6.0,,4,4,-1.144969,3,0.744971,1.198658,8,1.450775,-0.20348,7,-0.720539,0.455101
1,5,,-0.957909,5.0,4,-0.622693,,1.253042,5,7,-0.35195,6,0.155066,0.53749,7,0.124916,0.596243,4,-0.748488,1.701948
2,5,3.0,-0.025323,5.0,3,-0.205105,5.0,1.935712,3,5,-1.125377,4,1.339869,0.257055,5,0.963987,-0.119324,6,0.134284,-0.019586
3,7,2.0,-0.276148,,5,0.709411,8.0,2.815076,5,5,-1.703041,3,0.852819,0.357297,6,-1.687569,-0.292334,5,-0.68894,-0.628006
4,4,5.0,1.246114,4.0,2,0.292027,3.0,0.671939,4,4,-0.445409,4,-1.03512,-0.225838,6,-1.119412,-0.139365,6,0.172082,-0.91662


### Factorize data

Encode the object as an enumerated type or categorical variable for column 4 and 8, but we must remove missing values before Factorizing

In [11]:
pipelineObj = Pipeline([MissingValues(), Factorize(['4','8'])])
new_df = pipelineObj(df, '0')
new_df.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,0
0,2.0,0.367227,6.0,0,0.258202,6.0,0.18021,0,4,-1.144969,3,0.744971,1.198658,8,1.450775,-0.20348,7,-0.720539,0.455101,6
1,4.0,-0.957909,5.0,0,-0.622693,5.0,1.253042,1,7,-0.35195,6,0.155066,0.53749,7,0.124916,0.596243,4,-0.748488,1.701948,5
2,3.0,-0.025323,5.0,1,-0.205105,5.0,1.935712,2,5,-1.125377,4,1.339869,0.257055,5,0.963987,-0.119324,6,0.134284,-0.019586,5
3,2.0,-0.276148,5.0,2,0.709411,8.0,2.815076,1,5,-1.703041,3,0.852819,0.357297,6,-1.687569,-0.292334,5,-0.68894,-0.628006,7
4,5.0,1.246114,4.0,3,0.292027,3.0,0.671939,0,4,-0.445409,4,-1.03512,-0.225838,6,-1.119412,-0.139365,6,0.172082,-0.91662,4


### Principal Component Analysis 

Use n_components to play around with how many dimensions you want to keep. Please note that assertions will validate if a data frame has any missing values before applying PCA. In the below example, the pipeline first removed missing values before applying PCA.

In [28]:
pipelineObj = Pipeline([MissingValues(imputation_type = 'mean'), PCAFeatures(n_components = 5)])
pca_df = pipelineObj(df, '0')
pca_df.head(5)

Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,0
0,-0.056314,2.433791,-1.13351,-0.381215,1.591039,6
1,-1.514456,-0.939038,-1.124172,1.38698,-1.25215,5
2,-1.980937,1.776331,-2.863584,-0.647862,0.398949,5
3,-2.571669,3.078554,-0.141075,-0.188001,0.420431,7
4,-0.76027,1.367929,-2.680216,1.017784,0.905902,4


### Random Projections

Use n_components to play around with how many dimensions you want to keep. Please note that assertions will validate if a data frame has any missing values before applying Random Projections. Type of projections can be specified as an argument, by default GaussianRandomProjection is applied. In the below example, the pipeline first removed missing values before applying Sparse Random Projection. As of now, 'auto' deduction of number of dimensions which are sufficient to represent the features with minimal loss of information has not been implemeted, hence default value for ouput columns is 2 (Use n_components to specify custom value)

In [31]:
pipelineObj = Pipeline([MissingValues(imputation_type = 'mean'), RandomProjection(n_components = 6, proj_type = 'Sparse')])
new_df = pipelineObj(df, '0')
new_df.head()

Unnamed: 0,Spa_0,Spa_1,Spa_2,Spa_3,Spa_4,Spa_5,0
0,1.490213,2.318824,12.254046,2.585285,10.250564,7.947205,6
1,1.853227,6.604344,11.58384,3.057152,7.527542,10.935465,5
2,2.675418,2.442564,12.541899,2.217898,7.616263,8.617697,5
3,2.489123,4.671446,16.495807,2.162418,12.627479,14.175429,7
4,5.161757,3.262687,11.947408,1.573356,3.98208,7.540937,4


### Download the modified CSV
At any point, the new tranformed features can be downloaded using below command


In [14]:
csv_path = './DemoData/synthetic_classification_transformed.csv'
new_df.to_csv(csv_path)