In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

import autofeat
import featuretools as ft

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output short but also hides library deprecation notices, so remove it when
# debugging unexpected results.
warnings.filterwarnings('ignore')

In [2]:
# Load the train and test CSVs (in that order) into separate DataFrames.
raw_train, raw_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/clean_train1.csv', '../data/clean_test1.csv')
)

In [3]:
# Column dtypes and non-null counts. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called bare; wrapping it in print() would
# append a stray "None" line to the cell output.
raw_train.info()
# Last rows of the training frame, shown as the cell's rich output.
raw_train.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9053 entries, 0 to 9052
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mass_npea       9053 non-null   float64
 1   size_npear      9053 non-null   float64
 2   malign_ratio    9053 non-null   float64
 3   damage_size     9053 non-null   float64
 4   exposed_area    9053 non-null   float64
 5   std_dev_malign  9053 non-null   float64
 6   err_malign      9053 non-null   float64
 7   malign_penalty  9053 non-null   float64
 8   damage_ratio    9053 non-null   float64
 9   tumor_size      9053 non-null   float64
dtypes: float64(10)
memory usage: 707.4 KB
None


Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size
9048,-0.569553,0.291878,1.928772,-0.172751,-0.643363,-0.189448,-0.351735,-0.038111,0.360958,13.127
9049,0.29338,0.563212,0.767917,0.014736,0.210867,0.37425,0.884965,-0.689687,-0.658556,17.091
9050,-0.284974,-1.069571,-1.697011,-0.261039,-0.273391,-0.477059,0.02189,-1.512917,-0.130792,1.971
9051,1.155406,1.838692,2.086468,-0.12398,1.099978,1.356517,5.283015,-0.398802,-0.07646,17.749
9052,-0.718325,-1.116628,-1.148658,-1.138468,-0.704548,-1.055985,-0.63444,-0.76354,0.523749,14.103


In [4]:
# Work on a copy so raw_train stays untouched by later edits.
train = raw_train.copy()
# Peek at three random rows. The seed is fixed (same value as the train/test
# split uses) so the displayed rows are identical on every Restart & Run All.
train.sample(3, random_state=1234)

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size
3476,0.04417,0.774402,1.862463,0.041385,0.110245,-0.053276,-0.071072,-0.654205,-0.622592,10.288
4161,-1.337869,-1.835107,-1.636963,-1.21352,-1.202859,-1.474675,-1.637376,0.449171,1.491019,2.779
5832,1.499893,1.421564,0.25823,1.476641,1.488177,1.43896,1.538428,1.786865,-2.134895,17.781


In [5]:
# Regression target: the tumor_size column.
y = train['tumor_size']
# Predictors: every remaining column of the training frame.
X = train.drop(columns=['tumor_size'])

In [6]:
# Shuffle the rows and hold out 25% of them as a test split; the fixed
# random_state makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=1234,
)

In [8]:
# AutoFeat regressor with two feature-engineering steps and progress logging on.
afreg = autofeat.AutoFeatRegressor(feateng_steps=2, verbose=1)
# Fit on the full predictor frame X with target y and keep the engineered features.
# NOTE(review): the X_train / y_train split created earlier is not used here, so
# feature selection also sees the rows held out in X_test — confirm this is intended.
X_train_new = afreg.fit_transform(X, y)

[AutoFeat] The 2 step feature engineering process could generate up to 2016 features.
[AutoFeat] With 9053 data points this new feature matrix would use about 0.07 gb of space.
[feateng] Step 1: transformation of original features
[feateng] Generated 44 transformed features from 9 original features - done.
[feateng] Step 2: first combination of features
[feateng] Generated 1342 feature combinations from 1378 original feature tuples - done.
[feateng] Generated altogether 1387 new features in 2 steps
[feateng] Removing correlated features, as well as additions at the highest level
[feateng] Generated a total of 1213 additional features
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 53 features after 5 feature selection runs
[featsel] 47 features after correlation filtering
[featsel] 31 features after noise filtering


In [9]:
# Display the engineered feature frame returned by afreg.fit_transform.
X_train_new

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,exp(damage_size),...,Abs(malign_penalty)/malign_penalty,Abs(err_malign),err_malign*malign_ratio,damage_size**2*Abs(malign_ratio),malign_penalty**3*Abs(err_malign),damage_ratio*exposed_area**2,size_npear*exp(malign_ratio),Abs(malign_ratio)/malign_ratio,malign_ratio**2*exp(damage_size),exp(damage_ratio)*exp(size_npear)
0,-0.688271,0.150536,1.801721,-1.117626,-0.619398,-0.368977,-0.964623,0.345142,0.807538,0.327055,...,1.0,0.964623,-1.737981,2.250508,0.039660,0.309816,0.912261,1.0,1.061687,2.606673
1,1.348315,1.252353,0.192869,1.716044,1.229464,1.376569,1.414469,2.094171,-1.868851,5.562477,...,1.0,1.414469,0.272807,0.567961,12.990611,-2.824921,1.518758,1.0,0.206915,0.539831
2,0.349871,-0.081906,-0.791991,0.684628,0.319116,0.456767,0.633297,0.362952,-0.821343,1.983034,...,1.0,0.633297,-0.501565,0.371218,0.030280,-0.083642,-0.037099,-1.0,1.243856,0.405251
3,0.923094,1.081385,0.692788,1.058498,0.908190,0.621616,1.035512,-0.619613,-1.093747,2.882039,...,-1.0,1.035512,0.717390,0.776213,-0.246330,-0.902131,2.161994,1.0,1.383250,0.987715
4,-0.465060,0.123852,1.248519,-0.099043,-0.482686,-0.087708,-0.450706,0.050792,-0.001150,0.905704,...,1.0,0.450706,-0.562715,0.012247,0.000059,-0.000268,0.431647,1.0,1.411810,1.130547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9048,-0.569553,0.291878,1.928772,-0.172751,-0.643363,-0.189448,-0.351735,-0.038111,0.360958,0.841347,...,-1.0,0.351735,-0.678417,0.057560,-0.000019,0.149406,2.008430,1.0,3.129946,1.920982
9049,0.293380,0.563212,0.767917,0.014736,0.210867,0.374250,0.884965,-0.689687,-0.658556,1.014845,...,-1.0,0.884965,0.679579,0.000167,-0.290323,-0.029282,1.213876,1.0,0.598450,0.909060
9050,-0.284974,-1.069571,-1.697011,-0.261039,-0.273391,-0.477059,0.021890,-1.512917,-0.130792,0.770251,...,-1.0,0.021890,-0.037148,0.115636,-0.075805,-0.009776,-0.195978,-1.0,2.218205,0.301085
9051,1.155406,1.838692,2.086468,-0.123980,1.099978,1.356517,5.283015,-0.398802,-0.076460,0.883398,...,-1.0,5.283015,11.022840,0.032071,-0.335085,-0.092513,14.813253,1.0,3.845738,5.825423


In [11]:
# Apply the fitted AutoFeat transformation to the test frame.
# NOTE(review): assumes raw_test carries the same predictor columns that X had
# when afreg was fitted — confirm against the test CSV.
new_test = afreg.transform(raw_test)

[AutoFeat] Computing 28 new features.
[AutoFeat]     0/   28 new features[AutoFeat]     1/   28 new features[AutoFeat]     2/   28 new features[AutoFeat]     3/   28 new features[AutoFeat]     4/   28 new features[AutoFeat]     5/   28 new features[AutoFeat]     6/   28 new features[AutoFeat]     7/   28 new features[AutoFeat]     8/   28 new features[AutoFeat]     9/   28 new features[AutoFeat]    10/   28 new features[AutoFeat]    11/   28 new features[AutoFeat]    12/   28 new features[AutoFeat]    13/   28 new features[AutoFeat]    14/   28 new features[AutoFeat]    15/   28 new features[AutoFeat]    16/   28 new features[AutoFeat]    17/   28 new features[AutoFeat]    18/   28 new features[AutoFeat]    19/   28 new features[AutoFeat]    20/   28 new features[AutoFeat]    21/   28 new features[AutoFeat]    22/   28 new features[AutoFeat]    23/   28 new features[AutoFeat]    24/   28 new features[AutoFeat]    25/   28 new features[AutoFeat]    26/   28 new

In [12]:
# Display the transformed test features.
new_test

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,exp(damage_size),...,Abs(malign_penalty)/malign_penalty,Abs(err_malign),err_malign*malign_ratio,damage_size**2*Abs(malign_ratio),malign_penalty**3*Abs(err_malign),damage_ratio*exposed_area**2,size_npear*exp(malign_ratio),Abs(malign_ratio)/malign_ratio,malign_ratio**2*exp(damage_size),exp(damage_ratio)*exp(size_npear)
0,-1.600626,-2.219139,-1.987398,-1.009650,-1.628078,-1.480941,-1.515171,-1.013203,1.334724,0.364346,...,-1.0,1.515171,3.011248,2.025941,-1.575980,3.537869,-0.304137,-1.0,1.439077,0.412956
1,-1.253411,-1.060195,-0.156451,-1.128963,-1.203991,-1.068913,-0.477480,-0.157108,1.083747,0.323368,...,-1.0,0.477480,0.074702,0.199405,-0.001852,1.570995,-0.906651,-1.0,0.007915,1.023831
2,0.794705,0.251966,-0.883713,1.043133,0.839609,0.783808,0.666240,0.175221,-0.898949,2.838094,...,1.0,0.666240,-0.588765,0.961592,0.003584,-0.633707,0.104124,-1.0,2.216408,0.523623
3,-0.726778,-0.210982,0.922902,-0.921566,-0.774990,-0.739706,-0.499435,-0.313719,0.640147,0.397895,...,-1.0,0.499435,-0.460929,0.783806,-0.015421,0.384478,-0.530955,1.0,0.338906,1.535974
4,3.056971,3.331526,1.070375,1.680238,3.254951,2.721966,3.995989,1.697705,-1.272687,5.366832,...,1.0,3.995989,4.277208,3.021882,19.552898,-13.483743,9.716306,1.0,6.148795,7.836865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36579,1.446844,1.291459,0.076866,1.605278,1.315962,1.523226,1.596569,2.186904,-1.909555,4.979246,...,1.0,1.596569,0.122722,0.198078,16.698478,-3.306882,1.394643,1.0,0.029419,0.538970
36580,-0.276968,-0.733496,-1.073476,-0.469598,-0.239382,-0.834230,-0.656596,0.175221,0.755080,0.625254,...,1.0,0.656596,0.704840,0.236725,0.003532,0.043269,-0.250722,-1.0,0.720512,1.021819
36581,1.006703,0.802389,-0.120709,0.924386,0.912327,0.944295,0.729983,0.029046,-0.686039,2.520320,...,1.0,0.729983,-0.088116,0.103145,0.000018,-0.571018,0.711151,-1.0,0.036723,1.123389
36582,-1.181427,-1.821277,-1.820689,-0.994894,-1.222597,-1.272351,-1.257681,-0.802032,1.144294,0.369763,...,-1.0,1.257681,2.289845,1.802143,-0.648852,1.710426,-0.294891,-1.0,1.225729,0.508148


In [10]:
# Read the training CSV into its own frame and print the column summary.
# NOTE(review): this is the same path already loaded into raw_train earlier in
# the notebook — confirm a second read is needed rather than reusing that frame.
clean_train = pd.read_csv('../data/clean_train1.csv')
clean_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9053 entries, 0 to 9052
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mass_npea       9053 non-null   float64
 1   size_npear      9053 non-null   float64
 2   malign_ratio    9053 non-null   float64
 3   damage_size     9053 non-null   float64
 4   exposed_area    9053 non-null   float64
 5   std_dev_malign  9053 non-null   float64
 6   err_malign      9053 non-null   float64
 7   malign_penalty  9053 non-null   float64
 8   damage_ratio    9053 non-null   float64
 9   tumor_size      9053 non-null   float64
dtypes: float64(10)
memory usage: 707.4 KB


In [11]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
clean_train.describe()

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size
count,9053.0,9053.0,9053.0,9053.0,9053.0,9053.0,9053.0,9053.0,9053.0,9053.0
mean,0.010053,0.013912,0.012047,0.009354,0.0,0.009756,0.011318,0.003585,-0.011155,7.741067
std,0.997876,0.995929,0.993625,1.000337,0.0,1.001578,0.988126,0.992942,0.998211,6.093301
min,-3.168962,-3.347234,-3.358582,-4.800137,0.0,-3.256478,-3.751022,-3.515128,-2.516503,0.0
25%,-0.665895,-0.655962,-0.671035,-0.676135,0.0,-0.664979,-0.529774,-0.654205,-0.776571,2.325
50%,-0.038485,-0.02503,0.018541,-0.021596,0.0,-0.056682,0.055529,-0.015395,0.037351,5.096
75%,0.73411,0.730154,0.672343,0.783693,0.0,0.73114,0.654115,0.654189,0.698156,13.367
max,3.273444,3.542173,3.158166,2.469075,0.0,2.853436,10.449119,2.679941,2.349033,20.999
