In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('penguins_simple.csv', sep=';')
df.head()

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Adelie,39.1,18.7,181.0,3750.0,MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE


In [3]:
# covert body mass to kg
df['Body Mass (kg)'] = df['Body Mass (g)']/1000

In [4]:
df.head(2)

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Body Mass (kg)
0,Adelie,39.1,18.7,181.0,3750.0,MALE,3.75
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE,3.8


In [5]:
# calculate log of culmen length
import numpy as np

df['log of Culmen Length'] = np.log(df['Culmen Length (mm)'])
df.head()

# with apply (good if you have your own function!)
df['Culmen Length (log)'] = df['Culmen Length (mm)'].apply(np.log)

In [7]:
pd.get_dummies(df['Sex'])

Unnamed: 0,False,True
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
328,1,0
329,1,0
330,0,1
331,1,0


In [6]:
# covert Gender to a Boolean

# with replace
df.replace({'Sex': {'MALE': True, 'FEMALE':False }}, inplace=True)

df['Sex'] = df['Sex'].replace(['FEMALE','MALE'], [True,False])

# with get dummies -> creates one binary column for each categorical variable 
pd.get_dummies(df['Sex'])

# with map 
d = {'MALE': True, 'FEMALE': False}
df['Sex']=df["Sex"].map(d)

# with loc
df.loc[df['Sex'] == "MALE", 'Sex'] = True
df.loc[df['Sex'] == "FEMALE", 'Sex'] = False


TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'

In [84]:
# combine species and gender in one column
df['Species_Gender']= df['Species']+df['Sex'].astype(str)
df['combine'] = df['Species']+' / '+df['Sex']
df.head()

Unnamed: 0,Species,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Body Mass (kg),log of Culmen Length,Culmen Length (log),Species_Gender,combine
0,Adelie,39.1,18.7,181.0,3750.0,MALE,3.75,3.666122,3.666122,AdelieMALE,Adelie / MALE
1,Adelie,39.5,17.4,186.0,3800.0,FEMALE,3.8,3.676301,3.676301,AdelieFEMALE,Adelie / FEMALE
2,Adelie,40.3,18.0,195.0,3250.0,FEMALE,3.25,3.696351,3.696351,AdelieFEMALE,Adelie / FEMALE
3,Adelie,36.7,19.3,193.0,3450.0,FEMALE,3.45,3.602777,3.602777,AdelieFEMALE,Adelie / FEMALE
4,Adelie,39.3,20.6,190.0,3650.0,MALE,3.65,3.671225,3.671225,AdelieMALE,Adelie / MALE


In [89]:
# check the amout of time for a function in jupyter notebook
%timeit df.head()

384 µs ± 5.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Why feature engineering?
* LogReg input are multiple columns (features)
* LogReg assigns one coefficient per feature
* --> number and kind of features determines power of the model
* all features have to be floating-point numbers
#### Feature Engineering is creating columns (features) that make the model better.

### 1. Imputation
**replace empty values by non-empty values**

In [2]:
df.isna()
df.dropna() # <-- not that useful in the context of machine learning
df.fillna() # mean, randon, median, backfill/forwardfill/interpolation (for time series)

NameError: name 'df' is not defined

In [39]:
# more detailed grouped means
import pandas as pd
x = pd.DataFrame({
    'fruit': ['banana', 'banana', 'banana', 'apple', 'apple', 'apple', 'orange', 'melon'],
    'price': [1.00, 1.50, None, 2.00, 2.50, None, 3.0, 5.0],
    'bio':[1, 0, 1, 0, 1, 0, 1, 1]
})

x

Unnamed: 0,fruit,price,bio
0,banana,1.0,1
1,banana,1.5,0
2,banana,,1
3,apple,2.0,0
4,apple,2.5,1
5,apple,,0
6,orange,3.0,1
7,melon,5.0,1


In [40]:
x.groupby('fruit')['price'].mean() # this type of aggregation is good for plotting, but bad for fillna

fruit
apple     2.25
banana    1.25
melon     5.00
orange    3.00
Name: price, dtype: float64

In [41]:
# with transform:
x.groupby('fruit')['price'].transform('mean') # same index as original DF

# Call func on self producing a DataFrame with transformed values.
# Produced DataFrame will have same axis length as self.

0    1.25
1    1.25
2    1.25
3    2.25
4    2.25
5    2.25
6    3.00
7    5.00
Name: price, dtype: float64

In [42]:
category_means = x.groupby('fruit')['price'].transform('mean')

In [43]:
x['price'].fillna(category_means)

0    1.00
1    1.50
2    1.25
3    2.00
4    2.50
5    2.25
6    3.00
7    5.00
Name: price, dtype: float64

In [44]:
x['price_filled']= x['price'].interpolate(method='linear')

### 2. One-Hot Encoding
**Dummy Encoding**

In [45]:
binary = pd.get_dummies(x['fruit']) # you don t need all four columns for the logistic regression
# because the fourth can be derived from the other three

In [49]:
x = x.join(binary.iloc[:, :-1])

### 3. Interaction Terms
**combine columns to get new columns**

In [51]:
x['bio_bananas'] = x['bio'] * x['banana']
x['bio_apples'] = x['bio'] * x['apple']
x['bio_melons'] = x['bio'] * x['melon']
# LogReg can only add features, not multiply
# this is why these extra features help 

In [52]:
x

Unnamed: 0,fruit,price,bio,price_filled,apple,banana,melon,bio_bananas,bio_apples,bio_melons
0,banana,1.0,1,1.0,0,1,0,1,0,0
1,banana,1.5,0,1.5,0,1,0,0,0,0
2,banana,,1,1.75,0,1,0,1,0,0
3,apple,2.0,0,2.0,1,0,0,0,0,0
4,apple,2.5,1,2.5,1,0,0,0,1,0
5,apple,,0,2.75,1,0,0,0,0,0
6,orange,3.0,1,3.0,0,0,0,0,0,0
7,melon,5.0,1,5.0,0,0,1,0,0,1


### 4. Binning

_reduces the amount of information: scalar -> category_

In [55]:
pd.cut(x['price_filled'], bins=3, labels=['cheap', 'medium','expensive'])# 3 cateogories having the same width
# equally spaced intevals
# disadvantage: 

0        cheap
1        cheap
2        cheap
3        cheap
4       medium
5       medium
6       medium
7    expensive
Name: price_filled, dtype: category
Categories (3, object): [cheap < medium < expensive]

In [57]:
# quantile binning, all categories have the same number of data points
pd.qcut(x['price_filled'], q=4, labels=['cheap', 'medium','expensive', 'super expensive'])

0              cheap
1              cheap
2             medium
3             medium
4          expensive
5          expensive
6    super expensive
7    super expensive
Name: price_filled, dtype: category
Categories (4, object): [cheap < medium < expensive < super expensive]

In [58]:
# followup of the binning: One-Hot-Encoding

### 5. Scaling
_features being on different scales is usually bad for the model_

In [61]:
# min-max scaling: smallest values is 0, highest value is 1.0
price_range = x['price_filled'].max() - x['price_filled'].min()

In [63]:
(x['price_filled'] - x['price_filled'].min())/ price_range

0    0.0000
1    0.1250
2    0.1875
3    0.2500
4    0.3750
5    0.4375
6    0.5000
7    1.0000
Name: price_filled, dtype: float64

_only one type of normalization_

### Scaling with scikit

In [65]:
X = x.iloc[:, 2:-1]
X

Unnamed: 0,bio,price_filled,apple,banana,melon,bio_bananas,bio_apples,bio_melons
0,1,1.0,0,1,0,1,0,0
1,0,1.5,0,1,0,0,0,0
2,1,1.75,0,1,0,1,0,0
3,0,2.0,1,0,0,0,0,0
4,1,2.5,1,0,0,0,1,0
5,0,2.75,1,0,0,0,0,0
6,1,3.0,0,0,0,0,0,0
7,1,5.0,0,0,1,0,0,1


In [67]:
from sklearn.preprocessing import MinMaxScaler

In [68]:
scaler = MinMaxScaler()
scaler.fit(X) # memorizes the min and max for each column, no y 
Xscaled = scaler.transform(X) # does the actual scaling; still no y

In [69]:
Xscaled

array([[1.    , 0.    , 0.    , 1.    , 0.    , 1.    , 0.    , 0.    ],
       [0.    , 0.125 , 0.    , 1.    , 0.    , 0.    , 0.    , 0.    ],
       [1.    , 0.1875, 0.    , 1.    , 0.    , 1.    , 0.    , 0.    ],
       [0.    , 0.25  , 1.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [1.    , 0.375 , 1.    , 0.    , 0.    , 0.    , 1.    , 0.    ],
       [0.    , 0.4375, 1.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [1.    , 0.5   , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ],
       [1.    , 1.    , 0.    , 0.    , 1.    , 0.    , 0.    , 1.    ]])

acc 0.93 
precision 0.4 
recall 0.8 
F1 Score 0.5333333333333333
