# An overview of feature engineering for regression and machine learning algorithms

In [1]:
import pandas as pd
%matplotlib inline

### A simple example to illustrate the intuition behind dummy variables

In [3]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})

In [4]:
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [5]:
pd.get_dummies(df['key'],prefix='key')

Unnamed: 0,key_a,key_b,key_c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


### Now we have a matrix of values based on the presence of absence of the attribute value in our dataset

## Another example with the flight statistics

### Now let's look at another example using our flight data

In [6]:
df = pd.read_csv('data/ontime_reports_may_2015_ny.csv')

In [7]:
#count number of NaNs in column
df['ARR_DELAY'].isnull().sum()

945

In [8]:
#calculate the percentage this represents of the total number of instances
df['ARR_DELAY'].isnull().sum()/df['ARR_DELAY'].sum()

0.0086069493146318145

### We could explore whether the NaNs are actually zero delays, but we'll just filter them out for now, especially since they represent such a small number of instances

In [9]:
#filter ARR_DELAY NaNs
df = df[pd.notnull(df['ARR_DELAY'])]

### We can discretize the continuous DEP_DELAY value by giving it a value of 0 if it's delayed and a 1 if it's not. We record this value into a separate column. (We could also code -1 for early, 0 for ontime, and 1 for late)

In [11]:
#code whether delay or not delayed
df['IS_DELAYED'] = df['ARR_DELAY'].apply(lambda x: 1 if x>0 else 0 )

In [12]:
#Let's check that our column was created properly
df[['ARR_DELAY','IS_DELAYED']]

Unnamed: 0,ARR_DELAY,IS_DELAYED
0,-17.0,0
1,-14.0,0
2,-11.0,0
3,4.0,1
4,-18.0,0
5,-6.0,0
6,-11.0,0
7,-15.0,0
8,5.0,1
9,31.0,1


In [14]:
pd.get_dummies(df['ORIGIN'],prefix='origin') #We'd want to drop one of these before we actually used this in our algorithm

Unnamed: 0,origin_ABQ,origin_ACK,origin_ALB,origin_ATL,origin_AUS,origin_BGR,origin_BHM,origin_BNA,origin_BOS,origin_BQN,...,origin_SJU,origin_SLC,origin_SMF,origin_SRQ,origin_STL,origin_STT,origin_SWF,origin_SYR,origin_TPA,origin_XNA
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Multicollinearity

In [38]:
df = pd.read_csv('data/heights_weights_genders.csv')

In [18]:
pd.get_dummies(df['Gender'],prefix='gender').corr()

Unnamed: 0,gender_Female,gender_Male
gender_Female,1.0,-1.0
gender_Male,-1.0,1.0


<img src="http://i.giphy.com/3ornka9rAaKRA2Rkac.gif"></img>

### That's the dummy variable trap

## We can also normalize variables across a range

In [45]:
from sklearn import preprocessing

x = df[['Height','Weight']].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_normalized = pd.DataFrame(x_scaled)

In [47]:
df_normalized

Unnamed: 0,0,1
0,0.791728,0.863139
1,0.586958,0.475476
2,0.802364,0.721131
3,0.706182,0.756699
4,0.631424,0.689999
5,0.525149,0.426286
6,0.587087,0.580778
7,0.569437,0.503050
8,0.515686,0.541817
9,0.371665,0.446684


## this helps keep the attributes within a standard range