# Explaining Feature Engineering

## Basic questions

In [5]:
# Why is it called feature engineering? Because we classify the data so that the machine can understand it.

In [6]:
# How do we do that? We turn the values into binary (0/1) columns, and those columns become the features.

In [7]:
# What is a feature? Features are based on attributes of the data and help make the predictions.

## An Example

In [40]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing

In [12]:
# Toy frame: a categorical 'key' column and a running integer 'data1' column.
df = pd.DataFrame(
    dict(
        key=['cat', 'cat', 'dog', 'donkey', 'dog', 'cat'],
        data1=range(6),
    )
)
df

Unnamed: 0,data1,key
0,0,cat
1,1,cat
2,2,dog
3,3,donkey
4,4,dog
5,5,cat


## Getting the dummies

In [13]:
# data1 and key are attributes of the data, and they can become the features for our prediction model.

In [14]:
#Definition
#Dummy variables assign the numbers ‘0’ and ‘1’ to indicate membership in any mutually exclusive and exhaustive category.
#https://www.moresteam.com/whitepapers/download/dummy-variables.pdf

In [16]:
# One indicator column per distinct value of 'key'; `prefix` is the leading
# part of each new column name (key_cat, key_dog, key_donkey).
pd.get_dummies(data=df['key'], prefix='key')

Unnamed: 0,key_cat,key_dog,key_donkey
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0


In [17]:
# Load the billionaires spreadsheet (path is relative to the notebook's working directory).
billionaires = pd.read_excel(io='richpeople.xlsx')

In [18]:
# Peek at the first two rows to see which columns the spreadsheet provides.
billionaires.head(2)

Unnamed: 0,year,name,rank,citizenship,countrycode,networthusbillion,selfmade,typeofwealth,gender,age,...,relationshiptocompany,foundingdate,gdpcurrentus,sourceofwealth,notes,notes2,source,source_2,source_3,source_4
0,2001,A Jerrold Perenchio,151,United States,USA,3.0,self-made,executive,male,70.0,...,former chairman and CEO,1955.0,10621800000000.0,,represented Marlon Brando and Elizabeth Taylor,,http://en.wikipedia.org/wiki/Jerry_Perenchio,http://www.forbes.com/profile/a-jerrold-perenc...,COLUMN ONE; A Hollywood Player Who Owns the Ga...,
1,2014,A. Jerrold Perenchio,663,United States,USA,2.6,self-made,executive,male,83.0,...,former chairman and CEO,1955.0,,"television, Univision",represented Marlon Brando and Elizabeth Taylor,,http://en.wikipedia.org/wiki/Jerry_Perenchio,http://www.forbes.com/profile/a-jerrold-perenc...,COLUMN ONE; A Hollywood Player Who Owns the Ga...,


In [19]:
# Doing the same with the billionaires: build a binary feature from 'age'.
# 'Old Guys' is 1 when the billionaire is older than 60, otherwise 0.
# The comparison is vectorized over the whole column; a missing age (NaN)
# compares False, so those rows get 0 as well.
billionaires['Old Guys'] = (billionaires['age'] > 60).astype('int64')

In [25]:
# Narrow view of every billionaire (no rows are filtered out): name, age and
# the 'Old Guys' flag, which is 1 when age > 60.
Oldguys = billionaires.loc[:, ['name', 'age', 'Old Guys']]

In [27]:
# Check the flag against the ages of the first two rows.
Oldguys.head(2)

Unnamed: 0,name,age,Old Guys
0,A Jerrold Perenchio,70.0,1
1,A. Jerrold Perenchio,83.0,1


In [29]:
# Row 4 gets a 1 in the age_0.0 column, i.e. we are telling the machine that this
# person is 0 years old, which is probably wrong (the generated columns also
# include negative ages such as -42 and -7).
pd.get_dummies(data=billionaires['age'], prefix='age')

Unnamed: 0,age_-42.0,age_-7.0,age_0.0,age_12.0,age_21.0,age_24.0,age_28.0,age_29.0,age_30.0,age_31.0,...,age_88.0,age_89.0,age_90.0,age_91.0,age_92.0,age_93.0,age_94.0,age_95.0,age_96.0,age_98.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Multicollinearity

In [32]:
# What does that mean? Two variables that depend on each other: based on one you can predict the other. They are highly correlated.

In [33]:
# Correlation matrix of the gender indicator columns.
gender_dummies = pd.get_dummies(data=billionaires['gender'], prefix='gender')
gender_dummies.corr()

Unnamed: 0,gender_female,gender_male,gender_married couple
gender_female,1.0,-0.925748,-0.010999
gender_male,-0.925748,1.0,-0.096709
gender_married couple,-0.010999,-0.096709,1.0


In [36]:
# Why isn't this one?
# NOTE(review): the question above is ambiguous; the two indicator columns are
# in fact almost perfectly negatively correlated (about -0.98).
selfmade_dummies = pd.get_dummies(data=billionaires['selfmade'], prefix='selfmade')
selfmade_dummies.corr()

Unnamed: 0,selfmade_inherited,selfmade_self-made
selfmade_inherited,1.0,-0.982092
selfmade_self-made,-0.982092,1.0


In [41]:
# Demonstration: min-max scaling the raw columns fails because 'age' contains NaN.
# The error is caught and printed instead of left unhandled, so that
# "Restart & Run All" can continue to the cells that fix the problem.
x = billionaires[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler()
try:
    x_scaled = min_max_scaler.fit_transform(x)
    df_normalized = pd.DataFrame(x_scaled)
except ValueError as err:
    # NOTE(review): newer scikit-learn releases may accept NaN here instead of
    # raising; in that case df_normalized simply keeps the NaN entries.
    print('ValueError:', err)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
# Inspect the raw array: the nan in the first ('age') column is what the scaler rejected.
x

array([[ 70. ,   3. ],
       [ 83. ,   2.6],
       [  nan,   1.5],
       ..., 
       [ 45. ,   1.2],
       [ 68. ,  11.6],
       [ 57. ,   3.5]])

In [42]:
# Keep only the rows whose 'age' is present; other columns may still contain nulls.
has_age = billionaires['age'].notnull()
billionaires_nonulls = billionaires[has_age]

In [43]:
# Second attempt at scaling, this time on the rows that have an age.
# MinMaxScaler transforms features by scaling each one to a given range.
min_max_scaler = preprocessing.MinMaxScaler()
x = billionaires_nonulls[['age','networthusbillion']].values

# fit_transform fits to the data and then transforms it in one call.
x_scaled = min_max_scaler.fit_transform(x)

# NOTE(review): 'age' still contains invalid values (e.g. -42), and that value
# becomes the scaling minimum: an age of 70 maps to 0.8 = (70 + 42) / (98 + 42).
# The resulting frame has positional column labels 0 (age) and 1 (net worth).
df_normalized = pd.DataFrame(x_scaled)

In [44]:
# Preview the scaled values (column 0 = age, column 1 = net worth) instead of
# dumping the whole frame into the notebook output.
df_normalized.head(10)

Unnamed: 0,0,1
0,0.800000,0.026667
1,0.892857,0.021333
2,0.635714,0.012000
3,0.300000,0.000000
4,0.685714,0.002667
5,0.542857,0.020000
6,0.578571,0.108000
7,0.671429,0.217333
8,0.721429,0.002667
9,0.757143,0.008000


In [45]:
# Summary statistics of the rows that have an age. The reported minimum age is
# negative (-42), so the column still holds invalid values.
billionaires_nonulls.describe()



Unnamed: 0,year,rank,networthusbillion,age,north,politicalconnection,founder,foundingdate,gdpcurrentus,Old Guys
count,2429.0,2429.0,2429.0,2429.0,2429.0,71.0,2429.0,2391.0,829.0,2429.0
mean,2008.638946,607.571017,3.603417,57.40387,0.636888,1.0,0.482915,1955.22041,4925504000000.0,0.503499
std,7.526569,472.008031,5.218999,21.386215,0.480996,0.0,0.499811,42.872414,4385036000000.0,0.500091
min,1996.0,1.0,1.0,-42.0,0.0,1.0,0.0,1610.0,2491801000.0,0.0
25%,2001.0,212.0,1.4,50.0,0.0,,0.0,,,0.0
50%,2014.0,446.0,2.1,61.0,1.0,,0.0,,,1.0
75%,2014.0,988.0,3.5,71.0,1.0,,1.0,,,1.0
max,2014.0,1565.0,76.0,98.0,1.0,1.0,1.0,2012.0,10621800000000.0,1.0


# Why are we doing this?

In [None]:
# So we don't have any misrepresentations.

#