In [1]:
import pandas as pd
%matplotlib inline

###A simple example to illustrate the intuition behind dummy variables

In [2]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})

In [3]:
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [4]:
pd.get_dummies(df['key'],prefix='key')

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


###Now we have a matrix of values based on the presence of absence of the attribute value in our dataset

###Now let's look at another example using our flight data

In [6]:
df = pd.read_csv('data/ontime_reports_may_2015_ny.csv')
df

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,CARRIER,FL_NUM,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,...,DIVERTED,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2015,5,1,AA,44,LAS,JFK,-6,-17,0,...,0,294,273,2248,,,,,,
1,2015,5,2,AA,44,LAS,JFK,-8,-14,0,...,0,299,280,2248,,,,,,
2,2015,5,3,AA,44,LAS,JFK,0,-11,0,...,0,294,274,2248,,,,,,
3,2015,5,4,AA,44,LAS,JFK,-11,4,0,...,0,320,275,2248,,,,,,
4,2015,5,5,AA,44,LAS,JFK,-4,-18,0,...,0,291,270,2248,,,,,,
5,2015,5,6,AA,44,LAS,JFK,-5,-6,0,...,0,304,279,2248,,,,,,
6,2015,5,7,AA,44,SEA,JFK,-2,-11,0,...,0,319,297,2422,,,,,,
7,2015,5,8,AA,44,SEA,JFK,-3,-15,0,...,0,316,293,2422,,,,,,
8,2015,5,9,AA,44,SEA,JFK,-6,5,0,...,0,339,307,2422,,,,,,
9,2015,5,10,AA,44,SEA,JFK,38,31,0,...,0,321,301,2422,0,31,0,0,0,


In [7]:
#count number of NaNs in column
df['DEP_DELAY'].isnull().sum()

728

In [8]:
#calculate the percentage this represents of the total number of instances
df['DEP_DELAY'].isnull().sum()/df['DEP_DELAY'].sum()

0.0014866851821597779

###We could explore whether the NaNs are actually zero delays, but we'll just filter them out for now, especially since they represent such a small number of instances

In [9]:
#filter DEP_DELAY NaNs
df = df[pd.notnull(df['DEP_DELAY'])]
df

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,CARRIER,FL_NUM,ORIGIN,DEST,DEP_DELAY,ARR_DELAY,CANCELLED,...,DIVERTED,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2015,5,1,AA,44,LAS,JFK,-6,-17,0,...,0,294,273,2248,,,,,,
1,2015,5,2,AA,44,LAS,JFK,-8,-14,0,...,0,299,280,2248,,,,,,
2,2015,5,3,AA,44,LAS,JFK,0,-11,0,...,0,294,274,2248,,,,,,
3,2015,5,4,AA,44,LAS,JFK,-11,4,0,...,0,320,275,2248,,,,,,
4,2015,5,5,AA,44,LAS,JFK,-4,-18,0,...,0,291,270,2248,,,,,,
5,2015,5,6,AA,44,LAS,JFK,-5,-6,0,...,0,304,279,2248,,,,,,
6,2015,5,7,AA,44,SEA,JFK,-2,-11,0,...,0,319,297,2422,,,,,,
7,2015,5,8,AA,44,SEA,JFK,-3,-15,0,...,0,316,293,2422,,,,,,
8,2015,5,9,AA,44,SEA,JFK,-6,5,0,...,0,339,307,2422,,,,,,
9,2015,5,10,AA,44,SEA,JFK,38,31,0,...,0,321,301,2422,0,31,0,0,0,


###We can discretize the continuous DEP_DELAY value by giving it a value of 0 if it's delayed and a 1 if it's not. We record this value into a separate column. (We could also code -1 for early, 0 for ontime, and 1 for late)

In [12]:
#code whether delay or not delayed
df['IS_DELAYED'] = df['DEP_DELAY'].apply(lambda x: 1 if x>0 else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [13]:
#Let's check that our column was created properly
df[['DEP_DELAY','IS_DELAYED']]

Unnamed: 0,DEP_DELAY,IS_DELAYED
0,-6,0
1,-8,0
2,0,0
3,-11,0
4,-4,0
5,-5,0
6,-2,0
7,-3,0
8,-6,0
9,38,1


In [None]:
###Dummy variables create a 

In [None]:
pd.get_dummies(df['ORIGIN'],prefix='origin')

###Normalize values

In [None]:
#Normalize the data attributes for the Iris dataset
# Example from Jump Start Scikit Learn https://machinelearningmastery.com/jump-start-scikit-learn/
from sklearn.datasets import load_iris 
from sklearn import preprocessing #load the iris dataset
iris=load_iris()
X=iris.data
y=iris.target #normalize the data attributes 
normalized_X = preprocessing.normalize(X)

In [None]:
zip(X,normalized_X)