In [3]:
import pandas as pd

In [7]:
sales = {'month':['Jan','Feb','Mar','Apr','May','Jun'],
        'eggs':[47,110,221,77,132,205],
        'salt':['12.0','50.0','89.0','87.0','','60.0'],
        'spam':[17,31,72,20,52,55]}

In [8]:
df = pd.DataFrame(sales)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
eggs     6 non-null int64
month    6 non-null object
salt     6 non-null object
spam     6 non-null int64
dtypes: int64(2), object(2)
memory usage: 272.0+ bytes


In [10]:
df['salt'] = pd.to_numeric(df['salt'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
eggs     6 non-null int64
month    6 non-null object
salt     5 non-null float64
spam     6 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 272.0+ bytes


In [12]:
df

Unnamed: 0,eggs,month,salt,spam
0,47,Jan,12.0,17
1,110,Feb,50.0,31
2,221,Mar,89.0,72
3,77,Apr,87.0,20
4,132,May,,52
5,205,Jun,60.0,55


In [13]:
df.index = df['month']

In [14]:
df

Unnamed: 0_level_0,eggs,month,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,Jan,12.0,17
Feb,110,Feb,50.0,31
Mar,221,Mar,89.0,72
Apr,77,Apr,87.0,20
May,132,May,,52
Jun,205,Jun,60.0,55


In [15]:
del df['month']

In [16]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


### Indexing using square brackets

In [17]:
df['salt']['Jan']

12.0

### Indexing using column attribute and row label

In [19]:
df.eggs['Mar']

221

### Indexing using the .loc accessor

In [20]:
df.loc['May','spam']

52

### Indexing using the .iloc acessor

In [22]:
df.iloc[4,2]

52

### Selecting only some columns

In [23]:
df_new = df[['salt','eggs']]

In [24]:
df_new

Unnamed: 0_level_0,salt,eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,47
Feb,50.0,110
Mar,89.0,221
Apr,87.0,77
May,,132
Jun,60.0,205


# Slicing DataFrames

### Selecting a columng (i.e., Series)

In [25]:
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [26]:
type(df['eggs'])

pandas.core.series.Series

In [27]:
df['eggs'][1:4] # Part of the eggs column

month
Feb    110
Mar    221
Apr     77
Name: eggs, dtype: int64

In [29]:
df['eggs'][4] # The value associated with May

132

### Using .loc[]

In [31]:
df.loc[:, 'eggs':'salt'] # All rows, some columns

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
Feb,110,50.0
Mar,221,89.0
Apr,77,87.0
May,132,
Jun,205,60.0


In [32]:
df.loc['Jan':'Apr',:] # Some rows, all columns

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20


In [34]:
df.loc['Mar':'May', 'salt':'spam']

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


### Using .iloc[]

In [35]:
df.iloc[2:5, 1:]

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


### Using lists rather than slices

In [36]:
df.loc['Jan':'May',['eggs','spam']]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52


In [38]:
df.iloc[[0,4,5],0:2]

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
May,132,
Jun,205,60.0


### Series versus 1-column DataFrame

In [39]:
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [40]:
type(df['eggs'])

pandas.core.series.Series

In [41]:
df[['eggs']]

Unnamed: 0_level_0,eggs
month,Unnamed: 1_level_1
Jan,47
Feb,110
Mar,221
Apr,77
May,132
Jun,205


In [42]:
type(df[['eggs']])

pandas.core.frame.DataFrame

# Filtering DataFrames

## Creating a Boolean Series

In [43]:
df.salt > 60

month
Jan    False
Feb    False
Mar     True
Apr     True
May    False
Jun    False
Name: salt, dtype: bool

### Filtering with a Boolean Series

In [44]:
df[df.salt > 60]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mar,221,89.0,72
Apr,77,87.0,20


In [45]:
enough_salt_sold = df.salt > 60

In [46]:
df[enough_salt_sold]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mar,221,89.0,72
Apr,77,87.0,20


### Combining filters

In [50]:
df[(df.salt >= 50) & (df.eggs < 200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,110,50.0,31
Apr,77,87.0,20


In [51]:
df[(df.salt >= 50) | (df.eggs < 200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


### DataFrames with zeros and NaNs

In [52]:
df2 = df.copy()

In [53]:
df2['bacon'] = [0, 0, 50, 60, 70, 80]

In [54]:
df2

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


### Select columns with all nonzeros

In [55]:
df2.loc[:, df2.all()]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


### Select columns with any nonzeros

In [59]:
df2.loc[:, df2.any()]

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


### Select columns with any NaNs

In [63]:
df.loc[:, df.isnull().any()]

Unnamed: 0_level_0,salt
month,Unnamed: 1_level_1
Jan,12.0
Feb,50.0
Mar,89.0
Apr,87.0
May,
Jun,60.0


### Select columns without NaNs

In [65]:
df.loc[:, df.notnull().all()]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52
Jun,205,55


### Drop rows with any NaNs

In [66]:
df.dropna(how='any')

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
Jun,205,60.0,55


### Filtering a column based on another

In [67]:
df.eggs[df.salt > 50]

month
Mar    221
Apr     77
Jun    205
Name: eggs, dtype: int64

### Modifying a column based on another

In [68]:
df.eggs[df.salt > 55] += 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Transforming DataFrames

### DataFrame vectorized methods

In [69]:
df.floordiv(12) # Convert to dozens unit

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Jan to Jun
Data columns (total 3 columns):
eggs    6 non-null int64
salt    5 non-null float64
spam    6 non-null int64
dtypes: float64(1), int64(2)
memory usage: 352.0+ bytes


### Numpy vectorized functions

In [71]:
import numpy as np

In [73]:
np.floor_divide(df, 12) # Convert to dozens unit

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3.0,1.0,1.0
Feb,9.0,4.0,2.0
Mar,18.0,7.0,6.0
Apr,6.0,7.0,1.0
May,11.0,,4.0
Jun,17.0,5.0,4.0


### Plain python functions

In [76]:
def dozens(n):
    return n//12

In [77]:
df.apply(dozens) # Convert to dozens unit

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [78]:
df.apply(lambda n: n//12)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


### Storing a transformation

In [79]:
df['dozens_of_eggs'] = df.eggs.floordiv(12)

In [80]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,3
Feb,110,50.0,31,9
Mar,226,89.0,72,18
Apr,82,87.0,20,6
May,132,,52,11
Jun,210,60.0,55,17


In [82]:
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

### Working with string values

In [83]:
df.index = df.index.str.upper()

In [84]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAN,47,12.0,17,3
FEB,110,50.0,31,9
MAR,226,89.0,72,18
APR,82,87.0,20,6
MAY,132,,52,11
JUN,210,60.0,55,17


In [85]:
df.index = df.index.map(str.lower)

In [86]:
df['salty_eggs'] = df.salt + df.dozens_of_eggs

In [87]:
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs,salty_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jan,47,12.0,17,3,15.0
feb,110,50.0,31,9,59.0
mar,226,89.0,72,18,107.0
apr,82,87.0,20,6,93.0
may,132,,52,11,
jun,210,60.0,55,17,77.0
