In [5]:
import pandas as pd

In [6]:
from pandas import Series, DataFrame

In [7]:
import numpy as np

In [8]:
from numpy import nan as NA

In [9]:
df = DataFrame(np.random.randn(7, 3))

### <span style='color:red'>Filling in Missing Data 

Rather than filtering out missing data, you may want to fill in the “holes” in any number of ways.
we use <span style='color:lime'>fillna</span> for this:

In [10]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.483193,0.457153,-0.339506
1,-1.785743,-1.369151,-0.324709
2,1.128634,0.908643,0.050394
3,-0.671962,-1.859586,-0.338575
4,-0.318593,-0.606542,-0.492881
5,-1.561441,1.211132,2.230623
6,0.887587,-1.766855,-0.45322


Calling <span style='color:coral'>fillna with a dict</span> you can use a different fill value for each column: 

In [11]:
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,0.483193,0.457153,-0.339506
1,-1.785743,-1.369151,-0.324709
2,1.128634,0.908643,0.050394
3,-0.671962,-1.859586,-0.338575
4,-0.318593,-0.606542,-0.492881
5,-1.561441,1.211132,2.230623
6,0.887587,-1.766855,-0.45322


In [13]:
_ = df.fillna(0, inplace=True)  # always returns a reference to the filled object

In [14]:
df 

Unnamed: 0,0,1,2
0,0.483193,0.457153,-0.339506
1,-1.785743,-1.369151,-0.324709
2,1.128634,0.908643,0.050394
3,-0.671962,-1.859586,-0.338575
4,-0.318593,-0.606542,-0.492881
5,-1.561441,1.211132,2.230623
6,0.887587,-1.766855,-0.45322


In [15]:
df = DataFrame(np.random.randn(6, 3))

In [16]:
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [17]:
df

Unnamed: 0,0,1,2
0,1.346783,-0.001861,0.274994
1,1.76039,-0.194982,-0.703207
2,1.215265,,1.850664
3,0.701917,,0.659364
4,-0.614564,,
5,0.991411,,


In [18]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.346783,-0.001861,0.274994
1,1.76039,-0.194982,-0.703207
2,1.215265,-0.194982,1.850664
3,0.701917,-0.194982,0.659364
4,-0.614564,-0.194982,0.659364
5,0.991411,-0.194982,0.659364


In [19]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,1.346783,-0.001861,0.274994
1,1.76039,-0.194982,-0.703207
2,1.215265,-0.194982,1.850664
3,0.701917,-0.194982,0.659364
4,-0.614564,,0.659364
5,0.991411,,0.659364


<span style='color:orchid'>With fillna you can do lots of other things with a little creativity. For example, you might pass the mean or median value of a Series:</span>

In [20]:
data = Series([1., NA, 3.5, NA, 7])

In [21]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

<span style='color:mediumseagreen'>fillna function arguments

<span style='color:mediumseagreen'>value:</span> Scalar value or dict-like object to use to fill missing values 

<span style='color:mediumseagreen'>method:</span> Interpolation, by default 'ffill' if function called with no other arguments 

<span style='color:mediumseagreen'>axis:</span> Axis to fill on, default axis=0 

<span style='color:mediumseagreen'>inplace:</span> Modify the calling object without producing a copy 

<span style='color:mediumseagreen'>limit:</span> For forward and backward filling, maximum number of consecutive periods to fill


### <span style='color:red'>Hierarchical Indexing 

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form.

In [22]:
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

In [23]:
data

a  1    0.185879
   2   -0.142880
   3    0.615779
b  1   -0.636779
   2    0.911484
   3    0.447551
c  1   -0.048898
   2   -0.225192
d  2    0.819550
   3   -0.232880
dtype: float64

what you saw is a Series with a MultiIndex as its index. 

In [24]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [25]:
data['b'] 

1   -0.636779
2    0.911484
3    0.447551
dtype: float64

In [26]:
data['b':'c']

b  1   -0.636779
   2    0.911484
   3    0.447551
c  1   -0.048898
   2   -0.225192
dtype: float64

In [27]:
data.ix[['b', 'd']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


b  1   -0.636779
   2    0.911484
   3    0.447551
d  2    0.819550
   3   -0.232880
dtype: float64

In [28]:
data[:, 2] 

a   -0.142880
b    0.911484
c   -0.225192
d    0.819550
dtype: float64

Hierarchical indexing plays a critical role in reshaping data and group-based operations like forming a pivot table. For example, this data could be rearranged into a DataFrame using its <span style='color:fuchsia'>unstack </span>method: 

In [29]:
data.unstack()

Unnamed: 0,1,2,3
a,0.185879,-0.14288,0.615779
b,-0.636779,0.911484,0.447551
c,-0.048898,-0.225192,
d,,0.81955,-0.23288


In [30]:
data.unstack().stack() 

a  1    0.185879
   2   -0.142880
   3    0.615779
b  1   -0.636779
   2    0.911484
   3    0.447551
c  1   -0.048898
   2   -0.225192
d  2    0.819550
   3   -0.232880
dtype: float64

With a DataFrame, either axis can have a hierarchical index: 

In [31]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],['Green', 'Red', 'Green']])

In [32]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


The hierarchical levels can have names (as strings or any Python objects).

<span style='color:mediumvioletred'>don’t confuse the index names with the axis labels !!! 

In [33]:
frame.index.names = ['key1', 'key2']

In [34]:
frame.columns.names = ['state', 'color']

In [35]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [38]:
frame['Ohio'] 

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### <span style='color:red'>Reordering and Sorting Levels 

At times you will need to rearrange the order of the levels on an axis or sort the data by the values in one specific level. The <span style='color:coral'>swaplevel</span> takes two level numbers or names and returns a new object with the levels interchanged (but the data is otherwise unaltered): 

In [39]:
frame.swaplevel('key1', 'key2') 

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


<span style='color:cadetblue'>sortlevel</span>, on the other hand, sorts the data (stably) using only the values in a single level

In [40]:
frame.sortlevel(1)           

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [41]:
frame.swaplevel(0, 1).sortlevel(0) 

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### <span style='color:red'>Summary Statistics by Level 

Many descriptive and summary statistics on DataFrame and Series have a level option in which you can specify the level you want to sum by on a particular axis. 

In [42]:
frame.sum(level='key2')   

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [43]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### <span style='color:red'>Using a DataFrame’s Columns 

It’s not unusual to want to use one or more columns from a DataFrame as the row index; alternatively, you may wish to move the row index into the DataFrame’s columns :

In [44]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})

In [45]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


DataFrame’s <span style='color:magenta'>set_index</span> function will create a new DataFrame using one or more of its columns as the index:

In [46]:
frame2 = frame.set_index(['c', 'd'])

In [47]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


By default the columns are removed from the DataFrame, though you can leave them in:

In [48]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


<span style='color:magenta'>reset_index</span>, on the other hand, does the opposite of <span style='color:magenta'>set_index</span>

In [49]:
frame2.reset_index() 

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## <span style='color:red'>Other pandas Topics

Here are some additional topics that may be of use to you in your data travels.

### <span style='color:red'>Integer Indexing

In [54]:
ser = Series(np.arange(3.))

In [59]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [55]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])

In [56]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [57]:
ser2[-1]

2.0

In [60]:
 ser.ix[:1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


0    0.0
1    1.0
dtype: float64

### <span style='color:red'>Panel Data 

To create a Panel, you can use a dict of DataFrame objects or a three-dimensional ndarray:

In [61]:
import pandas.io.data as web

ModuleNotFoundError: No module named 'pandas.io.data'

In [62]:
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012')) 
                      for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL'])) 

NameError: name 'web' is not defined