In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
obj = pd.Series([4, 7, -5, 3])
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [3]:
# Reinfexing
# It means to create a new object with the data conformed to a new index.
obj = pd.Series(data = [4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [4]:
obj.reindex(['a', 'b', 'c', 'd', 'e']) 

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [5]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [6]:
# ffill or pad Fill (or carry) values forward
# bfill or backfill Fill (or carry) values backward
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [7]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [8]:
# Both can be reindexed in one shot, though interpolation will only apply row-wise
states = ['Texas', 'Utah', 'California']
#frame.reindex(columns = states) # change columns order 
frame = frame.reindex(index=['a', 'b', 'c', 'd'], method = 'ffill')
frame = frame.reindex(columns = states)
frame

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


### Dropping entires from an axis

In [13]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop(index = 'c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [20]:
data = pd.DataFrame(data = np.arange(16).reshape(4, 4),
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
print(data)
data.drop(index = ['Colorado', 'Ohio'])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


### Indexing, Selection, and filtering

In [28]:
data = pd.DataFrame(data = np.arange(16).reshape(4, 4), 
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'], 
                    columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
data.ix[2] # the third row

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

### Adding these together retrun a DataFrame whose index and columns are the unions of the ones in each DataFrame

In [48]:
df1 = pd.DataFrame(data = np.arange(9).reshape(3, 3), 
                   columns = list('bcd'),
                   index = ['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(data = np.arange(12.).reshape(4, 3), 
                   columns = list('bde'),
                   index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
print('df1 =>\n', df1)
print("-"*20)
print('df2 =>\n', df2)
print("-"*20)
print('df1 + df2 =>\n', df1 + df2)

df1 =>
           b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
--------------------
df2 =>
           b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
--------------------
df1 + df2 =>
             b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


#### Using the `add` method on df1 and df2, I pass df2 and an argument to fill_value :

In [56]:
df1.add(other = df2, fill_value = 0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


### Function application and mapping
Another frequent operation is applying a function on 1D arrays to each column or row.

In [61]:
frame = pd.DataFrame(data = np.random.randn(4, 3), columns = list('bde'),
                     index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [62]:
f = lambda x : x.max() - x.min()

In [66]:
frame.apply(func = f, axis = 0) # axis = 0 => columns 

b    2.009313
d    2.456111
e    1.398167
dtype: float64

__**The function passed to apply need not return a scalar value, it can also return a `Series` with multiple values*__

In [73]:
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])

In [77]:
frame.apply(f)

Unnamed: 0,b,d,e
min,0.138948,-1.086464,-0.854771
max,2.148261,1.369647,0.543396


Suppose you want to compute a formatted string from each floating point value in frame. 

In [82]:
float_format = lambda x : "%.2f" % x
frame.applymap(float_format)

Unnamed: 0,b,d,e
Utah,0.32,0.24,0.54
Ohio,0.14,-1.09,0.17
Texas,2.15,1.14,-0.85
Oregon,1.37,1.37,-0.16


### Sort and Ranking

In [89]:
frame = pd.DataFrame(data = np.arange(8).reshape(2, 4), 
                     index = ['three', 'one'], 
                     columns = ['d', 'a', 'b', 'c'])
print('-'*5 + 'frame' + '-'*5)
print(frame)
print('-'*5 + 'sort data by index' + '-'*5)
print(frame.sort_index())
print('-'*5 + 'sort data by column' + '-'*5)
print(frame.sort_index(axis = 1))

-----frame-----
       d  a  b  c
three  0  1  2  3
one    4  5  6  7
-----sort data by index-----
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
-----sort data by column-----
       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [93]:
frame = pd.DataFrame(data = {'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by = ['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [95]:
# rank
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [106]:
# duplicate index value
obj = pd.Series(data = range(5), index = ['a', 'a', 'b', 'b', 'c'])
print('-'*5 + 'Series' + '-'*5)
print(obj)
print('-'*3 + 'Checking index values whether is unique or not' + '-'*3)
print(obj.index.is_unique)

-----Series-----
a    0
a    1
b    2
b    3
c    4
dtype: int64
---Checking index values whether is unique or not---
False


### Correlation and Covariance

In [2]:
# get stock price and volumes obtained from Yahoo Finance
import pandas_datareader.data as web
from datetime import datetime 

In [43]:
start = datetime(year = 2000, month = 1, day = 1)
end = datetime(year = 2010, month = 1, day = 1)

all_data = {}
for company_name in ['AAPL', 'IBM', 'MSFT']:
    data = web.DataReader(name = company_name, data_source = 'morningstar', start = start, end = end)
    data.index = data.index.levels[1] # reset index 
    all_data[company_name] = data

In [44]:
close_price = {tic: data['Close'] for tic, data in all_data.items()}
price = pd.DataFrame(data = close_price)

vol = {tic: data['Volume'] for tic, data in all_data.items()}
volume = pd.DataFrame(vol)

In [50]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-28,0.012293,0.013326,0.005484
2009-12-29,-0.011862,-0.003477,0.007058
2009-12-30,0.012149,0.005461,-0.013699
2009-12-31,-0.00429,-0.012597,-0.015504
2010-01-01,0.0,0.0,0.0


In [57]:
print("Correlation :", returns["AAPL"].corr(returns['IBM']))
print("Covariance :", returns["AAPL"].corr(returns['IBM']))
print('\n')
print("-"*3 + 'Correlation Matrix' + '-'*3)
print(returns.corr())
print("-"*3 + 'Corvarince Matrix' + '-'*3)
print(returns.cov())

Correlation : 0.4229011586484616
Covariance : 0.4229011586484616


---Correlation Matrix---
          AAPL       IBM      MSFT
AAPL  1.000000  0.422901  0.429110
IBM   0.422901  1.000000  0.503922
MSFT  0.429110  0.503922  1.000000
---Corvarince Matrix---
          AAPL       IBM      MSFT
AAPL  0.000987  0.000253  0.000302
IBM   0.000253  0.000361  0.000215
MSFT  0.000302  0.000215  0.000502


In [59]:
returns.corrwith(returns['IBM'])

AAPL    0.422901
IBM     1.000000
MSFT    0.503922
dtype: float64

### UniqueValues, Values Counts, and Membership

In [62]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [70]:
obj.value_counts(sort = False)

a    3
d    1
c    3
b    2
dtype: int64

In [73]:
mask = obj.isin(['b', 'c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [76]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], 
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})

In [80]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [90]:
# Handling missing value
data = pd.DataFrame([[1, 6.5, 3], [1, np.NAN, np.NAN], 
                     [np.NAN, np.NAN, np.NAN], [np.NAN, 6.5, 3.]])
print(data)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [89]:
cleaned = data.dropna()
cleaned
# Passing `how = all` will only drop rows that all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [93]:
df = pd.DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = np.NAN; df.ix[:2, 2] = np.NAN

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [98]:
print(df)
print(df.dropna(thresh = 2))

          0         1         2
0 -0.755588       NaN       NaN
1 -0.687043       NaN       NaN
2  0.528455       NaN       NaN
3  0.758197       NaN -0.340304
4  0.841637       NaN  0.260427
5  0.509210 -0.616130 -2.418914
6  0.459743 -1.040918 -2.183278
          0         1         2
3  0.758197       NaN -0.340304
4  0.841637       NaN  0.260427
5  0.509210 -0.616130 -2.418914
6  0.459743 -1.040918 -2.183278


In [101]:
# Calling `fillna` with a dict you can use a different fill value for each columns
df.fillna({1: 0.5, 2:-1})

Unnamed: 0,0,1,2
0,-0.755588,0.5,-1.0
1,-0.687043,0.5,-1.0
2,0.528455,0.5,-1.0
3,0.758197,0.5,-0.340304
4,0.841637,0.5,0.260427
5,0.50921,-0.61613,-2.418914
6,0.459743,-1.040918,-2.183278


In [106]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.NAN; df.iloc[4:, 2] = np.NAN
df

Unnamed: 0,0,1,2
0,0.032513,0.537187,-1.41983
1,-0.928281,-0.83539,0.857687
2,-0.63778,,0.147689
3,-0.030758,,-0.266962
4,-1.654745,,
5,-0.839113,,


In [108]:
df.fillna(method = 'ffill', limit = 2)

Unnamed: 0,0,1,2
0,0.032513,0.537187,-1.41983
1,-0.928281,-0.83539,0.857687
2,-0.63778,-0.83539,0.147689
3,-0.030758,-0.83539,-0.266962
4,-1.654745,,-0.266962
5,-0.839113,,-0.266962


### Hierarchical Indexing

In [111]:
data = pd.Series(data = np.random.randn(10), 
                 index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                         [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1   -0.511972
   2    0.577297
   3    0.674594
b  1    0.584999
   2   -0.525896
   3   -0.228697
c  1    1.106773
   2    0.122174
d  2    1.030784
   3   -0.440420
dtype: float64

In [119]:
data[:, 2] # the third data of each level 

a    0.577297
b   -0.525896
c    0.122174
d    1.030784
dtype: float64

In [123]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.511972,0.577297,0.674594
b,0.584999,-0.525896,-0.228697
c,1.106773,0.122174,
d,,1.030784,-0.44042


In [126]:
# With a DataFrame, either axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                     columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [131]:
frame.index.names = ['key1', 'key2']
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [132]:
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [148]:
# With partial column indexing you can similarly select groups of columns:
frame['Ohio'][:2]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4


In [152]:
pd.MultiIndex.from_arrays(arrays = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']], 
                          names = ['State', 'Color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['State', 'Color'])

### Reordering and Sorting Levels

In [159]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [169]:
frame.swaplevel(0, 1).sort_index()

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### Summary Statistic by Level

In [171]:
frame.sum(level = 'key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [172]:
frame.sum(level = 'color', axis = 1) # the same as groupby

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### Using a DataFrame's Columns
#### Using one or more columns from a DataFrame as the row index.

In [175]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                     'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                     'd': [0, 1, 2, 0, 1, 2, 3]})

In [179]:
frame2 = frame.set_index(keys = ['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [181]:
# reset_index does the opposite of `set_index`
frame2.reset_index().reindex(columns = ['a', 'b', 'c', 'd'])

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [188]:
ser = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])
ser[-1]

2.0

### Panel Data

In [205]:
start = datetime(year = 2000, month = 1, day = 1)
end = datetime(year = 2010, month = 1, day = 1)

all_data = {}
for company_name in ['AAPL', 'IBM', 'MSFT']:
    data = web.DataReader(name = company_name, data_source = 'morningstar', start = start, end = end)
    data.index = data.index.levels[1] # reset index 
    all_data[company_name] = data

In [218]:
pdata = pd.Panel(data = all_data)
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 2610 (major_axis) x 5 (minor_axis)
Items axis: AAPL to MSFT
Major_axis axis: 2000-01-03 00:00:00 to 2010-01-01 00:00:00
Minor_axis axis: Close to Volume

In [226]:
pdata.swapaxes('items', 'minor')['Close']

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,AAPL,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-03,3.9978,116.0000,58.2820
2000-01-04,3.6607,112.0630,55.7812
2000-01-05,3.7143,118.2500,56.9070
2000-01-06,3.3929,117.2500,55.0000
2000-01-07,3.5536,113.5000,55.7190
2000-01-10,3.4911,118.5000,56.1250
2000-01-11,3.3125,119.0000,54.6880
2000-01-12,3.1183,119.5000,52.9070
2000-01-13,3.4643,118.2500,53.9070
2000-01-14,3.5871,119.6250,56.1250


In [234]:
pdata.ix[:, '2000-01-04', ['Close', 'High']]

Unnamed: 0,AAPL,IBM,MSFT
Close,3.6607,112.063,55.7812
High,3.951,114.5,58.5625


In [235]:
pdata.to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,AAPL,IBM,MSFT
Date,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,Close,3.997800e+00,1.160000e+02,5.828200e+01
2000-01-03,High,4.017900e+00,1.160000e+02,5.931300e+01
2000-01-03,Low,3.631700e+00,1.118750e+02,5.600000e+01
2000-01-03,Open,3.745700e+00,1.124400e+02,5.869170e+01
2000-01-03,Volume,1.330196e+08,1.034720e+07,5.323220e+07
2000-01-04,Close,3.660700e+00,1.120630e+02,5.578120e+01
2000-01-04,High,3.951000e+00,1.145000e+02,5.856250e+01
2000-01-04,Low,3.613800e+00,1.108750e+02,5.575000e+01
2000-01-04,Open,3.865700e+00,1.133100e+02,5.793750e+01
2000-01-04,Volume,1.265404e+08,8.227800e+06,5.408100e+07
