# Pandas

## From http://pandas.pydata.org/pandas-docs/stable/10min.html

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Series

In [3]:
# Index 1-5 of [1.0, 3.0, 5.0, NaN, 6.0, 8.0]
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Creates 6 dates starting from 2013/01/01
# ['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06']
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

## DataFrame

In [5]:
# Creates random number set with rows labeled with dates, columns as 'A', 'B', 'C', 'D'
# 6,4 is shape of random rows,columns
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.793335,0.520483,1.324437,-0.321294
2013-01-02,1.054783,0.797869,1.550447,-1.913093
2013-01-03,-0.726896,0.091546,-2.183416,1.653323
2013-01-04,0.24637,-0.802665,1.781936,-1.8754
2013-01-05,0.526871,0.061124,1.358796,0.078198
2013-01-06,2.362113,-1.139332,0.864286,0.788251


In [6]:
# DataFrame from dict instead of array, this has dtypes differing for every column
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Displaying DataFrame info

In [7]:
df.head() 		# first columns

Unnamed: 0,A,B,C,D
2013-01-01,-0.793335,0.520483,1.324437,-0.321294
2013-01-02,1.054783,0.797869,1.550447,-1.913093
2013-01-03,-0.726896,0.091546,-2.183416,1.653323
2013-01-04,0.24637,-0.802665,1.781936,-1.8754
2013-01-05,0.526871,0.061124,1.358796,0.078198


In [8]:
df.tail(3) 	# last 3 columns

Unnamed: 0,A,B,C,D
2013-01-04,0.24637,-0.802665,1.781936,-1.8754
2013-01-05,0.526871,0.061124,1.358796,0.078198
2013-01-06,2.362113,-1.139332,0.864286,0.788251


In [9]:
df.index 		# array of indecies [0, 1, 2, 3]

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns 	# array of column names [u'A', u'B', a'C', u'D'...]

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
df.values 		# 2d array of values ignoring indices or column names

array([[-0.79333468,  0.52048312,  1.32443725, -0.32129431],
       [ 1.05478338,  0.79786892,  1.55044711, -1.91309272],
       [-0.72689555,  0.09154629, -2.1834158 ,  1.6533226 ],
       [ 0.24637045, -0.80266485,  1.78193638, -1.87539965],
       [ 0.52687084,  0.06112362,  1.35879624,  0.07819769],
       [ 2.36211299, -1.13933169,  0.86428639,  0.78825071]])

In [12]:
df.describe() 	# Summary of mean, count, std, min, max, % ranges for 25 50 75

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.444985,-0.078496,0.782748,-0.265003
std,1.183227,0.751664,1.484438,1.429675
min,-0.793335,-1.139332,-2.183416,-1.913093
25%,-0.483579,-0.586718,0.979324,-1.486873
50%,0.386621,0.076335,1.341617,-0.121548
75%,0.922805,0.413249,1.502534,0.610737
max,2.362113,0.797869,1.781936,1.653323


## DataFrame Transformations

In [13]:
df.T 			# Transpose

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.793335,1.054783,-0.726896,0.24637,0.526871,2.362113
B,0.520483,0.797869,0.091546,-0.802665,0.061124,-1.139332
C,1.324437,1.550447,-2.183416,1.781936,1.358796,0.864286
D,-0.321294,-1.913093,1.653323,-1.8754,0.078198,0.788251


In [14]:
df.sort_index(axis=1, ascending='false') # Sort by index

Unnamed: 0,A,B,C,D
2013-01-01,-0.793335,0.520483,1.324437,-0.321294
2013-01-02,1.054783,0.797869,1.550447,-1.913093
2013-01-03,-0.726896,0.091546,-2.183416,1.653323
2013-01-04,0.24637,-0.802665,1.781936,-1.8754
2013-01-05,0.526871,0.061124,1.358796,0.078198
2013-01-06,2.362113,-1.139332,0.864286,0.788251


In [15]:
df.sort_values(by='B') # Sorts rows in ascending order of B

Unnamed: 0,A,B,C,D
2013-01-06,2.362113,-1.139332,0.864286,0.788251
2013-01-04,0.24637,-0.802665,1.781936,-1.8754
2013-01-05,0.526871,0.061124,1.358796,0.078198
2013-01-03,-0.726896,0.091546,-2.183416,1.653323
2013-01-01,-0.793335,0.520483,1.324437,-0.321294
2013-01-02,1.054783,0.797869,1.550447,-1.913093


In [16]:
df['E'] = ['one','two','three','four','five','six'] # adds new column
df

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.793335,0.520483,1.324437,-0.321294,one
2013-01-02,1.054783,0.797869,1.550447,-1.913093,two
2013-01-03,-0.726896,0.091546,-2.183416,1.653323,three
2013-01-04,0.24637,-0.802665,1.781936,-1.8754,four
2013-01-05,0.526871,0.061124,1.358796,0.078198,five
2013-01-06,2.362113,-1.139332,0.864286,0.788251,six


## Getting values

In [17]:
df['A'] # Returns column 'A' with indices

2013-01-01   -0.793335
2013-01-02    1.054783
2013-01-03   -0.726896
2013-01-04    0.246370
2013-01-05    0.526871
2013-01-06    2.362113
Freq: D, Name: A, dtype: float64

In [18]:
df[0:3] # first 3 rows

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.793335,0.520483,1.324437,-0.321294,one
2013-01-02,1.054783,0.797869,1.550447,-1.913093,two
2013-01-03,-0.726896,0.091546,-2.183416,1.653323,three


In [19]:
df['20130102':'20130104'] # rows by indices

Unnamed: 0,A,B,C,D,E
2013-01-02,1.054783,0.797869,1.550447,-1.913093,two
2013-01-03,-0.726896,0.091546,-2.183416,1.653323,three
2013-01-04,0.24637,-0.802665,1.781936,-1.8754,four


In [20]:
df.loc[dates[0]] # get columbs (A-D) for first date as one column

A   -0.793335
B    0.520483
C     1.32444
D   -0.321294
E         one
Name: 2013-01-01 00:00:00, dtype: object

In [21]:
df.loc[:,['A','B']] # only use columns A, B

Unnamed: 0,A,B
2013-01-01,-0.793335,0.520483
2013-01-02,1.054783,0.797869
2013-01-03,-0.726896,0.091546
2013-01-04,0.24637,-0.802665
2013-01-05,0.526871,0.061124
2013-01-06,2.362113,-1.139332


In [22]:
df.loc['20130102':'20130104',['A','B']] # get 2x2 of those dates A, B

Unnamed: 0,A,B
2013-01-02,1.054783,0.797869
2013-01-03,-0.726896,0.091546
2013-01-04,0.24637,-0.802665


In [23]:
df.loc['20130102',['A','B']] # get A,B for this date as one column

A     1.05478
B    0.797869
Name: 2013-01-02 00:00:00, dtype: object

In [24]:
df.loc[dates[0],'A'] # get scalar value at index

-0.79333468135975449

## Filtering

In [25]:
df[df.A > 0] # Only entries where A > 0

Unnamed: 0,A,B,C,D,E
2013-01-02,1.054783,0.797869,1.550447,-1.913093,two
2013-01-04,0.24637,-0.802665,1.781936,-1.8754,four
2013-01-05,0.526871,0.061124,1.358796,0.078198,five
2013-01-06,2.362113,-1.139332,0.864286,0.788251,six


In [26]:
df[df > 0] # < 0 entries get replaces with NaN

Unnamed: 0,A,B,C,D,E
2013-01-01,,0.520483,1.324437,,one
2013-01-02,1.054783,0.797869,1.550447,,two
2013-01-03,,0.091546,,1.653323,three
2013-01-04,0.24637,,1.781936,,four
2013-01-05,0.526871,0.061124,1.358796,0.078198,five
2013-01-06,2.362113,,0.864286,0.788251,six


In [32]:
df[df['E'].isin(['two','four'])] # selectes where 'E' is in ['two', 'four']

Unnamed: 0,A,B,C,D,E
2013-01-02,1.054783,0.797869,1.550447,-1.913093,two
2013-01-04,0.24637,-0.802665,1.781936,-1.8754,four
