# 10 Minutes to Pandas Tutorial

This notebook works through the official pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) tutorial.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### pandas Series
A pandas Series is a "one-dimensional ndarray with axis labels (including time series)": a single column of values, each paired with a label.

In [3]:
# Build a pandas Series from a plain Python list.
# No index is passed, so the labels default to the integers 0 through 5.
# The np.nan entry is a float, so the whole Series is stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [11]:
# pd.date_range() is a function that builds a filled-in DatetimeIndex for you:
# here, six consecutive daily timestamps starting at 2013-01-01
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [108]:
# pd.DataFrame() builds the dataframe from a 2-D array of values
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# NOTE: no random seed is set, so these values change on every run; that is why
# the df printed further down this notebook shows different numbers
# dates becomes the index (row labels) of the dataframe
# list('ABCD') splits the string into the column labels ['A', 'B', 'C', 'D']

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.933851,0.192824,-1.049753,0.258736
2013-01-02,1.884892,-0.158986,0.567058,0.64644
2013-01-03,-0.749961,-0.200973,0.589294,0.270939
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438
2013-01-05,-1.874316,-0.993146,0.406705,1.623868
2013-01-06,0.058644,-0.835104,0.810017,-0.079706


In [16]:
# Build a dataframe from a Python dict: each key becomes a column label and
# each value supplies that column's data.
# Array-like values must all have the same number of items (4 here);
# scalar values are repeated on every row.
df2 = pd.DataFrame({
    # scalar float, repeated on every row
    'A': 1.,
    # scalar timestamp, repeated on every row
    'B': pd.Timestamp('20130102'),
    # the only value carrying labels, so its index A-D becomes the row index
    'C': pd.Series(1, index=list('ABCD'), dtype='float32'),
    # numpy array holding the number 3 four times
    'D': np.array([3] * 4, dtype='int32'),
    # categorical column built from four strings
    'E': pd.Categorical(["test", "train", "test", "train"]),
    # scalar string, repeated on every row
    'F': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E,F
A,1.0,2013-01-02,1.0,3,test,foo
B,1.0,2013-01-02,1.0,3,train,foo
C,1.0,2013-01-02,1.0,3,test,foo
D,1.0,2013-01-02,1.0,3,train,foo


In [17]:
# show the data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [18]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F
A,1.0,2013-01-02,1.0,3,test,foo
B,1.0,2013-01-02,1.0,3,train,foo
C,1.0,2013-01-02,1.0,3,test,foo


In [19]:
# see bottom n rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F
C,1.0,2013-01-02,1.0,3,test,foo
D,1.0,2013-01-02,1.0,3,train,foo


In [21]:
# show the row labels; they are held in a pandas Index object, which is immutable
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
# show the column labels; they are also held in an immutable pandas Index object
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [13]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [24]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.299041,-0.295299,0.583028,-0.22228
std,0.653414,0.897767,0.402299,0.965052
min,-1.519394,-1.148109,-0.065757,-1.458126
25%,-0.405806,-1.0278,0.391078,-0.906328
50%,-0.080948,-0.478482,0.640283,-0.141124
75%,0.058582,0.193686,0.909502,0.249388
max,0.296046,1.12644,0.976106,1.188533


In [25]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.091202,-0.039278,-0.122618,-1.519394,-0.500202,0.296046
B,0.337919,1.12644,-1.131082,-0.23901,-1.148109,-0.717953
C,0.353192,0.504735,0.954059,-0.065757,0.775831,0.976106
D,0.077861,1.188533,-1.458126,-1.088401,-0.360108,0.306564


In [32]:
# sort by the labels along an axis (the data values themselves are not compared)
df.sort_index(axis=1, ascending=False)
# axis=1 sorts by column labels (reorders the columns), axis=0 sorts by row labels (the index)
# ascending=False sorts in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.077861,0.353192,0.337919,0.091202
2013-01-02,1.188533,0.504735,1.12644,-0.039278
2013-01-03,-1.458126,0.954059,-1.131082,-0.122618
2013-01-04,-1.088401,-0.065757,-0.23901,-1.519394
2013-01-05,-0.360108,0.775831,-1.148109,-0.500202
2013-01-06,0.306564,0.976106,-0.717953,0.296046


In [36]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-05,-0.500202,-1.148109,0.775831,-0.360108
2013-01-03,-0.122618,-1.131082,0.954059,-1.458126
2013-01-06,0.296046,-0.717953,0.976106,0.306564
2013-01-04,-1.519394,-0.23901,-0.065757,-1.088401
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533


In [37]:
# Python / Numpy expressions for selecting and setting are fine, but 
# for production code use pandas data access methods:
# .at, .iat, .loc and .iloc (.ix also existed when this was written, but it has since been deprecated and removed from pandas)

In [38]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533
2013-01-03,-0.122618,-1.131082,0.954059,-1.458126
2013-01-04,-1.519394,-0.23901,-0.065757,-1.088401
2013-01-05,-0.500202,-1.148109,0.775831,-0.360108
2013-01-06,0.296046,-0.717953,0.976106,0.306564


In [41]:
# Selecting a single column, which yields a Series, equivalent to df.A
# both are a pandas Series
df['A']

2013-01-01    0.091202
2013-01-02   -0.039278
2013-01-03   -0.122618
2013-01-04   -1.519394
2013-01-05   -0.500202
2013-01-06    0.296046
Freq: D, Name: A, dtype: float64

In [42]:
df.A

2013-01-01    0.091202
2013-01-02   -0.039278
2013-01-03   -0.122618
2013-01-04   -1.519394
2013-01-05   -0.500202
2013-01-06    0.296046
Freq: D, Name: A, dtype: float64

In [43]:
# select via [] with an integer slice, which picks rows by position
# from position 0 (inclusive) up to position 2 (exclusive), so rows 0 and 1
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533


In [44]:
# slice using index labels (date strings here)
# note that, unlike an integer slice, the endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533


In [47]:
# selection by label, returns a pandas Series
df.loc[dates[0]]

A    0.091202
B    0.337919
C    0.353192
D    0.077861
Name: 2013-01-01 00:00:00, dtype: float64

In [24]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [25]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [49]:
# returns first row from the df dataframe, a pandas Series
df.loc[dates[0]]

A    0.091202
B    0.337919
C    0.353192
D    0.077861
Name: 2013-01-01 00:00:00, dtype: float64

In [50]:
# here the ':' indicates all rows, and the list with ['A','B'] indicate columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.091202,0.337919
2013-01-02,-0.039278,1.12644
2013-01-03,-0.122618,-1.131082
2013-01-04,-1.519394,-0.23901
2013-01-05,-0.500202,-1.148109
2013-01-06,0.296046,-0.717953


In [52]:
# same exact selection but using index labels
# note, index (rows) selection is inclusive
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,0.091202,0.337919
2013-01-02,-0.039278,1.12644
2013-01-03,-0.122618,-1.131082
2013-01-04,-1.519394,-0.23901
2013-01-05,-0.500202,-1.148109
2013-01-06,0.296046,-0.717953


In [53]:
# just from a single row and columns A and B returns a pandas Series
df.loc['20130102', ['A','B']]

A   -0.039278
B    1.126440
Name: 2013-01-02 00:00:00, dtype: float64

In [55]:
# get a single value (scalar)
df.loc[dates[0],'A']

0.091202404420735156

In [57]:
# same exact value but using a specific label
df.loc['20130101','A']

0.091202404420735156

In [58]:
# .at is same as above
df.at[dates[0],'A']

0.091202404420735156

In [59]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533
2013-01-03,-0.122618,-1.131082,0.954059,-1.458126
2013-01-04,-1.519394,-0.23901,-0.065757,-1.088401
2013-01-05,-0.500202,-1.148109,0.775831,-0.360108
2013-01-06,0.296046,-0.717953,0.976106,0.306564


In [60]:
# select the row at integer position 3 (the fourth row, since positions start at 0)
df.iloc[3]

A   -1.519394
B   -0.239010
C   -0.065757
D   -1.088401
Name: 2013-01-04 00:00:00, dtype: float64

In [61]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-1.519394,-0.23901
2013-01-05,-0.500202,-1.148109


In [62]:
# by lists of integer position locations
# shows the rows at positions 1, 2 and 4, and the columns at positions 0 and 2
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.039278,0.504735
2013-01-03,-0.122618,0.954059
2013-01-05,-0.500202,0.775831


In [63]:
# for slicing columns explicitly
# all rows, columns from position 1 (inclusive) to position 3 (exclusive)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,0.337919,0.353192
2013-01-02,1.12644,0.504735
2013-01-03,-1.131082,0.954059
2013-01-04,-0.23901,-0.065757
2013-01-05,-1.148109,0.775831
2013-01-06,-0.717953,0.976106


In [71]:
# explicitly get a value, returns a float64
df.iloc[1,1]

1.1264397039257281

In [72]:
# fast access to a scalar(same effect as above method), returns a float64
df.iat[1,1]

1.1264397039257281

In [73]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,-0.039278,1.12644,0.504735,1.188533
2013-01-03,-0.122618,-1.131082,0.954059,-1.458126
2013-01-04,-1.519394,-0.23901,-0.065757,-1.088401
2013-01-05,-0.500202,-1.148109,0.775831,-0.360108
2013-01-06,0.296046,-0.717953,0.976106,0.306564


In [74]:
# some boolean indexing
# "where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-06,0.296046,-0.717953,0.976106,0.306564


In [75]:
# "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.091202,0.337919,0.353192,0.077861
2013-01-02,,1.12644,0.504735,1.188533
2013-01-03,,,0.954059,
2013-01-04,,,,
2013-01-05,,,0.775831,
2013-01-06,0.296046,,0.976106,0.306564


In [133]:
# filter through the dataframe using isin()

# make a copy of the dataframe called df2
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.933851,0.192824,-1.049753,0.258736,one
2013-01-02,1.884892,-0.158986,0.567058,0.64644,one
2013-01-03,-0.749961,-0.200973,0.589294,0.270939,two
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438,three
2013-01-05,-1.874316,-0.993146,0.406705,1.623868,four
2013-01-06,0.058644,-0.835104,0.810017,-0.079706,three


In [150]:
# referencing column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

# How the selection works: isin() returns a boolean Series with the same index as df2,
# and indexing a dataframe with a boolean Series keeps only the rows marked True.
# Background discussion:
# http://stackoverflow.com/questions/41733696/how-does-pandas-use-a-series-object-to-slice-a-data-frame

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.933851,0.192824,-1.049753,0.258736,one
2013-01-02,1.884892,-0.158986,0.567058,0.64644,one
2013-01-05,-1.874316,-0.993146,0.406705,1.623868,four


In [164]:
# the contents of df2[] below are a pandas Series of type bool
# wherever it is True, the corresponding label's row is returned
df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

#### Setting values in a Data Frame

In [165]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.933851,0.192824,-1.049753,0.258736
2013-01-02,1.884892,-0.158986,0.567058,0.64644
2013-01-03,-0.749961,-0.200973,0.589294,0.270939
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438
2013-01-05,-1.874316,-0.993146,0.406705,1.623868
2013-01-06,0.058644,-0.835104,0.810017,-0.079706


In [167]:
# Make a new Series that can be added to the dataframe as a column.
# REMEMBER: a dataframe can be thought of as a dict of Series that share one index.
# This Series is labelled 2013-01-02 through 2013-01-07 (six daily timestamps).
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [168]:
# now add it to df dataframe as column 'F'
# the assignment lines s1 up with df's index by label:
# s1 has no entry for 2013-01-01, so F is NaN on that row
# s1's 2013-01-07 entry has no matching row in df, so it is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.933851,0.192824,-1.049753,0.258736,
2013-01-02,1.884892,-0.158986,0.567058,0.64644,1.0
2013-01-03,-0.749961,-0.200973,0.589294,0.270939,2.0
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438,3.0
2013-01-05,-1.874316,-0.993146,0.406705,1.623868,4.0
2013-01-06,0.058644,-0.835104,0.810017,-0.079706,5.0


In [169]:
# now set values by label index and column label
df.at[dates[0], 'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.933851,0.192824,-1.049753,0.258736,0.0
2013-01-02,1.884892,-0.158986,0.567058,0.64644,1.0
2013-01-03,-0.749961,-0.200973,0.589294,0.270939,2.0
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438,3.0
2013-01-05,-1.874316,-0.993146,0.406705,1.623868,4.0
2013-01-06,0.058644,-0.835104,0.810017,-0.079706,5.0


In [170]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,0.258736,0.0
2013-01-02,1.884892,-0.158986,0.567058,0.64644,1.0
2013-01-03,-0.749961,-0.200973,0.589294,0.270939,2.0
2013-01-04,-0.117302,-0.6121,1.307215,-0.108438,3.0
2013-01-05,-1.874316,-0.993146,0.406705,1.623868,4.0
2013-01-06,0.058644,-0.835104,0.810017,-0.079706,5.0


In [178]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0
2013-01-05,-1.874316,-0.993146,0.406705,5,4.0
2013-01-06,0.058644,-0.835104,0.810017,5,5.0


In [181]:
# conduct a where operation on a copy: replace every value greater
# than zero with its negative, so no positive values remain
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.192824,-1.049753,-5,0.0
2013-01-02,-1.884892,-0.158986,-0.567058,-5,-1.0
2013-01-03,-0.749961,-0.200973,-0.589294,-5,-2.0
2013-01-04,-0.117302,-0.6121,-1.307215,-5,-3.0
2013-01-05,-1.874316,-0.993146,-0.406705,-5,-4.0
2013-01-06,-0.058644,-0.835104,-0.810017,-5,-5.0


#### Missing Data

In [182]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0
2013-01-05,-1.874316,-0.993146,0.406705,5,4.0
2013-01-06,0.058644,-0.835104,0.810017,5,5.0


In [183]:
# reindex lets you change/add/delete the labels along a specified axis

# keep only the first four index labels (the rows at positions 0 to 3)
# keep the existing columns and add a new one called 'E', which starts out as NaN on every row
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.192824,-1.049753,5,0.0,
2013-01-02,1.884892,-0.158986,0.567058,5,1.0,
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0,
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0,


In [184]:
# set column E to 1 on the first two rows (it is displayed as 1.0)
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice the endpoint of a label slice on rows is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.192824,-1.049753,5,0.0,1.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0,
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0,


In [185]:
# drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.192824,-1.049753,5,0.0,1.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0,1.0


In [186]:
# fill in missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.192824,-1.049753,5,0.0,1.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0,5.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0,5.0


In [187]:
# get boolean mask where values are nan
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### Operations

#### Stats

In [188]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0
2013-01-05,-1.874316,-0.993146,0.406705,5,4.0
2013-01-06,0.058644,-0.835104,0.810017,5,5.0


In [189]:
# the mean of each column
df.mean()

A   -0.133007
B   -0.434581
C    0.438423
D    5.000000
F    2.500000
dtype: float64

In [190]:
# mean of each row, i.e. averaging across the columns (axis=1)
df.mean(axis=1)

2013-01-01    0.828614
2013-01-02    1.658593
2013-01-03    1.327672
2013-01-04    1.715562
2013-01-05    1.307848
2013-01-06    2.006711
Freq: D, dtype: float64

In [191]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [192]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [193]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0
2013-01-05,-1.874316,-0.993146,0.406705,5,4.0
2013-01-06,0.058644,-0.835104,0.810017,5,5.0


In [195]:
# subtracts Series s from every column of df, matching values up by index label;
# on the row where s is NaN the result is NaN in every column
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-0.807176,-2.049753,4.0,-1.0
2013-01-02,-1.115108,-3.158986,-2.432942,2.0,-2.0
2013-01-03,-5.749961,-5.200973,-4.410706,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-7.874316,-6.993146,-5.593295,-1.0,-2.0
2013-01-06,-7.941356,-8.835104,-7.189983,-3.0,-3.0


#### Apply

In [196]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,-0.158986,0.567058,5,1.0
2013-01-03,-0.749961,-0.200973,0.589294,5,2.0
2013-01-04,-0.117302,-0.6121,1.307215,5,3.0
2013-01-05,-1.874316,-0.993146,0.406705,5,4.0
2013-01-06,0.058644,-0.835104,0.810017,5,5.0


In [197]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.192824,-1.049753,5,0.0
2013-01-02,1.884892,0.033837,-0.482695,10,1.0
2013-01-03,1.134931,-0.167136,0.106599,15,3.0
2013-01-04,1.01763,-0.779236,1.413813,20,6.0
2013-01-05,-0.856687,-1.772383,1.820519,25,10.0
2013-01-06,-0.798043,-2.607487,2.630536,30,15.0


#### Histogramming

In [198]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

0    5
1    6
2    2
3    4
4    5
5    1
6    1
7    6
8    0
9    3
dtype: int64

In [199]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

6    2
5    2
1    2
4    1
3    1
2    1
0    1
dtype: int64

#### String Methods

In [200]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

#### Merge

In [201]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-2.819118,2.006837,2.435488,0.783915
1,-0.426165,1.703072,2.115495,-0.984837
2,0.064388,0.997028,0.709132,0.021503
3,0.368851,-0.142451,1.029008,0.241394
4,0.658756,-0.596412,-0.025259,0.606958
5,-0.155783,-0.687314,-0.510065,0.452354
6,-0.784527,-0.553008,-0.31166,1.154354
7,0.62532,0.508202,-2.688371,-0.352247
8,1.046959,-0.155165,-0.791084,0.036809
9,0.755011,-1.292141,1.415382,-0.043237


In [202]:
# break it into pieces via Python slicing - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,-2.819118,2.006837,2.435488,0.783915
1,-0.426165,1.703072,2.115495,-0.984837
2,0.064388,0.997028,0.709132,0.021503
3,0.368851,-0.142451,1.029008,0.241394
4,0.658756,-0.596412,-0.025259,0.606958
5,-0.155783,-0.687314,-0.510065,0.452354
6,-0.784527,-0.553008,-0.31166,1.154354
7,0.62532,0.508202,-2.688371,-0.352247
8,1.046959,-0.155165,-0.791084,0.036809
9,0.755011,-1.292141,1.415382,-0.043237


#### Join

In [205]:
# SQL style merges
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [206]:
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [209]:
# on='key' pairs every left row with every right row that has the same key;
# both left 'foo' rows match both right 'foo' rows, giving 2 x 2 = 4 result rows
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [212]:
# OR...
left = pd.DataFrame({'key':['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [213]:
right

Unnamed: 0,key,rval
0,foo,3
1,bar,4


In [214]:
# here each key appears exactly once on each side, so the rows pair up one-to-one:
# 'foo' joins lval 1 with rval 3, and 'bar' joins lval 2 with rval 4
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


#### Append rows to a dataframe

In [215]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray,
# i.e. a 2-D array with 8 rows of 4 values each
# contents will be randomly generated floats from the standard normal distribution
print(np.random.randn(8, 4))

[[-0.32741504 -0.15948451  0.48355257 -1.04706492]
 [-0.24029203  0.22895894  0.323666    0.67470477]
 [ 0.83451821  0.20882423 -0.07734489 -0.52000357]
 [ 0.36961986 -1.9435036  -1.61116992 -0.299691  ]
 [ 0.76648373 -0.27879385 -0.90471285 -0.4640315 ]
 [-1.31226686  0.57050664  0.08085221 -0.52236362]
 [-0.61364306 -0.0392682   1.58463036 -1.17647027]
 [ 1.00339109 -0.7915924   0.13229926 -0.21513326]]


In [216]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,-0.208477,0.058265,0.676981,-0.228303
1,0.724662,0.717066,0.85459,-0.103653
2,1.322862,1.480823,-0.024636,0.064969
3,-1.381548,0.626771,1.134512,-1.008944
4,1.118668,1.53445,-1.761045,0.280328
5,1.1986,3.415574,0.662099,-0.812849
6,0.621098,0.868136,-0.6442,-1.538942
7,-0.389942,1.077828,-0.741401,2.002013


In [217]:
# retrieve the row at integer position 3 (the fourth row) as a Series
s = df.iloc[3]
s

A   -1.381548
B    0.626771
C    1.134512
D   -1.008944
Name: 3, dtype: float64

In [218]:
# append a copy of row s as a new last row; this returns a new dataframe and leaves df unchanged
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use pd.concat instead:
# s.to_frame().T turns the Series into a one-row dataframe with the same column labels,
# and ignore_index=True renumbers the result 0..n so the new row gets the next integer label
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.208477,0.058265,0.676981,-0.228303
1,0.724662,0.717066,0.85459,-0.103653
2,1.322862,1.480823,-0.024636,0.064969
3,-1.381548,0.626771,1.134512,-1.008944
4,1.118668,1.53445,-1.761045,0.280328
5,1.1986,3.415574,0.662099,-0.812849
6,0.621098,0.868136,-0.6442,-1.538942
7,-0.389942,1.077828,-0.741401,2.002013
8,-1.381548,0.626771,1.134512,-1.008944


#### Grouping

In [219]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.1186,0.898088
1,bar,one,1.114275,-1.620672
2,foo,two,-1.251153,0.137693
3,bar,three,-1.067866,-2.052396
4,foo,two,0.800453,-0.57869
5,bar,two,0.944086,-1.585113
6,foo,one,-1.264519,0.143986
7,foo,three,-2.302335,0.98199


In [220]:
# group items by unique items in column 'A', then sum them up to create new value
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.990496,-5.258181
foo,-5.136154,1.583067


In [221]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.114275,-1.620672
bar,three,-1.067866,-2.052396
bar,two,0.944086,-1.585113
foo,one,-2.38312,1.042074
foo,three,-2.302335,0.98199
foo,two,-0.4507,-0.440997


### Reshaping

In [222]:
# zip() pairs up the i-th items of two equal-length lists into tuples;
# list() collects those tuples into a list.

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

# first I make a list of (first, second) tuples
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [223]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [224]:
# now I make a dataframe with random numbers, using the multi index for the rows
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.553676,-0.627131
bar,two,-1.389091,0.700304
baz,one,0.492627,1.897126
baz,two,-1.084886,0.517323
foo,one,0.108179,-0.617048
foo,two,0.588771,0.514054
qux,one,3.946621,-2.292334
qux,two,-0.11586,-0.081324


In [225]:
# I can slice it too if I want
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.553676,-0.627131
bar,two,-1.389091,0.700304
baz,one,0.492627,1.897126
baz,two,-1.084886,0.517323


In [226]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.553676
               B   -0.627131
       two     A   -1.389091
               B    0.700304
baz    one     A    0.492627
               B    1.897126
       two     A   -1.084886
               B    0.517323
dtype: float64

In [227]:
# and of course you can unstack it
# with no argument, unstack() moves the innermost index level back into the columns;
# pass a level number (or name) inside the parens to unstack a different level
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.553676,-0.627131
bar,two,-1.389091,0.700304
baz,one,0.492627,1.897126
baz,two,-1.084886,0.517323


#### Pivot tables

In [228]:
# Build the 12-row dataframe used for the pivot table from a dict of columns.
# The short lists are repeated with * so that every column has 12 items.
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': list(range(1, 13)),    # 1 through 12
    'E': list(range(13, 25)),   # 13 through 24
})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1,13
1,one,B,foo,2,14
2,two,C,foo,3,15
3,three,A,bar,4,16
4,one,B,bar,5,17
5,one,C,bar,6,18
6,two,A,foo,7,19
7,three,B,foo,8,20
8,one,C,foo,9,21
9,one,A,bar,10,22


In [229]:
# make the pivot table
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,10.0,1.0
one,B,5.0,2.0
one,C,6.0,9.0
three,A,4.0,
three,B,,8.0
three,C,12.0,
two,A,,7.0
two,B,11.0,
two,C,,3.0


#### Time Series

In [230]:
# create the DatetimeIndex object using pandas date_range() function:
# 100 timestamps one second apart, starting at midnight on 2012-01-01
# NOTE(review): recent pandas releases prefer the lowercase alias 's' over 'S' - confirm for your version
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [231]:
# create a Series object filled with random integers from 0 up to 499
# (the upper bound 500 passed to randint is exclusive), one value per timestamp
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

2012-01-01 00:00:00    234
2012-01-01 00:00:01    159
2012-01-01 00:00:02    499
2012-01-01 00:00:03    142
2012-01-01 00:00:04    226
2012-01-01 00:00:05    315
2012-01-01 00:00:06    174
2012-01-01 00:00:07      1
2012-01-01 00:00:08    166
2012-01-01 00:00:09    217
2012-01-01 00:00:10    426
2012-01-01 00:00:11    322
2012-01-01 00:00:12    254
2012-01-01 00:00:13    179
2012-01-01 00:00:14    446
2012-01-01 00:00:15    377
2012-01-01 00:00:16    203
2012-01-01 00:00:17    315
2012-01-01 00:00:18     56
2012-01-01 00:00:19    188
2012-01-01 00:00:20     53
2012-01-01 00:00:21    337
2012-01-01 00:00:22    453
2012-01-01 00:00:23    199
2012-01-01 00:00:24    342
2012-01-01 00:00:25    454
2012-01-01 00:00:26    467
2012-01-01 00:00:27    255
2012-01-01 00:00:28    443
2012-01-01 00:00:29    195
                      ... 
2012-01-01 00:01:10     51
2012-01-01 00:01:11    149
2012-01-01 00:01:12    362
2012-01-01 00:01:13    144
2012-01-01 00:01:14      0
2012-01-01 00:01:15     75
2

In [232]:
# localize to a timezone: label the timezone-naive timestamps as UTC
# (the clock times stay the same; they only gain a +00:00 offset)
ts_utc = ts.tz_localize('UTC')
ts_utc

2012-01-01 00:00:00+00:00    234
2012-01-01 00:00:01+00:00    159
2012-01-01 00:00:02+00:00    499
2012-01-01 00:00:03+00:00    142
2012-01-01 00:00:04+00:00    226
2012-01-01 00:00:05+00:00    315
2012-01-01 00:00:06+00:00    174
2012-01-01 00:00:07+00:00      1
2012-01-01 00:00:08+00:00    166
2012-01-01 00:00:09+00:00    217
2012-01-01 00:00:10+00:00    426
2012-01-01 00:00:11+00:00    322
2012-01-01 00:00:12+00:00    254
2012-01-01 00:00:13+00:00    179
2012-01-01 00:00:14+00:00    446
2012-01-01 00:00:15+00:00    377
2012-01-01 00:00:16+00:00    203
2012-01-01 00:00:17+00:00    315
2012-01-01 00:00:18+00:00     56
2012-01-01 00:00:19+00:00    188
2012-01-01 00:00:20+00:00     53
2012-01-01 00:00:21+00:00    337
2012-01-01 00:00:22+00:00    453
2012-01-01 00:00:23+00:00    199
2012-01-01 00:00:24+00:00    342
2012-01-01 00:00:25+00:00    454
2012-01-01 00:00:26+00:00    467
2012-01-01 00:00:27+00:00    255
2012-01-01 00:00:28+00:00    443
2012-01-01 00:00:29+00:00    195
          

In [233]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern')

2011-12-31 19:00:00-05:00    234
2011-12-31 19:00:01-05:00    159
2011-12-31 19:00:02-05:00    499
2011-12-31 19:00:03-05:00    142
2011-12-31 19:00:04-05:00    226
2011-12-31 19:00:05-05:00    315
2011-12-31 19:00:06-05:00    174
2011-12-31 19:00:07-05:00      1
2011-12-31 19:00:08-05:00    166
2011-12-31 19:00:09-05:00    217
2011-12-31 19:00:10-05:00    426
2011-12-31 19:00:11-05:00    322
2011-12-31 19:00:12-05:00    254
2011-12-31 19:00:13-05:00    179
2011-12-31 19:00:14-05:00    446
2011-12-31 19:00:15-05:00    377
2011-12-31 19:00:16-05:00    203
2011-12-31 19:00:17-05:00    315
2011-12-31 19:00:18-05:00     56
2011-12-31 19:00:19-05:00    188
2011-12-31 19:00:20-05:00     53
2011-12-31 19:00:21-05:00    337
2011-12-31 19:00:22-05:00    453
2011-12-31 19:00:23-05:00    199
2011-12-31 19:00:24-05:00    342
2011-12-31 19:00:25-05:00    454
2011-12-31 19:00:26-05:00    467
2011-12-31 19:00:27-05:00    255
2011-12-31 19:00:28-05:00    443
2011-12-31 19:00:29-05:00    195
          

#### Stopped at 'Categoricals'