# 10 Minutes to Pandas Tutorial
From pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) Tutorial.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [3]:
# create a pandas Series
# here labels are 0 to 5
# notice it is default type float64
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# other pandas objects, like date_range() make an ndarray for you
# date_range() object returns a filled-in DatetimeIndex object
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [5]:
# pandas DataFrame() object creates a dataframe for me
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.915844,0.432312,0.117305,0.757036
2013-01-02,1.772771,-2.589984,0.456796,2.179586
2013-01-03,-1.13223,0.0477,-0.222861,-0.697437
2013-01-04,-1.165251,1.20752,-1.165081,-0.319259
2013-01-05,2.13975,0.493247,0.374598,1.065821
2013-01-06,-1.425911,0.431814,-0.126879,1.836498


In [6]:
# reorder the columns
df[['D','C','B','A']]

Unnamed: 0,D,C,B,A
2013-01-01,0.757036,0.117305,0.432312,0.915844
2013-01-02,2.179586,0.456796,-2.589984,1.772771
2013-01-03,-0.697437,-0.222861,0.0477,-1.13223
2013-01-04,-0.319259,-1.165081,1.20752,-1.165251
2013-01-05,1.065821,0.374598,0.493247,2.13975
2013-01-06,1.836498,-0.126879,0.431814,-1.425911


In [7]:
# put order back
df[['A','B','C','D']]

Unnamed: 0,A,B,C,D
2013-01-01,0.915844,0.432312,0.117305,0.757036
2013-01-02,1.772771,-2.589984,0.456796,2.179586
2013-01-03,-1.13223,0.0477,-0.222861,-0.697437
2013-01-04,-1.165251,1.20752,-1.165081,-0.319259
2013-01-05,2.13975,0.493247,0.374598,1.065821
2013-01-06,-1.425911,0.431814,-0.126879,1.836498


In [5]:
# use DataFrame() object with a Python dict to create the dataframe
df2 = pd.DataFrame({'A':1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1,index=list('ABCD'),dtype='float32'),
                    # notice dict item C creates my index labels
                   'D': np.array([3] * 4,dtype='int32'),
                    # D is a numpy array with number 3 given 4 times
                   'E': pd.Categorical(["test","train","test","train"]),
                   'F': 'foo',
                   'G': [1,2,3,4]})
                    # all dict values must have same number of items
                    # or be a item (value which is copied 4 times)
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [6]:
# show the data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [7]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3


In [8]:
# see bottom n rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [9]:
# return index (rows) which is an immutable ndarray
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# display columns which is an immutable ndarray
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [11]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [12]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.023787,0.195825,-0.169923,0.044852
std,0.504992,1.404158,0.927274,0.70891
min,-0.684051,-1.222262,-1.648285,-0.806102
25%,-0.406577,-0.790525,-0.488961,-0.582262
50%,0.044584,-0.280429,-0.135477,0.190421
75%,0.305109,1.177509,0.262286,0.58718
max,0.617173,2.255137,1.111617,0.814761


In [13]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.363031,-0.528043,-0.042177,0.617173,0.131346,-0.684051
B,-0.622278,-1.222262,0.061421,2.255137,1.549538,-0.846607
C,-0.578195,1.111617,0.366279,-1.648285,-0.22126,-0.049693
D,0.294653,0.086189,-0.806102,0.684689,-0.805079,0.814761


In [14]:
# sort by an axis (by rows or by columns)
df.sort_index(axis=1, ascending=False)
# axis=1 sorts by row labels, axis=0 by column labels
# ascending=False sorts in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.294653,-0.578195,-0.622278,0.363031
2013-01-02,0.086189,1.111617,-1.222262,-0.528043
2013-01-03,-0.806102,0.366279,0.061421,-0.042177
2013-01-04,0.684689,-1.648285,2.255137,0.617173
2013-01-05,-0.805079,-0.22126,1.549538,0.131346
2013-01-06,0.814761,-0.049693,-0.846607,-0.684051


In [15]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-04,0.617173,2.255137,-1.648285,0.684689


In [16]:
# Python / Numpy expressions for selecting and setting are fine, but 
# for production code use pandas data access methods:
# .at, .iat, .loc, .iloc and .ix

In [17]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [18]:
# Selecting a single column, which yields a Series, equivalent to df.A
# both are a pandas Series
df['A']

2013-01-01    0.363031
2013-01-02   -0.528043
2013-01-03   -0.042177
2013-01-04    0.617173
2013-01-05    0.131346
2013-01-06   -0.684051
Freq: D, Name: A, dtype: float64

In [19]:
df.A

2013-01-01    0.363031
2013-01-02   -0.528043
2013-01-03   -0.042177
2013-01-04    0.617173
2013-01-05    0.131346
2013-01-06   -0.684051
Freq: D, Name: A, dtype: float64

In [20]:
# select via [] which slices at indices
# 0th (inclusive) to the 2th (exclusive) row (so, row 0 and row 1)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189


In [21]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189


In [22]:
# selection by label, returns a pandas Series
df.loc[dates[0]]

A    0.363031
B   -0.622278
C   -0.578195
D    0.294653
Name: 2013-01-01 00:00:00, dtype: float64

In [23]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [24]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [25]:
# returns first row from the df dataframe, a pandas Series
df.loc[dates[0]]

A    0.363031
B   -0.622278
C   -0.578195
D    0.294653
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
# here the ':' indicates all rows, and the list with ['A','B'] indicate columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.363031,-0.622278
2013-01-02,-0.528043,-1.222262
2013-01-03,-0.042177,0.061421
2013-01-04,0.617173,2.255137
2013-01-05,0.131346,1.549538
2013-01-06,-0.684051,-0.846607


In [27]:
# same exact selection but using index labels
# note, index (rows) selection is inclusive
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,0.363031,-0.622278
2013-01-02,-0.528043,-1.222262
2013-01-03,-0.042177,0.061421
2013-01-04,0.617173,2.255137
2013-01-05,0.131346,1.549538
2013-01-06,-0.684051,-0.846607


In [28]:
# just from a single row and columns A and B returns a pandas Series
df.loc['20130102', ['A','B']]

A   -0.528043
B   -1.222262
Name: 2013-01-02 00:00:00, dtype: float64

In [29]:
# get a single value (scalar)
df.loc[dates[0],'A']

0.3630307525434972

In [45]:
# same exact value but using a specific label
df.loc['20130101','A']

0.3630307525434972

In [46]:
# .at is same as above
df.at[dates[0],'A']

0.3630307525434972

In [47]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [48]:
# show row at the 3th row
df.iloc[3]

A    0.617173
B    2.255137
C   -1.648285
D    0.684689
Name: 2013-01-04 00:00:00, dtype: float64

In [49]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.617173,2.255137
2013-01-05,0.131346,1.549538


In [50]:
# by lists of integer position locations
# shows the 1th, 2th and 4th rows, 0th and 2th columns
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.528043,1.111617
2013-01-03,-0.042177,0.366279
2013-01-05,0.131346,-0.22126


In [51]:
# for slicing columns explicitly
# all rows, 1th (inclusive) to 3th (exclusiv) columns
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.622278,-0.578195
2013-01-02,-1.222262,1.111617
2013-01-03,0.061421,0.366279
2013-01-04,2.255137,-1.648285
2013-01-05,1.549538,-0.22126
2013-01-06,-0.846607,-0.049693


In [52]:
# explicitly get a value, returns a float64
df.iloc[1,2]

1.1116171292019339

In [53]:
# fast access to a scalar(same effect as above method), returns a float64
df.iat[1,1]

-1.2222617620433931

In [54]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [55]:
# some boolean indexing
# "where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079


In [56]:
# "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,,,0.294653
2013-01-02,,,1.111617,0.086189
2013-01-03,,0.061421,0.366279,
2013-01-04,0.617173,2.255137,,0.684689
2013-01-05,0.131346,1.549538,,
2013-01-06,,,,0.814761


In [57]:
# filter through the dataframe using isin()

# make a copy of the dataframe called df2
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2i

NameError: name 'df2i' is not defined

In [58]:
# referencing column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

# !!!I'd like to know how the slice is made here. Question
# posted at: http://stackoverflow.com/questions/41733696/how-does-pandas-use-a-series-object-to-slice-a-data-frame

Unnamed: 0,A,B,C,D,E
2013-01-01,0.363031,-0.622278,-0.578195,0.294653,one
2013-01-02,-0.528043,-1.222262,1.111617,0.086189,one
2013-01-05,0.131346,1.549538,-0.22126,-0.805079,four


In [59]:
# the contents of df2[] below are a pandas Series of type bool
# wherever it is True, the corresponding label's row is returned
df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

##### Setting values in a Data Frame

In [60]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.363031,-0.622278,-0.578195,0.294653
2013-01-02,-0.528043,-1.222262,1.111617,0.086189
2013-01-03,-0.042177,0.061421,0.366279,-0.806102
2013-01-04,0.617173,2.255137,-1.648285,0.684689
2013-01-05,0.131346,1.549538,-0.22126,-0.805079
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761


In [61]:
# make a new Series that I can add to the dataframe
# !!! REMEMBER, a dataframe can be thought of as a dict of Series
# where the Series labels form a union if redundant
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [62]:
# now add it to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is Nan
# 2013-01-07 is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.363031,-0.622278,-0.578195,0.294653,
2013-01-02,-0.528043,-1.222262,1.111617,0.086189,1.0
2013-01-03,-0.042177,0.061421,0.366279,-0.806102,2.0
2013-01-04,0.617173,2.255137,-1.648285,0.684689,3.0
2013-01-05,0.131346,1.549538,-0.22126,-0.805079,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761,5.0


In [63]:
# now set values by label index and column label
df.at[dates[0], 'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.363031,-0.622278,-0.578195,0.294653,0.0
2013-01-02,-0.528043,-1.222262,1.111617,0.086189,1.0
2013-01-03,-0.042177,0.061421,0.366279,-0.806102,2.0
2013-01-04,0.617173,2.255137,-1.648285,0.684689,3.0
2013-01-05,0.131346,1.549538,-0.22126,-0.805079,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761,5.0


In [64]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,0.294653,0.0
2013-01-02,-0.528043,-1.222262,1.111617,0.086189,1.0
2013-01-03,-0.042177,0.061421,0.366279,-0.806102,2.0
2013-01-04,0.617173,2.255137,-1.648285,0.684689,3.0
2013-01-05,0.131346,1.549538,-0.22126,-0.805079,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,0.814761,5.0


In [65]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0
2013-01-05,0.131346,1.549538,-0.22126,5,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,5,5.0


In [66]:
# conduct a where operation to replace any value greater
# than zero with its inverse
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,-5,0.0
2013-01-02,-0.528043,-1.222262,-1.111617,-5,-1.0
2013-01-03,-0.042177,-0.061421,-0.366279,-5,-2.0
2013-01-04,-0.617173,-2.255137,-1.648285,-5,-3.0
2013-01-05,-0.131346,-1.549538,-0.22126,-5,-4.0
2013-01-06,-0.684051,-0.846607,-0.049693,-5,-5.0


##### Missing Data

In [67]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0
2013-01-05,0.131346,1.549538,-0.22126,5,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,5,5.0


In [68]:
# reindex allow you to change/add/delete the index of a specified axis

# reindexes to only include 0th, 1th, 2th, 3th index (rows)
# uses same columns but adds one call 'E' that initially has NaN in all
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.622278,-0.578195,5,0.0,
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0,
2013-01-03,-0.042177,0.061421,0.366279,5,2.0,
2013-01-04,0.617173,2.255137,-1.648285,5,3.0,


In [69]:
# change the 0th and 1th values to 1.0
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice enpoint on rows is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.622278,-0.578195,5,0.0,1.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0,
2013-01-04,0.617173,2.255137,-1.648285,5,3.0,


In [70]:
# drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.622278,-0.578195,5,0.0,1.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0,1.0


In [71]:
# fill in missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.622278,-0.578195,5,0.0,1.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0,5.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0,5.0


In [72]:
# get boolean mask where values are nan
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


##### Operations

In [73]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0
2013-01-05,0.131346,1.549538,-0.22126,5,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,5,5.0


In [74]:
# the mean of each column
df.mean()

A   -0.084292
B    0.195825
C   -0.169923
D    5.000000
F    2.500000
dtype: float64

In [75]:
# mean across rows
df.mean(1)

2013-01-01    0.759905
2013-01-02    1.072262
2013-01-03    1.477104
2013-01-04    1.844805
2013-01-05    2.091925
2013-01-06    1.683930
Freq: D, dtype: float64

In [76]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [77]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [78]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0
2013-01-05,0.131346,1.549538,-0.22126,5,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,5,5.0


In [79]:
# deletes the values in Series s from the corresponding
# index from df
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-1.622278,-1.578195,4.0,-1.0
2013-01-02,-3.528043,-4.222262,-1.888383,2.0,-2.0
2013-01-03,-5.042177,-4.938579,-4.633721,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-5.868654,-4.450462,-6.22126,-1.0,-2.0
2013-01-06,-8.684051,-8.846607,-8.049693,-3.0,-3.0


In [80]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.222262,1.111617,5,1.0
2013-01-03,-0.042177,0.061421,0.366279,5,2.0
2013-01-04,0.617173,2.255137,-1.648285,5,3.0
2013-01-05,0.131346,1.549538,-0.22126,5,4.0
2013-01-06,-0.684051,-0.846607,-0.049693,5,5.0


In [81]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.622278,-0.578195,5,0.0
2013-01-02,-0.528043,-1.84454,0.533422,10,1.0
2013-01-03,-0.57022,-1.783119,0.899701,15,3.0
2013-01-04,0.046953,0.472018,-0.748584,20,6.0
2013-01-05,0.178298,2.021556,-0.969844,25,10.0
2013-01-06,-0.505753,1.174949,-1.019537,30,15.0


In [84]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

0    4
1    1
2    5
3    5
4    4
5    3
6    2
7    4
8    1
9    5
dtype: int64

In [85]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

5    3
4    3
1    2
3    1
2    1
dtype: int64

##### String Methods

In [86]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

##### Merge

In [87]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-1.406932,0.342999,0.31005,-0.116854
1,0.717276,-2.134852,0.163821,-0.704848
2,-0.76515,0.680493,-0.263261,-0.941271
3,-1.504529,-1.440949,0.729434,-0.358079
4,-1.323547,-0.001959,0.014809,0.26603
5,0.074848,0.180547,0.079596,0.33189
6,1.1146,2.055887,1.160615,0.622254
7,-0.408884,0.401451,-2.587313,1.315982
8,0.216742,-0.284501,0.079818,-1.068732
9,-0.434299,-0.072437,0.373559,0.274928


In [88]:
# break it into pieces via Python slicing - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,-1.406932,0.342999,0.31005,-0.116854
1,0.717276,-2.134852,0.163821,-0.704848
2,-0.76515,0.680493,-0.263261,-0.941271
3,-1.504529,-1.440949,0.729434,-0.358079
4,-1.323547,-0.001959,0.014809,0.26603
5,0.074848,0.180547,0.079596,0.33189
6,1.1146,2.055887,1.160615,0.622254
7,-0.408884,0.401451,-2.587313,1.315982
8,0.216742,-0.284501,0.079818,-1.068732
9,-0.434299,-0.072437,0.373559,0.274928


##### Join

In [89]:
# SQL style merges
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [90]:
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [91]:
# on='key' matches a value with others given the same key
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [92]:
# OR...
left = pd.DataFrame({'key':['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [93]:
right

Unnamed: 0,key,rval
0,foo,3
1,bar,4


In [94]:
# here, because there is only one matching 'foo', 1 merges with 4
# and 2 merges with 5
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


##### Append rows to a dataframe

In [95]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# which is just a list of 8 lists with 4 items in each
# contents will be randomly generated floats from the z dist
print(np.random.randn(8, 4))

[[-0.85795499 -1.19183388  0.21090411 -0.84788193]
 [ 1.41043802 -0.75200721 -1.39380749  1.73056149]
 [-2.472078    1.83537708  0.16682537 -0.91648941]
 [ 0.28366086 -0.21021403  1.06777     0.91505396]
 [-1.5033105  -1.31238037  0.56754998 -0.42982238]
 [-0.22809996 -0.01364287  0.40285641 -0.50402855]
 [ 0.0231636   1.2904533   0.49788783 -1.01877528]
 [-0.10261184  0.35538098 -1.58190052 -0.54977033]]


In [96]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,0.992703,-0.38726,0.065098,1.636055
1,0.007157,-0.64554,-0.56914,-0.031072
2,-0.857926,-0.104764,-0.313879,1.283316
3,-1.442374,-0.088353,3.160009,0.474673
4,-1.411477,0.929003,0.511171,0.707991
5,0.44671,-0.300139,-0.824393,-0.559144
6,-0.581578,1.344484,0.412607,-0.548701
7,0.259862,1.112726,-0.303854,-0.607923


In [97]:
# retreive the 3th row
s = df.iloc[3]
s

A   -1.442374
B   -0.088353
C    3.160009
D    0.474673
Name: 3, dtype: float64

In [98]:
# copy into a new appended row in dataframe
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.992703,-0.38726,0.065098,1.636055
1,0.007157,-0.64554,-0.56914,-0.031072
2,-0.857926,-0.104764,-0.313879,1.283316
3,-1.442374,-0.088353,3.160009,0.474673
4,-1.411477,0.929003,0.511171,0.707991
5,0.44671,-0.300139,-0.824393,-0.559144
6,-0.581578,1.344484,0.412607,-0.548701
7,0.259862,1.112726,-0.303854,-0.607923
8,-1.442374,-0.088353,3.160009,0.474673


##### Grouping

In [100]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.345191,1.811288
1,bar,one,1.217849,0.620877
2,foo,two,1.709822,0.0367
3,bar,three,0.077352,0.049278
4,foo,two,0.415581,-0.317812
5,bar,two,1.924399,-1.934832
6,foo,one,0.992111,0.497744
7,foo,three,0.2257,0.352213


In [101]:
# group items by unique items in column 'A', then sum them up to create new value
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,3.219601,-1.264676
foo,3.688405,2.380133


In [102]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.217849,0.620877
bar,three,0.077352,0.049278
bar,two,1.924399,-1.934832
foo,one,1.337302,2.309032
foo,three,0.2257,0.352213
foo,two,2.125403,-0.281112


##### Reshaping

In [103]:
# zip(*[[list of n lenght],[list of n length]]) converts things into tuples.
# list() adds everything to a list

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

#first I make a list of tuples
tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
     ['one','two','one','two','one','two','one','two']]))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [104]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [105]:
# now Mi make a dataframe with random numbers and the multi index
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.794189,-1.218326
bar,two,0.151023,0.865103
baz,one,-1.808528,0.233589
baz,two,-0.455102,-0.197476
foo,one,-2.030309,-0.070583
foo,two,1.324822,-1.441124
qux,one,0.266335,1.514752
qux,two,-2.036072,-0.22577


In [106]:
# I can slice it too if I want
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.794189,-1.218326
bar,two,0.151023,0.865103
baz,one,-1.808528,0.233589
baz,two,-0.455102,-0.197476


In [107]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

first  second   
bar    one     A   -1.794189
               B   -1.218326
       two     A    0.151023
               B    0.865103
baz    one     A   -1.808528
               B    0.233589
       two     A   -0.455102
               B   -0.197476
dtype: float64

In [108]:
# and of course you can unstack it
# note, this is done to multiple levels with a number inside the parens
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.794189,-1.218326
bar,two,0.151023,0.865103
baz,one,-1.808528,0.233589
baz,two,-0.455102,-0.197476


##### Pivot tables

In [109]:
# create my dataframe using a dictionary
df = pd.DataFrame({'A': ['one','one','two','three'] * 3,
                  'B': ['A','B','C'] * 4,
                  'C': ['foo','foo','foo','bar','bar','bar'] * 2,
                  'D': [1,2,3,4,5,6,7,8,9,10,11,12],
                  'E': [13,14,15,16,17,18,19,20,21,22,23,24]})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1,13
1,one,B,foo,2,14
2,two,C,foo,3,15
3,three,A,bar,4,16
4,one,B,bar,5,17
5,one,C,bar,6,18
6,two,A,foo,7,19
7,three,B,foo,8,20
8,one,C,foo,9,21
9,one,A,bar,10,22


In [110]:
# make the pivot table
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,10.0,1.0
one,B,5.0,2.0
one,C,6.0,9.0
three,A,4.0,
three,B,,8.0
three,C,12.0,
two,A,,7.0
two,B,11.0,
two,C,,3.0


##### Time Series

In [119]:
# create the date DateTimeIndex object using pandas date_range() function
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [117]:
# create a Series object filled with random numbers between 0 and 500
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()

2012-01-01 00:00:00    314
2012-01-01 00:00:01    369
2012-01-01 00:00:02    254
2012-01-01 00:00:03    269
2012-01-01 00:00:04    486
Freq: S, dtype: int64

In [116]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc.head()

2012-01-01 00:00:00+00:00    324
2012-01-01 00:00:01+00:00    497
2012-01-01 00:00:02+00:00    298
2012-01-01 00:00:03+00:00    433
2012-01-01 00:00:04+00:00    381
Freq: S, dtype: int64

In [115]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern').head()

2011-12-31 19:00:00-05:00    324
2011-12-31 19:00:01-05:00    497
2011-12-31 19:00:02-05:00    298
2011-12-31 19:00:03-05:00    433
2011-12-31 19:00:04-05:00    381
Freq: S, dtype: int64

# Stopped at 'Categoricals'