# 10 Minutes to Pandas Tutorial
From pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) Tutorial.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [3]:
# create a pandas Series
# here labels are 0 to 5
# notice the dtype is float64 because the list contains np.nan (a float)
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# pandas helper functions such as date_range() build index objects for you
# date_range() returns a DatetimeIndex: here 6 consecutive days starting 2013-01-01
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [5]:
# pandas DataFrame() object creates a dataframe for me
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [6]:
# reorder the columns
df[['D','C','B','A']]

Unnamed: 0,D,C,B,A
2013-01-01,0.721031,0.766115,-0.252391,0.727515
2013-01-02,-0.357798,1.832172,-0.6753,-2.162846
2013-01-03,-1.092772,0.822476,0.741309,0.256676
2013-01-04,1.533295,-1.080235,-1.52727,-1.187702
2013-01-05,-0.102859,0.309604,-0.395123,0.515146
2013-01-06,0.624993,-1.921772,0.770467,-0.027531


In [7]:
# put order back
df[['A','B','C','D']]

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [8]:
# build the dataframe from a dict: every key becomes a column label
# each value must either have 4 items or be a single scalar,
# in which case the scalar is repeated on every row
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    # the index of this Series (A, B, C, D) supplies the row labels
    C=pd.Series(1, index=list('ABCD'), dtype='float32'),
    # an int32 numpy array holding the number 3 four times
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
    G=[1, 2, 3, 4],
))
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [9]:
# show the data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [10]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3


In [11]:
# see bottom n rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [12]:
# return the row labels, held in an immutable pandas Index object
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
# display the column labels, also held in an immutable pandas Index object
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [14]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [15]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.313124,-0.223051,0.121393,0.220981
std,1.127967,0.877844,1.37752,0.927496
min,-2.162846,-1.52727,-1.921772,-1.092772
25%,-0.897659,-0.605256,-0.732775,-0.294064
50%,0.114572,-0.323757,0.537859,0.261067
75%,0.450528,0.492884,0.808386,0.697021
max,0.727515,0.770467,1.832172,1.533295


In [16]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.727515,-2.162846,0.256676,-1.187702,0.515146,-0.027531
B,-0.252391,-0.6753,0.741309,-1.52727,-0.395123,0.770467
C,0.766115,1.832172,0.822476,-1.080235,0.309604,-1.921772
D,0.721031,-0.357798,-1.092772,1.533295,-0.102859,0.624993


In [17]:
# sort by an axis (by rows or by columns)
df.sort_index(axis=1, ascending=False)
# axis=1 sorts by column labels, axis=0 (the default) sorts by row labels
# ascending=False sorts in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.721031,0.766115,-0.252391,0.727515
2013-01-02,-0.357798,1.832172,-0.6753,-2.162846
2013-01-03,-1.092772,0.822476,0.741309,0.256676
2013-01-04,1.533295,-1.080235,-1.52727,-1.187702
2013-01-05,-0.102859,0.309604,-0.395123,0.515146
2013-01-06,0.624993,-1.921772,0.770467,-0.027531


In [18]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [19]:
# Python / Numpy expressions for selecting and setting are fine, but 
# for production code use pandas data access methods:
# .at, .iat, .loc and .iloc (.ix is deprecated and was removed in pandas 1.0)

In [20]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [21]:
# Selecting a single column, which yields a Series, equivalent to df.A
# both are a pandas Series
df['A']

2013-01-01    0.727515
2013-01-02   -2.162846
2013-01-03    0.256676
2013-01-04   -1.187702
2013-01-05    0.515146
2013-01-06   -0.027531
Freq: D, Name: A, dtype: float64

In [22]:
df.A

2013-01-01    0.727515
2013-01-02   -2.162846
2013-01-03    0.256676
2013-01-04   -1.187702
2013-01-05    0.515146
2013-01-06   -0.027531
Freq: D, Name: A, dtype: float64

In [23]:
# select via [] which slices rows by integer position
# from position 0 (inclusive) to position 2 (exclusive), so rows 0 and 1
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798


In [24]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798


In [25]:
# selection by label, returns a pandas Series
df.loc[dates[0]]

A    0.727515
B   -0.252391
C    0.766115
D    0.721031
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [27]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [28]:
# returns first row from the df dataframe, a pandas Series
df.loc[dates[0]]

A    0.727515
B   -0.252391
C    0.766115
D    0.721031
Name: 2013-01-01 00:00:00, dtype: float64

In [29]:
# here the ':' indicates all rows, and the list with ['A','B'] indicate columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.727515,-0.252391
2013-01-02,-2.162846,-0.6753
2013-01-03,0.256676,0.741309
2013-01-04,-1.187702,-1.52727
2013-01-05,0.515146,-0.395123
2013-01-06,-0.027531,0.770467


In [30]:
# same exact selection but using index labels
# note, index (rows) selection is inclusive
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,0.727515,-0.252391
2013-01-02,-2.162846,-0.6753
2013-01-03,0.256676,0.741309
2013-01-04,-1.187702,-1.52727
2013-01-05,0.515146,-0.395123
2013-01-06,-0.027531,0.770467


In [31]:
# just from a single row and columns A and B returns a pandas Series
df.loc['20130102', ['A','B']]

A   -2.162846
B   -0.675300
Name: 2013-01-02 00:00:00, dtype: float64

In [32]:
# get a single value (scalar)
df.loc[dates[0],'A']

0.72751486594547443

In [33]:
# same exact value but using a specific label
df.loc['20130101','A']

0.72751486594547443

In [34]:
# .at is same as above
df.at[dates[0],'A']

0.72751486594547443

In [35]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [36]:
# show the row at integer position 3 (the fourth row)
df.iloc[3]

A   -1.187702
B   -1.527270
C   -1.080235
D    1.533295
Name: 2013-01-04 00:00:00, dtype: float64

In [37]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-1.187702,-1.52727
2013-01-05,0.515146,-0.395123


In [38]:
# by lists of integer position locations
# shows the rows at positions 1, 2 and 4 and the columns at positions 0 and 2
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-2.162846,1.832172
2013-01-03,0.256676,0.822476
2013-01-05,0.515146,0.309604


In [39]:
# for slicing columns explicitly
# all rows, columns from position 1 (inclusive) to position 3 (exclusive)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-0.252391,0.766115
2013-01-02,-0.6753,1.832172
2013-01-03,0.741309,0.822476
2013-01-04,-1.52727,-1.080235
2013-01-05,-0.395123,0.309604
2013-01-06,0.770467,-1.921772


In [40]:
# explicitly get a value, returns a float64
df.iloc[1,2]

1.8321721857906972

In [41]:
# fast scalar access by integer position (like .iloc above, but scalar-only), returns a float64
# note this reads row 1, column 1, not the row 1, column 2 value read above
df.iat[1,1]

-0.67529999414161002

In [42]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [43]:
# some boolean indexing
# "where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-05,0.515146,-0.395123,0.309604,-0.102859


In [44]:
# "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,,0.766115,0.721031
2013-01-02,,,1.832172,
2013-01-03,0.256676,0.741309,0.822476,
2013-01-04,,,,1.533295
2013-01-05,0.515146,,0.309604,
2013-01-06,,0.770467,,0.624993


In [45]:
# filter through the dataframe using isin()

# make a copy of the dataframe called df2
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.727515,-0.252391,0.766115,0.721031,one
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798,one
2013-01-03,0.256676,0.741309,0.822476,-1.092772,two
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295,three
2013-01-05,0.515146,-0.395123,0.309604,-0.102859,four
2013-01-06,-0.027531,0.770467,-1.921772,0.624993,three


In [46]:
# referencing column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

# !!!I'd like to know how the slice is made here. Question
# posted at: http://stackoverflow.com/questions/41733696/how-does-pandas-use-a-series-object-to-slice-a-data-frame

Unnamed: 0,A,B,C,D,E
2013-01-01,0.727515,-0.252391,0.766115,0.721031,one
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798,one
2013-01-05,0.515146,-0.395123,0.309604,-0.102859,four


In [47]:
# the contents of df2[] below are a pandas Series of type bool
# wherever it is True, the corresponding label's row is returned
df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

##### Setting values in a Data Frame

In [48]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.727515,-0.252391,0.766115,0.721031
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798
2013-01-03,0.256676,0.741309,0.822476,-1.092772
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295
2013-01-05,0.515146,-0.395123,0.309604,-0.102859
2013-01-06,-0.027531,0.770467,-1.921772,0.624993


In [49]:
# make a new Series that I can add to the dataframe
# !!! REMEMBER, a dataframe can be thought of as a dict of Series
# that share one index; a Series added as a column is aligned to that index by label
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [50]:
# now add it to df dataframe as column 'F'
# s1 is aligned to df's index by label: it has no entry for 2013-01-01, so F is NaN there,
# and its 2013-01-07 entry is dropped because df has no such row
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.727515,-0.252391,0.766115,0.721031,
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798,1.0
2013-01-03,0.256676,0.741309,0.822476,-1.092772,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295,3.0
2013-01-05,0.515146,-0.395123,0.309604,-0.102859,4.0
2013-01-06,-0.027531,0.770467,-1.921772,0.624993,5.0


In [51]:
# now set values by label index and column label
df.at[dates[0], 'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.727515,-0.252391,0.766115,0.721031,0.0
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798,1.0
2013-01-03,0.256676,0.741309,0.822476,-1.092772,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295,3.0
2013-01-05,0.515146,-0.395123,0.309604,-0.102859,4.0
2013-01-06,-0.027531,0.770467,-1.921772,0.624993,5.0


In [52]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,0.721031,0.0
2013-01-02,-2.162846,-0.6753,1.832172,-0.357798,1.0
2013-01-03,0.256676,0.741309,0.822476,-1.092772,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,1.533295,3.0
2013-01-05,0.515146,-0.395123,0.309604,-0.102859,4.0
2013-01-06,-0.027531,0.770467,-1.921772,0.624993,5.0


In [53]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0
2013-01-05,0.515146,-0.395123,0.309604,5,4.0
2013-01-06,-0.027531,0.770467,-1.921772,5,5.0


In [54]:
# conduct a where operation to replace every value greater
# than zero with its negation (additive inverse), so no value stays positive
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,-0.766115,-5,0.0
2013-01-02,-2.162846,-0.6753,-1.832172,-5,-1.0
2013-01-03,-0.256676,-0.741309,-0.822476,-5,-2.0
2013-01-04,-1.187702,-1.52727,-1.080235,-5,-3.0
2013-01-05,-0.515146,-0.395123,-0.309604,-5,-4.0
2013-01-06,-0.027531,-0.770467,-1.921772,-5,-5.0


##### Missing Data

In [55]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0
2013-01-05,0.515146,-0.395123,0.309604,5,4.0
2013-01-06,-0.027531,0.770467,-1.921772,5,5.0


In [56]:
# reindex allows you to change/add/delete the index on a specified axis

# reindexes to only include the first four index labels (rows at positions 0 to 3)
# keeps the same columns but adds one called 'E' that initially holds NaN in every row
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.252391,0.766115,5,0.0,
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0,
2013-01-03,0.256676,0.741309,0.822476,5,2.0,
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0,


In [57]:
# set column E to 1 for the rows labelled dates[0] through dates[1]
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice the endpoint on rows is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.252391,0.766115,5,0.0,1.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0,
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0,


In [58]:
# drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.252391,0.766115,5,0.0,1.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0,1.0


In [59]:
# fill in missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,-0.252391,0.766115,5,0.0,1.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0,5.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0,5.0


In [60]:
# get boolean mask where values are nan
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


##### Operations

In [61]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0
2013-01-05,0.515146,-0.395123,0.309604,5,4.0
2013-01-06,-0.027531,0.770467,-1.921772,5,5.0


In [62]:
# the mean of each column
df.mean()

A   -0.434376
B   -0.223051
C    0.121393
D    5.000000
F    2.500000
dtype: float64

In [63]:
# mean across rows
df.mean(1)

2013-01-01    1.102745
2013-01-02    0.998805
2013-01-03    1.764092
2013-01-04    0.840959
2013-01-05    1.885925
2013-01-06    1.764233
Freq: D, dtype: float64

In [64]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [65]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [66]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0
2013-01-05,0.515146,-0.395123,0.309604,5,4.0
2013-01-06,-0.027531,0.770467,-1.921772,5,5.0


In [67]:
# subtract Series s from every column of df, aligning on the row index;
# the row where s is NaN (2013-01-04) becomes NaN in the result
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-1.252391,-0.233885,4.0,-1.0
2013-01-02,-5.162846,-3.6753,-1.167828,2.0,-2.0
2013-01-03,-4.743324,-4.258691,-4.177524,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-5.484854,-6.395123,-5.690396,-1.0,-2.0
2013-01-06,-8.027531,-7.229533,-9.921772,-3.0,-3.0


In [68]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.6753,1.832172,5,1.0
2013-01-03,0.256676,0.741309,0.822476,5,2.0
2013-01-04,-1.187702,-1.52727,-1.080235,5,3.0
2013-01-05,0.515146,-0.395123,0.309604,5,4.0
2013-01-06,-0.027531,0.770467,-1.921772,5,5.0


In [69]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.252391,0.766115,5,0.0
2013-01-02,-2.162846,-0.927691,2.598287,10,1.0
2013-01-03,-1.90617,-0.186382,3.420763,15,3.0
2013-01-04,-3.093872,-1.713652,2.340528,20,6.0
2013-01-05,-2.578726,-2.108775,2.650132,25,10.0
2013-01-06,-2.606257,-1.338308,0.72836,30,15.0


In [70]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

0    5
1    6
2    2
3    2
4    0
5    4
6    5
7    4
8    5
9    5
dtype: int64

In [71]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

5    4
4    2
2    2
6    1
0    1
dtype: int64

##### String Methods

In [72]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

##### Merge

In [73]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-1.300041,0.237477,0.26871,-0.703694
1,-0.754716,-0.676873,1.996214,1.455247
2,0.790318,-0.583483,0.117954,2.092209
3,-0.532003,0.693451,2.759444,-1.34838
4,-1.297474,-1.448724,-0.89554,0.458141
5,0.133318,-0.77474,0.267018,-0.177364
6,1.483012,-0.879678,-1.325054,0.617366
7,-0.797036,-1.360276,-0.285229,-0.107332
8,-0.843846,-0.641718,1.178792,-0.043386
9,-1.40348,0.692844,1.402061,0.901803


In [74]:
# break it into pieces via Python slicing - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,-1.300041,0.237477,0.26871,-0.703694
1,-0.754716,-0.676873,1.996214,1.455247
2,0.790318,-0.583483,0.117954,2.092209
3,-0.532003,0.693451,2.759444,-1.34838
4,-1.297474,-1.448724,-0.89554,0.458141
5,0.133318,-0.77474,0.267018,-0.177364
6,1.483012,-0.879678,-1.325054,0.617366
7,-0.797036,-1.360276,-0.285229,-0.107332
8,-0.843846,-0.641718,1.178792,-0.043386
9,-1.40348,0.692844,1.402061,0.901803


##### Join

In [75]:
# SQL style merges
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [76]:
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [77]:
# on='key' matches a value with others given the same key
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [78]:
# OR...
left = pd.DataFrame({'key':['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [79]:
right

Unnamed: 0,key,rval
0,foo,3
1,bar,4


In [80]:
# here each key appears exactly once on each side, so rows pair up one-to-one:
# 'foo' gives lval 1 with rval 3, and 'bar' gives lval 2 with rval 4
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


##### Append rows to a dataframe

In [81]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# (it prints like a list of 8 lists with 4 items in each)
# contents are floats drawn at random from the standard normal (z) distribution
print(np.random.randn(8, 4))

[[ 1.21385598 -0.02028812  0.06306844 -0.63703625]
 [ 0.62554572 -0.09417844 -0.30668918  1.11368153]
 [-0.61952177 -0.73234064 -0.7470764  -0.78285515]
 [-2.41935364 -0.36534354  1.05703472 -1.03894017]
 [ 0.56870499  0.51052538 -0.65828545 -0.28313294]
 [-1.25221917  0.90460067 -0.88453411  0.9192298 ]
 [-0.56615939  0.10113978 -0.33897329 -3.29582724]
 [ 1.65859022  1.47104261 -0.1625937  -0.66075555]]


In [82]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,-0.062376,-0.545103,-0.822478,1.157143
1,0.158378,0.481811,-0.476434,-0.725969
2,0.949163,1.651687,-0.195493,-0.48144
3,1.588213,0.845789,0.642966,-1.039444
4,0.56523,0.015662,-1.768665,-1.396207
5,1.309254,-0.691161,1.155984,0.333283
6,1.145553,-0.992009,-0.622469,-1.039835
7,0.602083,0.323218,1.241887,-1.451135


In [83]:
# retrieve the row at integer position 3 as a Series
s = df.iloc[3]
s

A    1.588213
B    0.845789
C    0.642966
D   -1.039444
Name: 3, dtype: float64

In [84]:
# copy the Series into a new row appended to the end of the dataframe
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so
# turn the Series into a one-row dataframe and concatenate it instead;
# ignore_index=True renumbers the rows 0..n-1, and df itself is not modified
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.062376,-0.545103,-0.822478,1.157143
1,0.158378,0.481811,-0.476434,-0.725969
2,0.949163,1.651687,-0.195493,-0.48144
3,1.588213,0.845789,0.642966,-1.039444
4,0.56523,0.015662,-1.768665,-1.396207
5,1.309254,-0.691161,1.155984,0.333283
6,1.145553,-0.992009,-0.622469,-1.039835
7,0.602083,0.323218,1.241887,-1.451135
8,1.588213,0.845789,0.642966,-1.039444


##### Grouping

In [85]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.555017,0.313362
1,bar,one,0.010808,-0.790416
2,foo,two,-0.712754,-0.481773
3,bar,three,0.20035,0.390229
4,foo,two,-0.289432,-0.677768
5,bar,two,-0.896616,-1.346923
6,foo,one,0.027958,1.103076
7,foo,three,-1.390875,1.811341


In [86]:
# group rows by the unique values in column 'A', then sum each group
# the numeric columns are selected explicitly: since pandas 2.0, sum() no longer
# drops the string column 'B' silently (it would concatenate the strings instead)
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.685458,-1.74711
foo,-2.92012,2.068238


In [87]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.010808,-0.790416
bar,three,0.20035,0.390229
bar,two,-0.896616,-1.346923
foo,one,-0.527059,1.416438
foo,three,-1.390875,1.811341
foo,two,-1.002186,-1.159542


In [91]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.555017,0.313362
1,bar,one,0.010808,-0.790416
2,foo,two,-0.712754,-0.481773
3,bar,three,0.20035,0.390229
4,foo,two,-0.289432,-0.677768
5,bar,two,-0.896616,-1.346923
6,foo,one,0.027958,1.103076
7,foo,three,-1.390875,1.811341


In [92]:
# group by and sum, this time passing the column itself (df.A) as the grouping key
# the numeric columns are selected explicitly: since pandas 2.0, sum() no longer
# drops the string column 'B' silently (it would concatenate the strings instead)
df.groupby(df.A)[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.685458,-1.74711
foo,-2.92012,2.068238


##### Filtering rows with a boolean mask (this is boolean indexing, not grouping)

In [184]:
df = pd.DataFrame({'A':[0,1,1,0],
                   'B': ['A','B','C','D']})
df

Unnamed: 0,A,B
0,0,A
1,1,B
2,1,C
3,0,D


In [185]:
# create a Series
df.A

0    0
1    1
2    1
3    0
Name: A, dtype: int64

In [186]:
# create a series of bool that matches a condition
df.A == 1

0    False
1     True
2     True
3    False
Name: A, dtype: bool

In [187]:
# create dataframe with a condition met
only_ones = df[df.A == 1]
only_ones

Unnamed: 0,A,B
1,1,B
2,1,C


##### Reshaping

In [188]:
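# zip(*[[list of length n],[list of length n]]) unpacks the outer list, so it is the same as
# zip(list1, list2): it pairs up the items position by position into tuples.
# list() collects the resulting tuples into a list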

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

# first, pair each outer label with its inner label to get a list of tuples
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [189]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [190]:
# now I make a dataframe with random numbers and the multi index
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406
foo,one,0.561558,0.421839
foo,two,-1.044868,-1.674899
qux,one,-1.7621,0.058622
qux,two,0.272999,-0.620191


In [191]:
# I can slice it too if I want
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406


In [192]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.057274
               B    1.373607
       two     A   -0.728370
               B   -0.070684
baz    one     A   -0.256057
               B   -0.318328
       two     A   -0.578671
               B   -2.307406
dtype: float64

In [193]:
# and of course you can unstack it
# note, unstack() moves the last index level into the columns by default;
# pass a level number (or name) inside the parens to unstack a different level
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406


##### Time Series

In [201]:
# create the date DateTimeIndex object using pandas date_range() function
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [202]:
# create a Series of random integers from 0 up to (but not including) 500
# one value per timestamp in rng, which becomes the index
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()

2012-01-01 00:00:00    409
2012-01-01 00:00:01    459
2012-01-01 00:00:02    414
2012-01-01 00:00:03    170
2012-01-01 00:00:04    455
Freq: S, dtype: int64

In [203]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc.head()

2012-01-01 00:00:00+00:00    409
2012-01-01 00:00:01+00:00    459
2012-01-01 00:00:02+00:00    414
2012-01-01 00:00:03+00:00    170
2012-01-01 00:00:04+00:00    455
Freq: S, dtype: int64

In [204]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern').head()

2011-12-31 19:00:00-05:00    409
2011-12-31 19:00:01-05:00    459
2011-12-31 19:00:02-05:00    414
2011-12-31 19:00:03-05:00    170
2011-12-31 19:00:04-05:00    455
Freq: S, dtype: int64

# Stopped doing tutorial at 'Categoricals'