# 10 Minutes to Pandas Tutorial
From the pandas [10 Minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) tutorial.

In [1]:
# import declarations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Object Creation

##### Making Pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [6]:
# make a pandas Series
# here labels are 0 to 5
# notice it is default type float64 and one value is numpy NaN
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [136]:
# make a DatetimeIndex of 6 consecutive days starting at 2013-01-01
# (it is used below as the row index of the dataframe)
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

##### Make Pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [8]:
# create DataFrame using DataFrame() object
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates (above) becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [9]:
# reorder columns
df[['D','C','B','A']]

Unnamed: 0,D,C,B,A
2013-01-01,-1.077556,-1.481339,-2.029136,0.150418
2013-01-02,0.9061,-0.639974,-0.861873,-0.637539
2013-01-03,-0.666769,0.275506,0.271093,-0.859411
2013-01-04,1.8405,-0.203266,-0.70465,-0.316656
2013-01-05,-2.945005,-0.759203,-2.045237,-0.874535
2013-01-06,-0.538173,1.0411,-0.051847,0.042315


In [10]:
# put order back
df[['A','B','C','D']]

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [11]:
# make DataFrame() object using a Python dict (one dict key per column)
df2 = pd.DataFrame({'A':1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1,index=list('ABCD'),dtype='float32'),
                    # notice the index of the Series in C supplies the row labels
                   'D': np.array([3] * 4,dtype='int32'),
                    # D is a numpy array with number 3 given 4 times
                   'E': pd.Categorical(["test","train","test","train"]),
                   'F': 'foo',
                   'G': [1,2,3,4]})
                    # all list-like dict values must have the same number of items;
                    # a single scalar value (A, B, F) is repeated for every row
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


### Viewing Data/Changing DataFrames

In [12]:
# show data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [13]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3


In [15]:
# see bottom n rows
df2.tail(n=2)

Unnamed: 0,A,B,C,D,E,F,G
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [16]:
# return the index (row labels), which is an immutable pandas Index object
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# display the column labels, which are also an immutable pandas Index object
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [18]:
# return column string, which is indexed starting at 0
df2.columns[0]

'A'

In [19]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [20]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.415901,-0.903608,-0.294529,-0.413484
std,0.446475,0.971275,0.878743,1.658375
min,-0.874535,-2.045237,-1.481339,-2.945005
25%,-0.803943,-1.73732,-0.729396,-0.974859
50%,-0.477098,-0.783261,-0.42162,-0.602471
75%,-0.047428,-0.215048,0.155813,0.545032
max,0.150418,0.271093,1.0411,1.8405


In [21]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.150418,-0.637539,-0.859411,-0.316656,-0.874535,0.042315
B,-2.029136,-0.861873,0.271093,-0.70465,-2.045237,-0.051847
C,-1.481339,-0.639974,0.275506,-0.203266,-0.759203,1.0411
D,-1.077556,0.9061,-0.666769,1.8405,-2.945005,-0.538173


In [22]:
# sort by the labels of an axis (row labels or column labels)
# axis=0 sorts by the row (index) labels, axis=1 sorts by the column labels
# ascending=False sorts in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.077556,-1.481339,-2.029136,0.150418
2013-01-02,0.9061,-0.639974,-0.861873,-0.637539
2013-01-03,-0.666769,0.275506,0.271093,-0.859411
2013-01-04,1.8405,-0.203266,-0.70465,-0.316656
2013-01-05,-2.945005,-0.759203,-2.045237,-0.874535
2013-01-06,-0.538173,1.0411,-0.051847,0.042315


In [23]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-06,0.042315,-0.051847,1.0411,-0.538173
2013-01-03,-0.859411,0.271093,0.275506,-0.666769


### Selecting in DataFrames using Pandas functions
Python / Numpy expressions for selecting and setting are fine, but for production code use the pandas data access methods: .at, .iat, .loc and .iloc (the older .ix indexer is deprecated and was removed in pandas 1.0).

##### Python/Numpy expressions

In [36]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [37]:
# Select a single column, which yields a Series
# equivalent to df.A
df['A']

2013-01-01    0.150418
2013-01-02   -0.637539
2013-01-03   -0.859411
2013-01-04   -0.316656
2013-01-05   -0.874535
2013-01-06    0.042315
Freq: D, Name: A, dtype: float64

In [38]:
# Select via [] which slices at indices
# 0th (inclusive) to the 2th (exclusive) row (so, row 0 and row 1)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061


In [39]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061


##### Pandas Data Access Methods: .loc and .at (label-based selection functions)

In [46]:
# .loc is label-based selection
# select by label, returns a pandas Series
df.loc[dates[0]]

A    0.150418
B   -2.029136
C   -1.481339
D   -1.077556
Name: 2013-01-01 00:00:00, dtype: float64

In [47]:
# select by label (same as above)
df.loc['2013-01-01']

A    0.150418
B   -2.029136
C   -1.481339
D   -1.077556
Name: 2013-01-01 00:00:00, dtype: float64

In [48]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [49]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [52]:
# select all rows with columns labeled 'A' and 'B'
# .loc labels are always inclusive
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.150418,-2.029136
2013-01-02,-0.637539,-0.861873
2013-01-03,-0.859411,0.271093
2013-01-04,-0.316656,-0.70465
2013-01-05,-0.874535,-2.045237
2013-01-06,0.042315,-0.051847


In [53]:
# select a single row and columns A and B (returns a pandas Series)
df.loc['20130102', ['A','B']]

A   -0.637539
B   -0.861873
Name: 2013-01-02 00:00:00, dtype: float64

In [55]:
# select a single value (scalar)
df.loc[dates[0],'A']

0.15041755013805783

In [56]:
# select a single value (scalar) using labels
df.loc['20130101','A']

0.15041755013805783

In [34]:
# .at does same as .loc
df.at[dates[0],'A']

0.72751486594547443

In [90]:
# select based on keywords (labels) using .isin()
# df2 is rebuilt here as a copy of df with an extra string column 'E', so that
# this cell works on a top-to-bottom run instead of relying on kernel state
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
# focusing on column E, select only rows whose value is 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556,one
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,one
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,four


In [91]:
# using .isin(), the contents of column E are rendered a pandas Series of type bool
df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

In [93]:
# therefore, when placed in brackets, wherever it is True, the corresponding label's row is returned
df2[df2['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556,one
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,one
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,four


##### Pandas Data Access Method: .iloc (index-based selection function)

In [82]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [83]:
# select the row at integer position 3 (the fourth row, since positions start at 0)
df.iloc[3]

A   -0.316656
B   -0.704650
C   -0.203266
D    1.840500
Name: 2013-01-04 00:00:00, dtype: float64

In [84]:
# select rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.316656,-0.70465
2013-01-05,-0.874535,-2.045237


In [85]:
# select by lists of integer position locations
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.637539,-0.639974
2013-01-03,-0.859411,0.275506
2013-01-05,-0.874535,-0.759203


In [86]:
# select a scalar (returns a float64)
df.iloc[1,2]

-0.63997379659155007

In [87]:
# select with fast access to a scalar (same effect as above method)
df.iat[1,2]

-0.63997379659155007

In [88]:
# create a new dataframe
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556,one
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,one
2013-01-03,-0.859411,0.271093,0.275506,-0.666769,two
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405,three
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,four
2013-01-06,0.042315,-0.051847,1.0411,-0.538173,three


##### Selection Using Boolean Indexing

In [70]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [71]:
# select using boolean indexing
# select "only rows where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [72]:
# select all rows but only show "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,,,
2013-01-02,,,,0.9061
2013-01-03,,0.271093,0.275506,
2013-01-04,,,,1.8405
2013-01-05,,,,
2013-01-06,0.042315,,1.0411,


### Setting Values in Pandas DataFrames

In [94]:
# show df again
df

Unnamed: 0,A,B,C,D
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061
2013-01-03,-0.859411,0.271093,0.275506,-0.666769
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005
2013-01-06,0.042315,-0.051847,1.0411,-0.538173


In [98]:
# make a new Series that I can add to the dataframe
# first make a date range
dates = pd.date_range('20130102', periods=6)
dates

DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
               '2013-01-06', '2013-01-07'],
              dtype='datetime64[ns]', freq='D')

In [99]:
# throw the date range into a new Pandas Series
s1 = pd.Series([1,2,3,4,5,6], index=dates)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [100]:
# now add the Series to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is Nan
# 2013-01-07 is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.150418,-2.029136,-1.481339,-1.077556,
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,1.0
2013-01-03,-0.859411,0.271093,0.275506,-0.666769,2.0
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405,3.0
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,4.0
2013-01-06,0.042315,-0.051847,1.0411,-0.538173,5.0


##### Pandas Data Setting Methods: .at (label-based), .iat (index-based)

In [105]:
# set a single value by row/column label with .at
# note: `dates` was reassigned above to start at 2013-01-02, so dates[0] here
# refers to 2013-01-02, not to the first row of df
df.at[dates[0],'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-2.029136,-1.481339,-1.077556,
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,0.0
2013-01-03,-0.859411,0.271093,0.275506,-0.666769,2.0
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405,3.0
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,4.0
2013-01-06,0.042315,-0.051847,1.0411,-0.538173,5.0


In [106]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-2.029136,-1.481339,-1.077556,
2013-01-02,-0.637539,-0.861873,-0.639974,0.9061,0.0
2013-01-03,-0.859411,0.271093,0.275506,-0.666769,2.0
2013-01-04,-0.316656,-0.70465,-0.203266,1.8405,3.0
2013-01-05,-0.874535,-2.045237,-0.759203,-2.945005,4.0
2013-01-06,0.042315,-0.051847,1.0411,-0.538173,5.0


##### Pandas Data Setting Methods: .loc (label-based), .iloc (index-based)

In [108]:
# .loc to set all rows in column D with a value from an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-2.029136,-1.481339,5,
2013-01-02,-0.637539,-0.861873,-0.639974,5,0.0
2013-01-03,-0.859411,0.271093,0.275506,5,2.0
2013-01-04,-0.316656,-0.70465,-0.203266,5,3.0
2013-01-05,-0.874535,-2.045237,-0.759203,5,4.0
2013-01-06,0.042315,-0.051847,1.0411,5,5.0


In [111]:
# .iloc to set all rows in the column at position 0 (column A)
# with values from an ndarray of 1's that has the same length as the df dataframe
df.iloc[:,0] = np.array([1] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1,-2.029136,-1.481339,5,
2013-01-02,1,-0.861873,-0.639974,5,0.0
2013-01-03,1,0.271093,0.275506,5,2.0
2013-01-04,1,-0.70465,-0.203266,5,3.0
2013-01-05,1,-2.045237,-0.759203,5,4.0
2013-01-06,1,-0.051847,1.0411,5,5.0


### Working with Missing Data

In [140]:
# restore and show the original date range
# `dates` was reassigned earlier to start at 2013-01-02 (to build s1), but the
# cells below expect the 2013-01-01 .. 2013-01-06 range used as df's index
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [141]:
# show original dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1,-2.029136,-1.481339,5,
2013-01-02,1,-0.861873,-0.639974,5,0.0
2013-01-03,1,0.271093,0.275506,5,2.0
2013-01-04,1,-0.70465,-0.203266,5,3.0
2013-01-05,1,-2.045237,-0.759203,5,4.0
2013-01-06,1,-0.051847,1.0411,5,5.0


In [144]:
# reindex allows you to change/add/delete the labels along a specified axis

# reindexes to only include the first four index labels (rows at positions 0 to 3)
# uses the same columns but adds one called 'E' that initially is NaN in every row
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1,-2.029136,-1.481339,5,,
2013-01-02,1,-0.861873,-0.639974,5,0.0,
2013-01-03,1,0.271093,0.275506,5,2.0,
2013-01-04,1,-0.70465,-0.203266,5,3.0,


In [145]:
# set column 'E' to 1 in the rows labeled dates[0] through dates[1]
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice the endpoint of a .loc label slice is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1,-2.029136,-1.481339,5,,1.0
2013-01-02,1,-0.861873,-0.639974,5,0.0,1.0
2013-01-03,1,0.271093,0.275506,5,2.0,
2013-01-04,1,-0.70465,-0.203266,5,3.0,


In [148]:
# drop any rows that have any missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1,-0.861873,-0.639974,5,0.0,1.0


In [150]:
# fill in any NaN
df1.fillna(value=100000)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1,-2.029136,-1.481339,5,100000.0,1.0
2013-01-02,1,-0.861873,-0.639974,5,0.0,1.0
2013-01-03,1,0.271093,0.275506,5,2.0,100000.0
2013-01-04,1,-0.70465,-0.203266,5,3.0,100000.0


In [152]:
# get boolean mask where values are NaN
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### Operations on DataFrames

In [154]:
# Replace any value greater than zero with its negation (additive inverse),
# working on a copy so that df itself is left unchanged
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-2.029136,-1.481339,-5,
2013-01-02,-1,-0.861873,-0.639974,-5,0.0
2013-01-03,-1,-0.271093,-0.275506,-5,-2.0
2013-01-04,-1,-0.70465,-0.203266,-5,-3.0
2013-01-05,-1,-2.045237,-0.759203,-5,-4.0
2013-01-06,-1,-0.051847,-1.0411,-5,-5.0


In [156]:
# calculate mean of each column
df2.mean()

A   -1.000000
B   -0.993972
C   -0.733398
D   -5.000000
F   -2.800000
dtype: float64

In [158]:
# calculate mean across rows
df2.mean(1)

2013-01-01   -2.377619
2013-01-02   -1.500369
2013-01-03   -1.709320
2013-01-04   -1.981583
2013-01-05   -2.560888
2013-01-06   -2.418589
Freq: D, dtype: float64

In [159]:
# create a new Series called "s" using dates as index
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [160]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [164]:
# show new dataframe again
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-2.029136,-1.481339,-5,
2013-01-02,-1,-0.861873,-0.639974,-5,0.0
2013-01-03,-1,-0.271093,-0.275506,-5,-2.0
2013-01-04,-1,-0.70465,-0.203266,-5,-3.0
2013-01-05,-1,-2.045237,-0.759203,-5,-4.0
2013-01-06,-1,-0.051847,-1.0411,-5,-5.0


In [165]:
# subtract the Series s from every column of the DataFrame, matching values by row index
# (a NaN in s gives NaN for that whole row in the result)
df2.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-2.0,-3.029136,-2.481339,-6.0,
2013-01-02,-4.0,-3.861873,-3.639974,-8.0,-3.0
2013-01-03,-6.0,-5.271093,-5.275506,-10.0,-7.0
2013-01-04,,,,,
2013-01-05,-7.0,-8.045237,-6.759203,-11.0,-10.0
2013-01-06,-9.0,-8.051847,-9.0411,-13.0,-13.0


In [170]:
# show df2 again
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-2.029136,-1.481339,-5,
2013-01-02,-1,-0.861873,-0.639974,-5,0.0
2013-01-03,-1,-0.271093,-0.275506,-5,-2.0
2013-01-04,-1,-0.70465,-0.203266,-5,-3.0
2013-01-05,-1,-2.045237,-0.759203,-5,-4.0
2013-01-06,-1,-0.051847,-1.0411,-5,-5.0


In [175]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df2.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-2.029136,-1.481339,-5,
2013-01-02,-2,-2.891009,-2.121313,-10,0.0
2013-01-03,-3,-3.162102,-2.396819,-15,-2.0
2013-01-04,-4,-3.866751,-2.600085,-20,-5.0
2013-01-05,-5,-5.911988,-3.359288,-25,-9.0
2013-01-06,-6,-5.963835,-4.400387,-30,-14.0


In [177]:
# create an ndarray of size 10 with random integers between 0 (inclusive) and 7 (exclusive)
nd_arr = np.random.randint(0,7, size=10)
nd_arr

array([2, 2, 6, 0, 0, 0, 5, 3, 1, 1])

In [178]:
# create a Series using nd_arr
s = pd.Series(nd_arr)
s

0    2
1    2
2    6
3    0
4    0
5    0
6    5
7    3
8    1
9    1
dtype: int64

In [185]:
# count frequency of each value
# in the output, the distinct value is on the left (index) and its count is on the right
freq_count = s.value_counts()
freq_count

0    3
2    2
1    2
6    1
5    1
3    1
dtype: int64

### String Methods

In [None]:
# create a new Series
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [187]:
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### Merge DataFrames

In [188]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,1.449203,1.252648,0.755138,-0.255721
1,-0.144607,-0.520571,-1.858436,-0.489573
2,0.861474,0.083702,-1.253261,0.370475
3,1.809813,0.755276,0.062147,-0.042861
4,0.082099,-1.218299,-0.903259,-0.650057
5,1.075693,-0.424684,-1.0317,0.689532
6,-0.027208,-0.179658,0.565576,1.262178
7,-0.094734,-1.791766,0.331683,0.094901
8,-1.172111,-0.079189,0.328237,-0.58405
9,-1.45478,0.982263,-1.726348,-0.827764


In [189]:
# break the DataFrame up into pieces via Python slicing
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list
# use pd.concat function to put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,1.449203,1.252648,0.755138,-0.255721
1,-0.144607,-0.520571,-1.858436,-0.489573
2,0.861474,0.083702,-1.253261,0.370475
3,1.809813,0.755276,0.062147,-0.042861
4,0.082099,-1.218299,-0.903259,-0.650057
5,1.075693,-0.424684,-1.0317,0.689532
6,-0.027208,-0.179658,0.565576,1.262178
7,-0.094734,-1.791766,0.331683,0.094901
8,-1.172111,-0.079189,0.328237,-0.58405
9,-1.45478,0.982263,-1.726348,-0.827764


### SQL-Style Merges

In [205]:
# create 'left' DataFrame with same values in 'key'
left = pd.DataFrame({'key': ['foo', 'foo'], 'left_value':[1,2]})
left

Unnamed: 0,key,left_value
0,foo,1
1,foo,2


In [206]:
# create 'right' DataFrame with same values in 'key'
right = pd.DataFrame({'key': ['foo', 'foo'], 'right_value':[3,4]})
right

Unnamed: 0,key,right_value
0,foo,3
1,foo,4


In [207]:
# merge based on the 'key' column
# notice each value in left_value is paired up with each of the two values in right_value
pd.merge(left, right, on='key')

Unnamed: 0,key,left_value,right_value
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [212]:
# create 'left' DataFrame with different values in 'key'
left = pd.DataFrame({'key':['foo', 'bar'], 'left_value':[1,2]})
left

Unnamed: 0,key,left_value
0,foo,1
1,bar,2


In [213]:
# create 'right' DataFrame with different values in 'key'
right = pd.DataFrame({'key':['foo','bar'], 'right_value':[3,4]})
right

Unnamed: 0,key,right_value
0,foo,3
1,bar,4


In [215]:
# merge based on 'key' column
# here, the value in 'key' aligns with each value from left_value and right_value
pd.merge(left, right, on='key')

Unnamed: 0,key,left_value,right_value
0,foo,1,3
1,bar,2,4


### Append rows to a dataframe

In [216]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# (a 2-D array with 8 rows of 4 items each).
# contents will be randomly generated floats from the standard normal distribution
print(np.random.randn(8, 4))

[[-0.02985109 -0.55302197  0.14713673 -1.85305735]
 [-1.18451343 -1.309431    0.69259974 -0.07688126]
 [ 0.37940599  1.16019361  1.11837864  0.30633562]
 [ 0.86842624  0.3833286  -0.25656944  0.65993949]
 [ 0.80547859 -0.75537613  0.65751036  1.63255012]
 [-0.65805026  0.04365358  0.93388965 -0.81351411]
 [-0.08671602  0.31807271 -2.69304628 -1.40059837]
 [ 0.61549661  0.46653227  0.73738814 -1.4807375 ]]


In [217]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,1.276949,0.683266,-0.328178,-0.298663
1,-0.09595,-2.78263,-0.295843,-0.695164
2,-0.543235,-1.401638,0.487333,-0.656329
3,-1.066229,-2.512431,-1.058879,0.14321
4,0.042367,0.532735,1.661634,-0.625917
5,-0.250367,-0.955773,1.362214,-0.075811
6,-0.756959,-0.887568,-0.068428,-2.729244
7,-0.708553,1.646146,1.98941,0.148916


In [222]:
# retrieve the row at integer position 3 (the fourth row) as a Series
s = df.iloc[3]
s

A   -1.066229
B   -2.512431
C   -1.058879
D    0.143210
Name: 3, dtype: float64

In [223]:
# copy that Series into a new appended row in the dataframe
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is turned into a one-row DataFrame and concatenated instead;
# ignore_index=True renumbers the rows 0..n in the result
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,1.276949,0.683266,-0.328178,-0.298663
1,-0.09595,-2.78263,-0.295843,-0.695164
2,-0.543235,-1.401638,0.487333,-0.656329
3,-1.066229,-2.512431,-1.058879,0.14321
4,0.042367,0.532735,1.661634,-0.625917
5,-0.250367,-0.955773,1.362214,-0.075811
6,-0.756959,-0.887568,-0.068428,-2.729244
7,-0.708553,1.646146,1.98941,0.148916
8,-1.066229,-2.512431,-1.058879,0.14321


### Grouping with groupby

In [224]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.648366,-1.567385
1,bar,one,0.974624,0.953101
2,foo,two,-2.07744,0.426757
3,bar,three,0.918227,0.485403
4,foo,two,3.580954,-0.537036
5,bar,two,-0.43727,1.329695
6,foo,one,0.742259,0.163458
7,foo,three,2.520842,0.044279


In [225]:
# group rows by the unique items in column 'A', then sum each group
# the numeric columns C and D are selected explicitly so that string column B
# is left out of the sum regardless of the pandas version in use
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.455581,2.768199
foo,4.118248,-1.469927


In [226]:
# group multiple columns, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.974624,0.953101
bar,three,0.918227,0.485403
bar,two,-0.43727,1.329695
foo,one,0.093892,-1.403926
foo,three,2.520842,0.044279
foo,two,1.503514,-0.110279


### Other Grouping Methods

In [229]:
# create a new dataframe
df = pd.DataFrame({'A':[0,1,1,0],
                   'B': ['A','B','C','D']})
df

Unnamed: 0,A,B
0,0,A
1,1,B
2,1,C
3,0,D


In [230]:
# return a Series
df.A

0    0
1    1
2    1
3    0
Name: A, dtype: int64

In [231]:
# boolean check to see which values in the Series are equal to a value
df.A == 1

0    False
1     True
2     True
3    False
Name: A, dtype: bool

In [233]:
# create a new DataFrame only with rows in which the condition was met
only_ones = df[df.A == 1]
only_ones

Unnamed: 0,A,B
1,1,B
2,1,C


### Working with MultiIndex

In [235]:
# zip(*[[list of n length],[list of n length]]) converts two lists into paired tuples.
# list() makes everything into a list

# Here my goal is to make a MultiIndex DataFrame like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

#first I make a list of tuples
my_tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
     ['one','two','one','two','one','two','one','two']]))
my_tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [240]:
# make the multi index object using .from_tuples method
my_multi_index = pd.MultiIndex.from_tuples(my_tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [241]:
# make a DataFrame from the MultiIndex object
# give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.68518,-0.459511
bar,two,0.440734,1.191328
baz,one,-0.092164,0.434279
baz,two,-0.25561,-1.262614
foo,one,1.434601,0.714548
foo,two,-0.818639,-0.537218
qux,one,0.197244,0.855465
qux,two,-1.072757,-0.18098


In [244]:
# slice the MultiIndexed DataFrame
df = df[:4]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.68518,-0.459511
bar,two,0.440734,1.191328
baz,one,-0.092164,0.434279
baz,two,-0.25561,-1.262614


In [245]:
# stack() method "compresses" a level in the DataFrame's columns:
# the column labels A and B become the innermost level of the row index,
# and the result here is a Series
# (stack the MultiIndexed df sliced above, not the unrelated df2)
stacked = df.stack()
stacked

first  second   
bar    one     A   -1.685180
               B   -0.459511
       two     A    0.440734
               B    1.191328
baz    one     A   -0.092164
               B    0.434279
       two     A   -0.255610
               B   -1.262614
dtype: float64

In [246]:
# and of course you can unstack it
# note, by default the last index level is unstacked; pass a level number or name
# inside the parens (e.g. stacked.unstack(0)) to unstack a different level
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.68518,-0.459511
bar,two,0.440734,1.191328
baz,one,-0.092164,0.434279
baz,two,-0.25561,-1.262614


### Time Series

In [249]:
# create the date DateTimeIndex object using pandas date_range() function
rng = pd.date_range('1/1/2012', periods=10, freq='S')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09'],
              dtype='datetime64[ns]', freq='S')

In [250]:
# create a Series object filled with random integers from 0 (inclusive) to 500 (exclusive)
# one value per timestamp; the index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()

2012-01-01 00:00:00     45
2012-01-01 00:00:01    297
2012-01-01 00:00:02     97
2012-01-01 00:00:03    285
2012-01-01 00:00:04    129
Freq: S, dtype: int64

In [251]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc.head()

2012-01-01 00:00:00+00:00     45
2012-01-01 00:00:01+00:00    297
2012-01-01 00:00:02+00:00     97
2012-01-01 00:00:03+00:00    285
2012-01-01 00:00:04+00:00    129
Freq: S, dtype: int64

In [252]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern').head()

2011-12-31 19:00:00-05:00     45
2011-12-31 19:00:01-05:00    297
2011-12-31 19:00:02-05:00     97
2011-12-31 19:00:03-05:00    285
2011-12-31 19:00:04-05:00    129
Freq: S, dtype: int64

# Stopped doing tutorial at 'Categoricals' section