# 10 Minutes to Pandas Tutorial

From pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) Tutorial.


In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [94]:
# create a pandas Series
# here labels are 0 to 5
# notice it is default type float64
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [95]:
# other pandas objects, like date_range() make an ndarray for you
# date_range() object returns a filled-in DatetimeIndex object
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [96]:
# pandas DataFrame() object creates a dataframe for me
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [144]:
# use DataFrame() object with a Python dict to create the dataframe
df2 = pd.DataFrame({'A':1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1,index=list('ABCD'),dtype='float32'),
                    # notice dict item C creates my index labels
                   'D': np.array([3] * 4,dtype='int32'),
                    # D is a numpy array with number 3 given 4 times
                   'E': pd.Categorical(["test","train","test","train"]),
                   'F': 'foo',
                   'G': [1,2,3,4]})
                    # all dict values must have same number of items
                    # or be a item (value which is copied 4 times)
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [98]:
# show the data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [99]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F
A,1.0,2013-01-02,1.0,3,test,foo
B,1.0,2013-01-02,1.0,3,train,foo
C,1.0,2013-01-02,1.0,3,test,foo


In [100]:
# see bottom n rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F
C,1.0,2013-01-02,1.0,3,test,foo
D,1.0,2013-01-02,1.0,3,train,foo


In [101]:
# return index (rows) which is an immutable ndarray
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [102]:
# display columns which is an immutable ndarray
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [103]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [104]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.386469,-0.189484,-0.059919,0.068884
std,1.164113,0.955775,1.636833,0.889634
min,-0.949104,-1.723352,-2.441679,-0.8612
25%,-0.591397,-0.58378,-0.453349,-0.604781
50%,0.500316,-0.075842,-0.27465,-0.025654
75%,1.010939,0.380163,0.33925,0.475053
max,2.04144,0.959061,2.600497,1.481682


In [105]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.949104,0.076911,-0.814166,0.923721,2.04144,1.040011
B,0.343394,-0.613348,-0.495077,-1.723352,0.39242,0.959061
C,0.535402,-0.504433,2.600497,-0.249205,-2.441679,-0.300095
D,0.498422,-0.8612,-0.654291,-0.456253,1.481682,0.404944


In [106]:
# sort by an axis (by rows or by columns)
df.sort_index(axis=1, ascending=False)
# axis=1 sorts by row labels, axis=0 by column labels
# ascending=False sorts in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.498422,0.535402,0.343394,-0.949104
2013-01-02,-0.8612,-0.504433,-0.613348,0.076911
2013-01-03,-0.654291,2.600497,-0.495077,-0.814166
2013-01-04,-0.456253,-0.249205,-1.723352,0.923721
2013-01-05,1.481682,-2.441679,0.39242,2.04144
2013-01-06,0.404944,-0.300095,0.959061,1.040011


In [107]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [108]:
# Python / Numpy expressions for selecting and setting are fine, but 
# for production code use pandas data access methods:
# .at, .iat, .loc, .iloc and .ix

In [109]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [110]:
# Selecting a single column, which yields a Series, equivalent to df.A
# both are a pandas Series
df['A']

2013-01-01   -0.949104
2013-01-02    0.076911
2013-01-03   -0.814166
2013-01-04    0.923721
2013-01-05    2.041440
2013-01-06    1.040011
Freq: D, Name: A, dtype: float64

In [111]:
df.A

2013-01-01   -0.949104
2013-01-02    0.076911
2013-01-03   -0.814166
2013-01-04    0.923721
2013-01-05    2.041440
2013-01-06    1.040011
Freq: D, Name: A, dtype: float64

In [112]:
# select via [] which slices at indices
# 0th (inclusive) to the 2th (exclusive) row (so, row 0 and row 1)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612


In [113]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612


In [114]:
# selection by label, returns a pandas Series
df.loc[dates[0]]

A   -0.949104
B    0.343394
C    0.535402
D    0.498422
Name: 2013-01-01 00:00:00, dtype: float64

In [115]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [116]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [117]:
# returns first row from the df dataframe, a pandas Series
df.loc[dates[0]]

A   -0.949104
B    0.343394
C    0.535402
D    0.498422
Name: 2013-01-01 00:00:00, dtype: float64

In [118]:
# here the ':' indicates all rows, and the list with ['A','B'] indicate columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.949104,0.343394
2013-01-02,0.076911,-0.613348
2013-01-03,-0.814166,-0.495077
2013-01-04,0.923721,-1.723352
2013-01-05,2.04144,0.39242
2013-01-06,1.040011,0.959061


In [119]:
# same exact selection but using index labels
# note, index (rows) selection is inclusive
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.949104,0.343394
2013-01-02,0.076911,-0.613348
2013-01-03,-0.814166,-0.495077
2013-01-04,0.923721,-1.723352
2013-01-05,2.04144,0.39242
2013-01-06,1.040011,0.959061


In [120]:
# just from a single row and columns A and B returns a pandas Series
df.loc['20130102', ['A','B']]

A    0.076911
B   -0.613348
Name: 2013-01-02 00:00:00, dtype: float64

In [121]:
# get a single value (scalar)
df.loc[dates[0],'A']

-0.94910372153375233

In [122]:
# same exact value but using a specific label
df.loc['20130101','A']

-0.94910372153375233

In [123]:
# .at is same as above
df.at[dates[0],'A']

-0.94910372153375233

In [124]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [125]:
# show row at the 3th row
df.iloc[3]

A    0.923721
B   -1.723352
C   -0.249205
D   -0.456253
Name: 2013-01-04 00:00:00, dtype: float64

In [126]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.923721,-1.723352
2013-01-05,2.04144,0.39242


In [127]:
# by lists of integer position locations
# shows the 1th, 2th and 4th rows, 0th and 2th columns
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.076911,-0.504433
2013-01-03,-0.814166,2.600497
2013-01-05,2.04144,-2.441679


In [128]:
# for slicing columns explicitly
# all rows, 1th (inclusive) to 3th (exclusiv) columns
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,0.343394,0.535402
2013-01-02,-0.613348,-0.504433
2013-01-03,-0.495077,2.600497
2013-01-04,-1.723352,-0.249205
2013-01-05,0.39242,-2.441679
2013-01-06,0.959061,-0.300095


In [129]:
# explicitly get a value, returns a float64
df.iloc[1,2]

-0.50443309585800211

In [130]:
# fast access to a scalar(same effect as above method), returns a float64
df.iat[1,1]

-0.61334806471991277

In [131]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.949104,0.343394,0.535402,0.498422
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [132]:
# some boolean indexing
# "where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253
2013-01-05,2.04144,0.39242,-2.441679,1.481682
2013-01-06,1.040011,0.959061,-0.300095,0.404944


In [133]:
# "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.343394,0.535402,0.498422
2013-01-02,0.076911,,,
2013-01-03,,,2.600497,
2013-01-04,0.923721,,,
2013-01-05,2.04144,0.39242,,1.481682
2013-01-06,1.040011,0.959061,,0.404944


In [134]:
# filter through the dataframe using isin()

# make a copy of the dataframe called df2
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2i

NameError: name 'df2i' is not defined

In [None]:
# referencing column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

# !!!I'd like to know how the slice is made here. Question
# posted at: http://stackoverflow.com/questions/41733696/how-does-pandas-use-a-series-object-to-slice-a-data-frame

In [None]:
# the contents of df2[] below are a pandas Series of type bool
# wherever it is True, the corresponding label's row is returned
df2['E'].isin(['one','four'])

#### Setting values in a Data Frame

In [None]:
df

In [135]:
# make a new Series that I can add to the dataframe
# !!! REMEMBER, a dataframe can be thought of as a dict of Series
# where the Series labels form a union if redundant
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [136]:
# now add it to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is Nan
# 2013-01-07 is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.949104,0.343394,0.535402,0.498422,
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612,1.0
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291,2.0
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253,3.0
2013-01-05,2.04144,0.39242,-2.441679,1.481682,4.0
2013-01-06,1.040011,0.959061,-0.300095,0.404944,5.0


In [137]:
# now set values by label index and column label
df.at[dates[0], 'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.949104,0.343394,0.535402,0.498422,0.0
2013-01-02,0.076911,-0.613348,-0.504433,-0.8612,1.0
2013-01-03,-0.814166,-0.495077,2.600497,-0.654291,2.0
2013-01-04,0.923721,-1.723352,-0.249205,-0.456253,3.0
2013-01-05,2.04144,0.39242,-2.441679,1.481682,4.0
2013-01-06,1.040011,0.959061,-0.300095,0.404944,5.0


In [None]:
# set values by position index
df.iat[0,0] = 0
df

In [None]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

In [None]:
# conduct a where operation to replace any value greater
# than zero with its inverse
df2 = df.copy()
df2[df2 > 0] = -df2
df2

#### Missing Data

In [None]:
# show original dataframe
df

In [None]:
# reindex allow you to change/add/delete the index of a specified axis

# reindexes to only include 0th, 1th, 2th, 3th index (rows)
# uses same columns but adds one call 'E' that initially has NaN in all
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

In [None]:
# change the 0th and 1th values to 1.0
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice enpoint on rows is inclusive
df1

In [None]:
# drop any rows that have missing data
df1.dropna(how='any')

In [None]:
# fill in missing data
df1.fillna(value=5)

In [None]:
# get boolean mask where values are nan
pd.isnull(df1)

### Operations

#### Stats

In [None]:
# show original dataframe
df

In [None]:
# the mean of each column
df.mean()

In [None]:
# mean across rows
df.mean(1)

In [None]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

In [None]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

In [None]:
# show df dataframe again
df

In [None]:
# deletes the values in Series s from the corresponding
# index from df
df.sub(s, axis='index')

#### Apply

In [None]:
df

In [None]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

#### Histogramming

In [None]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

In [None]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

#### String Methods

In [None]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

#### Merge

In [None]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

In [None]:
# break it into pieces via Python slicing - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

#### Join

In [None]:
# SQL style merges
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval':[3,4]})
left

In [None]:
right

In [None]:
# on='key' matches a value with others given the same key
pd.merge(left, right, on='key')

In [None]:
# OR...
left = pd.DataFrame({'key':['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'], 'rval':[3,4]})
left

In [None]:
right

In [None]:
# here, because there is only one matching 'foo', 1 merges with 4
# and 2 merges with 5
pd.merge(left, right, on='key')

#### Append rows to a dataframe

In [None]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# which is just a list of 8 lists with 4 items in each
# contents will be randomly generated floats from the z dist
print(np.random.randn(8, 4))

In [None]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

In [None]:
# retreive the 3th row
s = df.iloc[3]
s

In [None]:
# copy into a new appended row in dataframe
df.append(s, ignore_index=True)

#### Grouping

In [None]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

In [None]:
# group items by unique items in column 'A', then sum them up to create new value
df.groupby('A').sum()

In [None]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

### Reshaping

In [None]:
# zip(*[[list of n lenght],[list of n length]]) converts things into tuples.
# list() adds everything to a list

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

#first I make a list of tuples
tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
     ['one','two','one','two','one','two','one','two']]))
tuples

In [None]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

In [None]:
# now Mi make a dataframe with random numbers and the multi index
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

In [None]:
# I can slice it too if I want
df2 = df[:4]
df2

In [None]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

In [None]:
# and of course you can unstack it
# note, this is done to multiple levels with a number inside the parens
stacked.unstack()

#### Pivot tables

In [None]:
# create my dataframe using a dictionary
df = pd.DataFrame({'A': ['one','one','two','three'] * 3,
                  'B': ['A','B','C'] * 4,
                  'C': ['foo','foo','foo','bar','bar','bar'] * 2,
                  'D': [1,2,3,4,5,6,7,8,9,10,11,12],
                  'E': [13,14,15,16,17,18,19,20,21,22,23,24]})
df

In [None]:
# make the pivot table
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

#### Time Series

In [None]:
# create the date DateTimeIndex object using pandas date_range() function
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng

In [None]:
# create a Series object filled with random numbers between 0 and 500
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

In [None]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc

In [None]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern')

#### Stopped at 'Categoricals'