# 10 Minutes to Pandas Tutorial
From pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) Tutorial.

In [1]:
# import declarations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Object Creation

##### Making Pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [2]:
# make a pandas Series
# here labels are 0 to 5
# notice it is default type float64 and one value is numpy NaN
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# make a Series with a date range
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

##### Make Pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [4]:
# create DataFrame using DataFrame() object
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates (above) becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [5]:
# reorder columns
df[['D','C','B','A']]

Unnamed: 0,D,C,B,A
2013-01-01,0.491316,0.158277,-0.99796,-1.471273
2013-01-02,-2.186124,-0.104695,0.424673,-1.042766
2013-01-03,-0.482412,0.344959,0.229958,-0.634885
2013-01-04,0.391039,-1.598003,-0.332753,-0.097947
2013-01-05,-0.856687,-0.217042,-0.327554,1.000791
2013-01-06,-1.421834,0.599156,-1.023509,0.670908


In [6]:
# put order back
df[['A','B','C','D']]

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [7]:
# make DataFrame() object using a Python dict
df2 = pd.DataFrame({'A':1.,
                   'B': pd.Timestamp('20130102'),
                   'C': pd.Series(1,index=list('ABCD'),dtype='float32'),
                    # notice dict item C creates my index labels
                   'D': np.array([3] * 4,dtype='int32'),
                    # D is a numpy array with number 3 given 4 times
                   'E': pd.Categorical(["test","train","test","train"]),
                   'F': 'foo',
                   'G': [1,2,3,4]})
                    # all dict values must have same number of items
                    # or be a item (value which is copied 4 times)
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


### Viewing Data/Changing DataFrames

In [8]:
# show data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [9]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3


In [10]:
# see bottom n rows
df2.tail(n=2)

Unnamed: 0,A,B,C,D,E,F,G
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [11]:
# return index (rows) which is an immutable ndarray
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# display columns which is an immutable ndarray
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [13]:
# return column string, which is indexed starting at 0
df2.columns[0]

'A'

In [14]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [15]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.262529,-0.337858,-0.136225,-0.67745
std,0.96983,0.60164,0.775032,1.040005
min,-1.471273,-1.023509,-1.598003,-2.186124
25%,-0.940796,-0.831658,-0.188955,-1.280547
50%,-0.366416,-0.330154,0.026791,-0.669549
75%,0.478694,0.09058,0.298288,0.172677
max,1.000791,0.424673,0.599156,0.491316


In [16]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.471273,-1.042766,-0.634885,-0.097947,1.000791,0.670908
B,-0.99796,0.424673,0.229958,-0.332753,-0.327554,-1.023509
C,0.158277,-0.104695,0.344959,-1.598003,-0.217042,0.599156
D,0.491316,-2.186124,-0.482412,0.391039,-0.856687,-1.421834


In [17]:
# sort by an axis (by rows or by columns)
# axis=1 sorts by row labels, axis=0 by column labels
# ascending=False sorts in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.491316,0.158277,-0.99796,-1.471273
2013-01-02,-2.186124,-0.104695,0.424673,-1.042766
2013-01-03,-0.482412,0.344959,0.229958,-0.634885
2013-01-04,0.391039,-1.598003,-0.332753,-0.097947
2013-01-05,-0.856687,-0.217042,-0.327554,1.000791
2013-01-06,-1.421834,0.599156,-1.023509,0.670908


In [18]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-06,0.670908,-1.023509,0.599156,-1.421834
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124


### Selecting in DataFrames using Pandas functions
Python / Numpy expressions for selecting and setting are fine, but for production code use pandas data access methods: .at, .iat, .loc, .iloc and .ix

##### Python/Numpy expressions

In [19]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [20]:
# Select a single column, which yields a Series
# equivalent to df.A
df['A']

2013-01-01   -1.471273
2013-01-02   -1.042766
2013-01-03   -0.634885
2013-01-04   -0.097947
2013-01-05    1.000791
2013-01-06    0.670908
Freq: D, Name: A, dtype: float64

In [21]:
# Select via [] which slices at indices
# 0th (inclusive) to the 2th (exclusive) row (so, row 0 and row 1)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124


In [22]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124


##### Pandas Data Access Methods: .loc and .at (label-based selection functions)

In [23]:
# .loc is label-based selection
# select by label, returns a pandas Series
df.loc[dates[0]]

A   -1.471273
B   -0.997960
C    0.158277
D    0.491316
Name: 2013-01-01 00:00:00, dtype: float64

In [24]:
# select by label (same as above)
df.loc['2013-01-01']

A   -1.471273
B   -0.997960
C    0.158277
D    0.491316
Name: 2013-01-01 00:00:00, dtype: float64

In [25]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [26]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [27]:
# select all rows with columns labeled 'A' and 'B'
# .loc labels are always inclusive
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.471273,-0.99796
2013-01-02,-1.042766,0.424673
2013-01-03,-0.634885,0.229958
2013-01-04,-0.097947,-0.332753
2013-01-05,1.000791,-0.327554
2013-01-06,0.670908,-1.023509


In [28]:
# select a single row and columns A and B (returns a pandas Series)
df.loc['20130102', ['A','B']]

A   -1.042766
B    0.424673
Name: 2013-01-02 00:00:00, dtype: float64

In [29]:
# select a single value (scalar)
df.loc[dates[0],'A']

-1.471272789825415

In [30]:
# select a single value (scalar) using labels
df.loc['20130101','A']

-1.471272789825415

In [31]:
# .at does same as .loc
df.at[dates[0],'A']

-1.471272789825415

In [32]:
# select based on keyewords (labels) using .isin()
# focusing on column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E,F,G


In [33]:
# using .isin(), the contents of column E are rendered a pandas Series of type bool
df2['E'].isin(['one','four'])

A    False
B    False
C    False
D    False
Name: E, dtype: bool

In [34]:
# therefore, when placed in brackets, wherever it is True, the corresponding label's row is returned
df2[df2['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E,F,G


##### Pandas Data Access Method: .iloc (index-based selection function)

In [35]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [36]:
# select 3th row
df.iloc[3]

A   -0.097947
B   -0.332753
C   -1.598003
D    0.391039
Name: 2013-01-04 00:00:00, dtype: float64

In [37]:
# select rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.097947,-0.332753
2013-01-05,1.000791,-0.327554


In [38]:
# select by lists of integer position locations
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-1.042766,-0.104695
2013-01-03,-0.634885,0.344959
2013-01-05,1.000791,-0.217042


In [39]:
# select a scalar (returns a float64)
df.iloc[1,2]

-0.10469496007682833

In [40]:
# select with fast access to a scalar (same effect as above method)
df.iat[1,2]

-0.10469496007682833

In [41]:
# create a new dataframe
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.471273,-0.99796,0.158277,0.491316,one
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124,one
2013-01-03,-0.634885,0.229958,0.344959,-0.482412,two
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039,three
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687,four
2013-01-06,0.670908,-1.023509,0.599156,-1.421834,three


##### Selection Using Boolean Indexing

In [42]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [43]:
# select using boolean indexing
# select "only rows where any value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [44]:
# select all rows but only show "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.158277,0.491316
2013-01-02,,0.424673,,
2013-01-03,,0.229958,0.344959,
2013-01-04,,,,0.391039
2013-01-05,1.000791,,,
2013-01-06,0.670908,,0.599156,


### Setting Values in Pandas DataFrames

In [45]:
# show df again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.471273,-0.99796,0.158277,0.491316
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124
2013-01-03,-0.634885,0.229958,0.344959,-0.482412
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687
2013-01-06,0.670908,-1.023509,0.599156,-1.421834


In [46]:
# make a new Series that I can add to the dataframe
# first make a date range
dates = pd.date_range('20130102', periods=6)
dates

DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
               '2013-01-06', '2013-01-07'],
              dtype='datetime64[ns]', freq='D')

In [47]:
# throw the date range into a new Pandas Series
s1 = pd.Series([1,2,3,4,5,6], index=dates)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [48]:
# now add the Series to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is Nan
# 2013-01-07 is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.471273,-0.99796,0.158277,0.491316,
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124,1.0
2013-01-03,-0.634885,0.229958,0.344959,-0.482412,2.0
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039,3.0
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687,4.0
2013-01-06,0.670908,-1.023509,0.599156,-1.421834,5.0


##### Pandas Data Setting Methods: .at (label-based), .iat (index-based)

In [49]:
# set value at labels
df.at[dates[0],'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.471273,-0.99796,0.158277,0.491316,
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124,0.0
2013-01-03,-0.634885,0.229958,0.344959,-0.482412,2.0
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039,3.0
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687,4.0
2013-01-06,0.670908,-1.023509,0.599156,-1.421834,5.0


In [50]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.99796,0.158277,0.491316,
2013-01-02,-1.042766,0.424673,-0.104695,-2.186124,0.0
2013-01-03,-0.634885,0.229958,0.344959,-0.482412,2.0
2013-01-04,-0.097947,-0.332753,-1.598003,0.391039,3.0
2013-01-05,1.000791,-0.327554,-0.217042,-0.856687,4.0
2013-01-06,0.670908,-1.023509,0.599156,-1.421834,5.0


##### Pandas Data Setting Methods: .loc (label-based), .iloc (index-based)

In [51]:
# .loc to set all rows in column D with a value from an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.99796,0.158277,5,
2013-01-02,-1.042766,0.424673,-0.104695,5,0.0
2013-01-03,-0.634885,0.229958,0.344959,5,2.0
2013-01-04,-0.097947,-0.332753,-1.598003,5,3.0
2013-01-05,1.000791,-0.327554,-0.217042,5,4.0
2013-01-06,0.670908,-1.023509,0.599156,5,5.0


In [52]:
# .iloc to set all rows in column D with a value from an ndarray of 5's of length of df dataframe
df.iloc[:,0] = np.array([1] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1,-0.99796,0.158277,5,
2013-01-02,1,0.424673,-0.104695,5,0.0
2013-01-03,1,0.229958,0.344959,5,2.0
2013-01-04,1,-0.332753,-1.598003,5,3.0
2013-01-05,1,-0.327554,-0.217042,5,4.0
2013-01-06,1,-1.023509,0.599156,5,5.0


### Working with Missing Data

In [53]:
# show dates range again
dates

DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05',
               '2013-01-06', '2013-01-07'],
              dtype='datetime64[ns]', freq='D')

In [54]:
# show original dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1,-0.99796,0.158277,5,
2013-01-02,1,0.424673,-0.104695,5,0.0
2013-01-03,1,0.229958,0.344959,5,2.0
2013-01-04,1,-0.332753,-1.598003,5,3.0
2013-01-05,1,-0.327554,-0.217042,5,4.0
2013-01-06,1,-1.023509,0.599156,5,5.0


In [55]:
# reindex allow you to change/add/delete the indices of a specified axis

# reindexes to only include 0th, 1th, 2th, 3th index (rows)
# uses same columns but adds one call 'E' that initially has NaN in all
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1,0.424673,-0.104695,5,0.0,
2013-01-03,1,0.229958,0.344959,5,2.0,
2013-01-04,1,-0.332753,-1.598003,5,3.0,
2013-01-05,1,-0.327554,-0.217042,5,4.0,


In [56]:
# change the 0th and 1th values in a certain column
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice enpoint on rows is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1,0.424673,-0.104695,5,0.0,1.0
2013-01-03,1,0.229958,0.344959,5,2.0,1.0
2013-01-04,1,-0.332753,-1.598003,5,3.0,
2013-01-05,1,-0.327554,-0.217042,5,4.0,


In [57]:
# drop any rows that have any missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1,0.424673,-0.104695,5,0.0,1.0
2013-01-03,1,0.229958,0.344959,5,2.0,1.0


In [58]:
# fill in any NaN
df1.fillna(value=100000)

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1,0.424673,-0.104695,5,0.0,1.0
2013-01-03,1,0.229958,0.344959,5,2.0,1.0
2013-01-04,1,-0.332753,-1.598003,5,3.0,100000.0
2013-01-05,1,-0.327554,-0.217042,5,4.0,100000.0


In [59]:
# get boolean mask where values are NaN
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,False
2013-01-04,False,False,False,False,False,True
2013-01-05,False,False,False,False,False,True


### Operations on DataFrames

In [60]:
# Replace any value greater than zero with its inverse
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-0.99796,-0.158277,-5,
2013-01-02,-1,-0.424673,-0.104695,-5,0.0
2013-01-03,-1,-0.229958,-0.344959,-5,-2.0
2013-01-04,-1,-0.332753,-1.598003,-5,-3.0
2013-01-05,-1,-0.327554,-0.217042,-5,-4.0
2013-01-06,-1,-1.023509,-0.599156,-5,-5.0


In [61]:
# calculate mean of each column
df2.mean()

A   -1.000000
B   -0.556068
C   -0.503689
D   -5.000000
F   -2.800000
dtype: float64

In [62]:
# calculate mean across rows
df2.mean(1)

2013-01-01   -1.789059
2013-01-02   -1.305874
2013-01-03   -1.714983
2013-01-04   -2.186151
2013-01-05   -2.108919
2013-01-06   -2.524533
Freq: D, dtype: float64

In [63]:
# create a new Series called "s" using dates as index
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-02    1.0
2013-01-03    3.0
2013-01-04    5.0
2013-01-05    NaN
2013-01-06    6.0
2013-01-07    8.0
Freq: D, dtype: float64

In [64]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-02    NaN
2013-01-03    NaN
2013-01-04    1.0
2013-01-05    3.0
2013-01-06    5.0
2013-01-07    NaN
Freq: D, dtype: float64

In [65]:
# show new dataframe again
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-0.99796,-0.158277,-5,
2013-01-02,-1,-0.424673,-0.104695,-5,0.0
2013-01-03,-1,-0.229958,-0.344959,-5,-2.0
2013-01-04,-1,-0.332753,-1.598003,-5,-3.0
2013-01-05,-1,-0.327554,-0.217042,-5,-4.0
2013-01-06,-1,-1.023509,-0.599156,-5,-5.0


In [66]:
# delete each value from the Series s from each item in the corresponding index from the DataFrame
df2.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,-2.0,-1.424673,-1.104695,-6.0,-1.0
2013-01-03,-4.0,-3.229958,-3.344959,-8.0,-5.0
2013-01-04,-6.0,-5.332753,-6.598003,-10.0,-8.0
2013-01-05,,,,,
2013-01-06,-7.0,-7.023509,-6.599156,-11.0,-11.0
2013-01-07,,,,,


In [67]:
# show df2 again
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-0.99796,-0.158277,-5,
2013-01-02,-1,-0.424673,-0.104695,-5,0.0
2013-01-03,-1,-0.229958,-0.344959,-5,-2.0
2013-01-04,-1,-0.332753,-1.598003,-5,-3.0
2013-01-05,-1,-0.327554,-0.217042,-5,-4.0
2013-01-06,-1,-1.023509,-0.599156,-5,-5.0


In [68]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df2.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,-1,-0.99796,-0.158277,-5,
2013-01-02,-2,-1.422632,-0.262972,-10,0.0
2013-01-03,-3,-1.652591,-0.607931,-15,-2.0
2013-01-04,-4,-1.985344,-2.205934,-20,-5.0
2013-01-05,-5,-2.312898,-2.422976,-25,-9.0
2013-01-06,-6,-3.336407,-3.022132,-30,-14.0


In [69]:
# create an ndarray of size 10 with random integers between 0 (inclusive) and 7 (exclusive)
nd_arr = np.random.randint(0,7, size=10)
nd_arr

array([5, 6, 5, 1, 6, 6, 1, 4, 5, 5])

In [70]:
# create a Series using nd_arr
s = pd.Series(nd_arr)
s

0    5
1    6
2    5
3    1
4    6
5    6
6    1
7    4
8    5
9    5
dtype: int64

In [71]:
# count frequency of each value
# frequency on left, value counted on right
freq_count = s.value_counts()
freq_count

5    4
6    3
1    2
4    1
dtype: int64

### String Methods

In [72]:
# create a new Series
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [73]:
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

### Merge DataFrames

In [74]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,0.155363,-2.03533,1.765712,-0.541034
1,-1.858936,2.219272,-1.78492,-0.386984
2,0.113371,2.198677,0.257074,-1.15553
3,-0.296417,0.300888,0.237896,0.124705
4,0.780164,0.511484,-0.287396,-0.465423
5,1.812912,0.15592,0.359084,1.205338
6,-0.907467,1.171502,-0.902072,-1.234397
7,0.275188,-0.595032,-0.154225,1.594004
8,2.046189,0.783518,1.237867,-0.33194
9,0.497181,-0.934858,-1.03094,0.434017


In [75]:
# break the DataFrame up into pieces via Python slicing
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list
# use pd.concat function to put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,0.155363,-2.03533,1.765712,-0.541034
1,-1.858936,2.219272,-1.78492,-0.386984
2,0.113371,2.198677,0.257074,-1.15553
3,-0.296417,0.300888,0.237896,0.124705
4,0.780164,0.511484,-0.287396,-0.465423
5,1.812912,0.15592,0.359084,1.205338
6,-0.907467,1.171502,-0.902072,-1.234397
7,0.275188,-0.595032,-0.154225,1.594004
8,2.046189,0.783518,1.237867,-0.33194
9,0.497181,-0.934858,-1.03094,0.434017


### SQL-Style Merges

In [76]:
# create 'left' DataFrame with same values in 'key'
left = pd.DataFrame({'key': ['foo', 'foo'], 'left_value':[1,2]})
left

Unnamed: 0,key,left_value
0,foo,1
1,foo,2


In [77]:
# create 'right' DataFrame with same values in 'key'
right = pd.DataFrame({'key': ['foo', 'foo'], 'right_value':[3,4]})
right

Unnamed: 0,key,right_value
0,foo,3
1,foo,4


In [78]:
# merge based certain 'key' column
# notice each value in left_value is paired up with each of the two values in right_value
pd.merge(left, right, on='key')

Unnamed: 0,key,left_value,right_value
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [79]:
# create 'left' DataFrame with different values in 'key'
left = pd.DataFrame({'key':['foo', 'bar'], 'left_value':[1,2]})
left

Unnamed: 0,key,left_value
0,foo,1
1,bar,2


In [80]:
# create 'right' DataFrame with different values in 'key'
right = pd.DataFrame({'key':['foo','bar'], 'right_value':[3,4]})
right

Unnamed: 0,key,right_value
0,foo,3
1,bar,4


In [81]:
# merge based on 'key' column
# here, the value in 'key' aligns with each value from left_value and right_value
pd.merge(left, right, on='key')

Unnamed: 0,key,left_value,right_value
0,foo,1,3
1,bar,2,4


### Append rows to a dataframe

In [82]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# which is just a list of 8 lists with 4 items in each.
# contents will be randomly generated floats from the z distribution
print(np.random.randn(8, 4))

[[-1.18015073  0.03150943  1.67929806  1.06925492]
 [-0.84988999  0.94301899 -0.83143271 -0.42113916]
 [-0.30591412  0.54911466 -1.62847523 -0.18314092]
 [-1.77298638 -1.08732465  1.37648168 -0.75272782]
 [ 0.96113291  0.22015491 -0.2236096   0.82095913]
 [ 0.29162966 -1.18075792  0.14927499 -0.994416  ]
 [ 0.0594584  -1.46628076 -0.32747839 -0.40710315]
 [-0.76307168 -1.30866565 -0.80832299 -0.3871937 ]]


In [83]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,-0.426557,-1.215287,0.168398,1.117709
1,-1.917093,1.453677,-0.81167,-0.132779
2,-0.2926,-1.51177,0.772565,-1.562025
3,1.216355,0.067317,0.305577,-0.575279
4,-0.821764,-0.235865,-0.00198,-0.31183
5,-0.257812,-0.632816,-0.058521,-0.966496
6,-2.016405,-0.094683,1.458095,0.341253
7,1.005716,-0.405661,1.527876,0.619978


In [84]:
# retreive the 3th row as a Series
s = df.iloc[3]
s

A    1.216355
B    0.067317
C    0.305577
D   -0.575279
Name: 3, dtype: float64

In [85]:
# copy that Series into a new appended row in dataframe
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.426557,-1.215287,0.168398,1.117709
1,-1.917093,1.453677,-0.81167,-0.132779
2,-0.2926,-1.51177,0.772565,-1.562025
3,1.216355,0.067317,0.305577,-0.575279
4,-0.821764,-0.235865,-0.00198,-0.31183
5,-0.257812,-0.632816,-0.058521,-0.966496
6,-2.016405,-0.094683,1.458095,0.341253
7,1.005716,-0.405661,1.527876,0.619978
8,1.216355,0.067317,0.305577,-0.575279


### Grouping with groupby

In [87]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.42766,0.183651
1,bar,one,-0.12652,-0.550344
2,foo,two,1.353811,1.518257
3,bar,three,-0.334085,0.535048
4,foo,two,-2.089337,1.251697
5,bar,two,1.200004,0.776113
6,foo,one,-0.954357,1.911812
7,foo,three,-0.34811,0.203763


In [91]:
# group items by unique items in column 'A', then sum them up to create new value
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.739399,0.760817
foo,-1.610334,5.06918


In [92]:
# group multiple columns, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.12652,-0.550344
bar,three,-0.334085,0.535048
bar,two,1.200004,0.776113
foo,one,-0.526698,2.095463
foo,three,-0.34811,0.203763
foo,two,-0.735526,2.769954


### Other Grouping Methods

In [None]:
# create a new dataframe
df = pd.DataFrame({'A':[0,1,1,0],
                   'B': ['A','B','C','D']})
df

In [None]:
# return a Series
df.A

In [None]:
# boolean check to see which values in the Series are equal to a value
df.A == 1

In [None]:
# create a new DataFrame only with rows in which the condition was met
only_ones = df[df.A == 1]
only_ones

### Working with MultiIndex

In [None]:
# zip(*[[list of n length],[list of n length]]) converts two lists into paired tuples.
# list() makes everything into a list

# Here my goal is to make a MultiIndex DataFrame like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

#first I make a list of tuples
my_tuples = list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],
     ['one','two','one','two','one','two','one','two']]))
my_tuples

In [None]:
# make the multi index object using .from_tuples method
my_multi_index = pd.MultiIndex.from_tuples(my_tuples, names=['first','second'])
my_multi_index

In [None]:
# make a DataFrame from the MultiIndex object
# give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

In [None]:
# slice the MultiIndexed DataFrame
df = df[:4]
df

In [None]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

In [None]:
# and of course you can unstack it
# note, this is done to multiple levels with a number inside the parens
stacked.unstack()

### Time Series

In [None]:
# create the date DateTimeIndex object using pandas date_range() function
rng = pd.date_range('1/1/2012', periods=10, freq='S')
rng

In [None]:
# create a Series object filled with random numbers between 0 and 500
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()

In [None]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc.head()

In [None]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern').head()

# Stopped doing tutorial at 'Categoricals' section