# 10 Minutes to Pandas Tutorial
From pandas [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) Tutorial.

In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### pandas Series
A Series in Pandas is a "One-dimensional ndarray with axis labels (including time series)"

In [99]:
# create a pandas Series
# here labels are 0 to 5
# notice it is default type float64
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [100]:
# other pandas functions, like date_range(), build the underlying array for you
# date_range() returns a filled-in DatetimeIndex object
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### pandas Data Frame

Basically, this tutorial is all about pandas dataframes. They can be thought of as "a dict-like container for Series objects."

In [101]:
# pandas DataFrame() object creates a dataframe for me
# numpy's np.random.randn(n,m) returns an ndarray of shape n x m with random values from standard normal distribution
# dates becomes the index (rows) of the dataframe
# column labels generated via Python list() function

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [102]:
# reorder the columns
df[['D','C','B','A']]

Unnamed: 0,D,C,B,A
2013-01-01,0.251435,-0.362747,0.583195,1.134056
2013-01-02,0.093,-0.978983,0.271105,1.039313
2013-01-03,0.005967,-0.265295,0.37577,0.242209
2013-01-04,-1.392337,0.502278,-0.825912,-1.662372
2013-01-05,-0.765169,-0.641277,0.394504,0.674477
2013-01-06,0.522565,-1.454142,0.380369,0.331899


In [103]:
# put order back
df[['A','B','C','D']]

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [104]:
# Build a dataframe from a Python dict: each key becomes a column label.
# Every value must either hold 4 items or be a single item (scalar),
# in which case the value is copied onto all 4 rows.
row_labels = list('ABCD')
column_data = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    # the index of the Series in C supplies the row labels of the dataframe
    'C': pd.Series(1, index=row_labels, dtype='float32'),
    # D is a numpy array with the number 3 given 4 times
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
    'G': [1, 2, 3, 4],
}
df2 = pd.DataFrame(column_data)
df2

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [105]:
# show the data types of each dict item in the dataframe
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G             int64
dtype: object

In [106]:
# see first n rows
df2.head(n=3)

Unnamed: 0,A,B,C,D,E,F,G
A,1.0,2013-01-02,1.0,3,test,foo,1
B,1.0,2013-01-02,1.0,3,train,foo,2
C,1.0,2013-01-02,1.0,3,test,foo,3


In [107]:
# see bottom n rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,G
C,1.0,2013-01-02,1.0,3,test,foo,3
D,1.0,2013-01-02,1.0,3,train,foo,4


In [108]:
# return the index (row labels), which is an immutable pandas Index object
df2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [109]:
# display the column labels, which are also an immutable pandas Index object
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')

In [110]:
# back to the df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [111]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.293264,0.196505,-0.533361,-0.21409
std,1.02347,0.510994,0.668069,0.720286
min,-1.662372,-0.825912,-1.454142,-1.392337
25%,0.264632,0.297271,-0.894557,-0.572385
50%,0.503188,0.37807,-0.502012,0.049484
75%,0.948104,0.39097,-0.289658,0.211827
max,1.134056,0.583195,0.502278,0.522565


In [112]:
# transpose the data
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.134056,1.039313,0.242209,-1.662372,0.674477,0.331899
B,0.583195,0.271105,0.37577,-0.825912,0.394504,0.380369
C,-0.362747,-0.978983,-0.265295,0.502278,-0.641277,-1.454142
D,0.251435,0.093,0.005967,-1.392337,-0.765169,0.522565


In [113]:
# sort by an axis (by rows or by columns)
df.sort_index(axis=1, ascending=False)
# axis=1 sorts the column labels, axis=0 sorts the row (index) labels
# ascending=False sorts in descending order

Unnamed: 0,D,C,B,A
2013-01-01,0.251435,-0.362747,0.583195,1.134056
2013-01-02,0.093,-0.978983,0.271105,1.039313
2013-01-03,0.005967,-0.265295,0.37577,0.242209
2013-01-04,-1.392337,0.502278,-0.825912,-1.662372
2013-01-05,-0.765169,-0.641277,0.394504,0.674477
2013-01-06,0.522565,-1.454142,0.380369,0.331899


In [114]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-06,0.331899,0.380369,-1.454142,0.522565
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-01,1.134056,0.583195,-0.362747,0.251435


In [115]:
# Python / Numpy expressions for selecting and setting are fine, but 
# for production code use pandas data access methods:
# .at, .iat, .loc and .iloc (.ix is deprecated and was removed in pandas 1.0)

In [116]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [117]:
# Selecting a single column, which yields a Series, equivalent to df.A
# both are a pandas Series
df['A']

2013-01-01    1.134056
2013-01-02    1.039313
2013-01-03    0.242209
2013-01-04   -1.662372
2013-01-05    0.674477
2013-01-06    0.331899
Freq: D, Name: A, dtype: float64

In [118]:
df.A

2013-01-01    1.134056
2013-01-02    1.039313
2013-01-03    0.242209
2013-01-04   -1.662372
2013-01-05    0.674477
2013-01-06    0.331899
Freq: D, Name: A, dtype: float64

In [119]:
# select via [] which slices at indices
# 0th (inclusive) to the 2nd (exclusive) row (so, row 0 and row 1)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093


In [120]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093


In [121]:
# selection by label, returns a pandas Series
df.loc[dates[0]]

A    1.134056
B    0.583195
C   -0.362747
D    0.251435
Name: 2013-01-01 00:00:00, dtype: float64

In [122]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [123]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [124]:
# returns first row from the df dataframe, a pandas Series
df.loc[dates[0]]

A    1.134056
B    0.583195
C   -0.362747
D    0.251435
Name: 2013-01-01 00:00:00, dtype: float64

In [125]:
# here the ':' indicates all rows, and the list with ['A','B'] indicate columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,1.134056,0.583195
2013-01-02,1.039313,0.271105
2013-01-03,0.242209,0.37577
2013-01-04,-1.662372,-0.825912
2013-01-05,0.674477,0.394504
2013-01-06,0.331899,0.380369


In [126]:
# same exact selection but using index labels
# note, index (rows) selection is inclusive
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,1.134056,0.583195
2013-01-02,1.039313,0.271105
2013-01-03,0.242209,0.37577
2013-01-04,-1.662372,-0.825912
2013-01-05,0.674477,0.394504
2013-01-06,0.331899,0.380369


In [127]:
# just from a single row and columns A and B returns a pandas Series
df.loc['20130102', ['A','B']]

A    1.039313
B    0.271105
Name: 2013-01-02 00:00:00, dtype: float64

In [128]:
# get a single value (scalar)
df.loc[dates[0],'A']

1.1340562369008351

In [129]:
# same exact value but using a specific label
df.loc['20130101','A']

1.1340562369008351

In [130]:
# .at is same as above
df.at[dates[0],'A']

1.1340562369008351

In [131]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [132]:
# show the row at integer position 3 (the 4th row, since positions start at 0)
df.iloc[3]

A   -1.662372
B   -0.825912
C    0.502278
D   -1.392337
Name: 2013-01-04 00:00:00, dtype: float64

In [133]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-1.662372,-0.825912
2013-01-05,0.674477,0.394504


In [134]:
# by lists of integer position locations
# shows the rows at positions 1, 2 and 4, and the columns at positions 0 and 2
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.039313,-0.978983
2013-01-03,0.242209,-0.265295
2013-01-05,0.674477,-0.641277


In [135]:
# for slicing columns explicitly
# all rows, columns from position 1 (inclusive) to position 3 (exclusive)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,0.583195,-0.362747
2013-01-02,0.271105,-0.978983
2013-01-03,0.37577,-0.265295
2013-01-04,-0.825912,0.502278
2013-01-05,0.394504,-0.641277
2013-01-06,0.380369,-1.454142


In [136]:
# explicitly get a value, returns a float64
df.iloc[1,2]

-0.97898298232812964

In [137]:
# fast access to a scalar (same kind of positional lookup as .iloc above, here row 1, column 1), returns a float64
df.iat[1,1]

0.2711047802515591

In [138]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [139]:
# some boolean indexing
# "keep only the rows where the value in column A is greater than 0"
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [140]:
# "anywhere where greater than 0"
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,,0.251435
2013-01-02,1.039313,0.271105,,0.093
2013-01-03,0.242209,0.37577,,0.005967
2013-01-04,,,0.502278,
2013-01-05,0.674477,0.394504,,
2013-01-06,0.331899,0.380369,,0.522565


In [141]:
# filter through the dataframe using isin()

# make a copy of the dataframe called df2
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.134056,0.583195,-0.362747,0.251435,one
2013-01-02,1.039313,0.271105,-0.978983,0.093,one
2013-01-03,0.242209,0.37577,-0.265295,0.005967,two
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337,three
2013-01-05,0.674477,0.394504,-0.641277,-0.765169,four
2013-01-06,0.331899,0.380369,-1.454142,0.522565,three


In [142]:
# referencing column E, select only rows with certain keywords that match 'one' or 'four'
df2[df2['E'].isin(['one', 'four'])]

# !!!I'd like to know how the slice is made here. Question
# posted at: http://stackoverflow.com/questions/41733696/how-does-pandas-use-a-series-object-to-slice-a-data-frame

Unnamed: 0,A,B,C,D,E
2013-01-01,1.134056,0.583195,-0.362747,0.251435,one
2013-01-02,1.039313,0.271105,-0.978983,0.093,one
2013-01-05,0.674477,0.394504,-0.641277,-0.765169,four


In [143]:
# the contents of df2[] below are a pandas Series of type bool
# wherever it is True, the corresponding label's row is returned
df2['E'].isin(['one','four'])

2013-01-01     True
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool

##### Setting values in a Data Frame

In [144]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.134056,0.583195,-0.362747,0.251435
2013-01-02,1.039313,0.271105,-0.978983,0.093
2013-01-03,0.242209,0.37577,-0.265295,0.005967
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337
2013-01-05,0.674477,0.394504,-0.641277,-0.765169
2013-01-06,0.331899,0.380369,-1.454142,0.522565


In [145]:
# make a new Series that I can add to the dataframe
# !!! REMEMBER, a dataframe can be thought of as a dict of Series
# when a Series is assigned as a column, it is aligned to the dataframe's existing index labels
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [146]:
# now add it to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is NaN
# 2013-01-07 is dropped
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.134056,0.583195,-0.362747,0.251435,
2013-01-02,1.039313,0.271105,-0.978983,0.093,1.0
2013-01-03,0.242209,0.37577,-0.265295,0.005967,2.0
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337,3.0
2013-01-05,0.674477,0.394504,-0.641277,-0.765169,4.0
2013-01-06,0.331899,0.380369,-1.454142,0.522565,5.0


In [147]:
# now set values by label index and column label
df.at[dates[0], 'F'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.134056,0.583195,-0.362747,0.251435,0.0
2013-01-02,1.039313,0.271105,-0.978983,0.093,1.0
2013-01-03,0.242209,0.37577,-0.265295,0.005967,2.0
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337,3.0
2013-01-05,0.674477,0.394504,-0.641277,-0.765169,4.0
2013-01-06,0.331899,0.380369,-1.454142,0.522565,5.0


In [148]:
# set values by position index
df.iat[0,0] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,0.251435,0.0
2013-01-02,1.039313,0.271105,-0.978983,0.093,1.0
2013-01-03,0.242209,0.37577,-0.265295,0.005967,2.0
2013-01-04,-1.662372,-0.825912,0.502278,-1.392337,3.0
2013-01-05,0.674477,0.394504,-0.641277,-0.765169,4.0
2013-01-06,0.331899,0.380369,-1.454142,0.522565,5.0


In [149]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0
2013-01-05,0.674477,0.394504,-0.641277,5,4.0
2013-01-06,0.331899,0.380369,-1.454142,5,5.0


In [150]:
# conduct a where operation to replace any value greater
# than zero with its negation (same magnitude, opposite sign)
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.583195,-0.362747,-5,0.0
2013-01-02,-1.039313,-0.271105,-0.978983,-5,-1.0
2013-01-03,-0.242209,-0.37577,-0.265295,-5,-2.0
2013-01-04,-1.662372,-0.825912,-0.502278,-5,-3.0
2013-01-05,-0.674477,-0.394504,-0.641277,-5,-4.0
2013-01-06,-0.331899,-0.380369,-1.454142,-5,-5.0


##### Missing Data

In [151]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0
2013-01-05,0.674477,0.394504,-0.641277,5,4.0
2013-01-06,0.331899,0.380369,-1.454142,5,5.0


In [152]:
# reindex allow you to change/add/delete the index of a specified axis

# reindexes to only include the rows at positions 0, 1, 2 and 3 of the index
# uses same columns but adds one called 'E' that initially has NaN in all
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.583195,-0.362747,5,0.0,
2013-01-02,1.039313,0.271105,-0.978983,5,1.0,
2013-01-03,0.242209,0.37577,-0.265295,5,2.0,
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0,


In [153]:
# change the values of column E in the rows at positions 0 and 1 to 1.0
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice endpoint on rows is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.583195,-0.362747,5,0.0,1.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0,
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0,


In [154]:
# drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.583195,-0.362747,5,0.0,1.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0,1.0


In [155]:
# fill in missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.583195,-0.362747,5,0.0,1.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0,5.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0,5.0


In [156]:
# get boolean mask where values are nan
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


##### Operations

In [157]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0
2013-01-05,0.674477,0.394504,-0.641277,5,4.0
2013-01-06,0.331899,0.380369,-1.454142,5,5.0


In [158]:
# the mean of each column
df.mean()

A    0.104255
B    0.196505
C   -0.533361
D    5.000000
F    2.500000
dtype: float64

In [159]:
# mean across rows
df.mean(1)

2013-01-01    1.044090
2013-01-02    1.266287
2013-01-03    1.470537
2013-01-04    1.202799
2013-01-05    1.885541
2013-01-06    1.851625
Freq: D, dtype: float64

In [160]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [161]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [162]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0
2013-01-05,0.674477,0.394504,-0.641277,5,4.0
2013-01-06,0.331899,0.380369,-1.454142,5,5.0


In [163]:
# subtracts the values in Series s from every column of df,
# matching them up by index (row) label
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-0.416805,-1.362747,4.0,-1.0
2013-01-02,-1.960687,-2.728895,-3.978983,2.0,-2.0
2013-01-03,-4.757791,-4.62423,-5.265295,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-5.325523,-5.605496,-6.641277,-1.0,-2.0
2013-01-06,-7.668101,-7.619631,-9.454142,-3.0,-3.0


In [164]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.271105,-0.978983,5,1.0
2013-01-03,0.242209,0.37577,-0.265295,5,2.0
2013-01-04,-1.662372,-0.825912,0.502278,5,3.0
2013-01-05,0.674477,0.394504,-0.641277,5,4.0
2013-01-06,0.331899,0.380369,-1.454142,5,5.0


In [165]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.583195,-0.362747,5,0.0
2013-01-02,1.039313,0.8543,-1.34173,10,1.0
2013-01-03,1.281523,1.23007,-1.607025,15,3.0
2013-01-04,-0.380849,0.404158,-1.104746,20,6.0
2013-01-05,0.293628,0.798662,-1.746023,25,10.0
2013-01-06,0.625527,1.17903,-3.200165,30,15.0


In [166]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

0    6
1    2
2    1
3    0
4    1
5    2
6    3
7    4
8    3
9    2
dtype: int64

In [167]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

2    3
3    2
1    2
6    1
4    1
0    1
dtype: int64

##### String Methods

In [168]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

##### Merge

In [169]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,2.354649,-1.547589,-0.483099,-0.542639
1,-0.647923,1.353381,-1.045847,-1.182717
2,1.000625,0.459642,0.487795,-0.878343
3,0.800032,-1.22468,-0.026757,-0.741876
4,-0.78235,-2.157844,0.156882,-0.752975
5,-1.292551,0.112704,0.116512,0.929721
6,-1.01679,-0.413457,-1.026239,2.103214
7,0.567829,-0.857907,0.274123,0.151765
8,0.221618,-0.570034,0.873421,-0.865566
9,-0.566478,0.537711,1.156914,1.612913


In [170]:
# break it into pieces via Python slicing - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,2.354649,-1.547589,-0.483099,-0.542639
1,-0.647923,1.353381,-1.045847,-1.182717
2,1.000625,0.459642,0.487795,-0.878343
3,0.800032,-1.22468,-0.026757,-0.741876
4,-0.78235,-2.157844,0.156882,-0.752975
5,-1.292551,0.112704,0.116512,0.929721
6,-1.01679,-0.413457,-1.026239,2.103214
7,0.567829,-0.857907,0.274123,0.151765
8,0.221618,-0.570034,0.873421,-0.865566
9,-0.566478,0.537711,1.156914,1.612913


##### Join

In [171]:
# SQL style merges
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval':[1,2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [172]:
right

Unnamed: 0,key,rval
0,foo,3
1,foo,4


In [173]:
# on='key' matches a value with others given the same key
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,foo,1,4
2,foo,2,3
3,foo,2,4


In [174]:
# OR...
left = pd.DataFrame({'key':['foo', 'bar'], 'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'], 'rval':[3,4]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [175]:
right

Unnamed: 0,key,rval
0,foo,3
1,bar,4


In [176]:
# here each key appears only once on each side, so every row has exactly one match:
# 'foo' pairs lval 1 with rval 3, and 'bar' pairs lval 2 with rval 4
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,3
1,bar,2,4


##### Append rows to a dataframe

In [177]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray,
# which prints like a list of 8 lists with 4 items in each
# contents will be randomly generated floats from the standard normal (z) distribution
print(np.random.randn(8, 4))

[[ 0.36766002 -0.23609819  1.08832203  0.91136673]
 [ 0.89645291 -0.63449463 -0.39480051  0.35227924]
 [-0.89629686  1.85862908 -1.68030911 -0.70832531]
 [ 0.21088853  0.54215428 -0.58993911  0.24373631]
 [ 0.03903496  1.61468461 -0.30965602 -0.27457555]
 [ 0.29494487  0.56438524  1.28698097  1.00864596]
 [ 0.07535351 -0.6965786   0.71674706 -0.35239393]
 [-0.0312235   0.52892386 -1.70994012 -0.13855686]]


In [178]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,2.243061,1.377183,-1.516951,0.420813
1,0.905176,-1.505662,0.084437,0.038139
2,0.055388,-0.072788,-0.939363,0.340169
3,0.210321,-0.277973,1.32969,-0.285486
4,-0.592334,-1.323727,-1.148889,-1.151923
5,0.105723,-1.211078,-0.699572,-1.000774
6,-1.331925,-0.05075,-0.941621,-0.356076
7,0.349731,-0.855498,-0.575056,-0.104365


In [179]:
# retrieve the row at integer position 3
s = df.iloc[3]
s

A    0.210321
B   -0.277973
C    1.329690
D   -0.285486
Name: 3, dtype: float64

In [180]:
# copy the Series into a new row appended at the bottom of the dataframe
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is turned into a one-row dataframe and joined on with pd.concat()
# ignore_index=True renumbers the rows instead of keeping the Series' name as label
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,2.243061,1.377183,-1.516951,0.420813
1,0.905176,-1.505662,0.084437,0.038139
2,0.055388,-0.072788,-0.939363,0.340169
3,0.210321,-0.277973,1.32969,-0.285486
4,-0.592334,-1.323727,-1.148889,-1.151923
5,0.105723,-1.211078,-0.699572,-1.000774
6,-1.331925,-0.05075,-0.941621,-0.356076
7,0.349731,-0.855498,-0.575056,-0.104365
8,0.210321,-0.277973,1.32969,-0.285486


##### Grouping

In [181]:
# create new dataframe
df = pd.DataFrame({'A': ['foo','bar','foo','bar','foo','bar','foo','foo'],
                  'B': ['one','one','two','three','two','two','one','three'],
                  'C': np.random.randn(8),
                  'D': np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.639881,-0.169539
1,bar,one,0.969369,0.70671
2,foo,two,-1.17617,0.218652
3,bar,three,1.005545,0.698175
4,foo,two,-1.708607,-0.034317
5,bar,two,1.241071,-0.033484
6,foo,one,-1.407505,1.24492
7,foo,three,1.224055,-1.404028


In [182]:
# group rows by the unique values in column 'A', then sum each group to create new values
# the numeric columns C and D are selected explicitly: since pandas 2.0, sum() no longer
# silently drops the string column B and would concatenate its values instead
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,3.215985,1.371401
foo,-3.708109,-0.144311


In [183]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.969369,0.70671
bar,three,1.005545,0.698175
bar,two,1.241071,-0.033484
foo,one,-2.047386,1.075381
foo,three,1.224055,-1.404028
foo,two,-2.884778,0.184336


##### Selecting a subset of rows with a boolean mask (boolean indexing)

In [184]:
df = pd.DataFrame({'A':[0,1,1,0],
                   'B': ['A','B','C','D']})
df

Unnamed: 0,A,B
0,0,A
1,1,B
2,1,C
3,0,D


In [185]:
# create a Series
df.A

0    0
1    1
2    1
3    0
Name: A, dtype: int64

In [186]:
# create a series of bool that matches a condition
df.A == 1

0    False
1     True
2     True
3    False
Name: A, dtype: bool

In [187]:
# create dataframe with a condition met
only_ones = df[df.A == 1]
only_ones

Unnamed: 0,A,B
1,1,B
2,1,C


##### Reshaping

In [188]:
# zip(*[[list of length n],[list of length n]]) pairs up the items of the two lists into n tuples.
# list() collects those tuples into a list

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

# first I make a list of tuples, one (first, second) pair per index entry
first_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_labels = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
tuples = list(zip(*[first_labels, second_labels]))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [189]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [190]:
# now I make a dataframe with random numbers and the multi index
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406
foo,one,0.561558,0.421839
foo,two,-1.044868,-1.674899
qux,one,-1.7621,0.058622
qux,two,0.272999,-0.620191


In [191]:
# I can slice it too if I want
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406


In [192]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.057274
               B    1.373607
       two     A   -0.728370
               B   -0.070684
baz    one     A   -0.256057
               B   -0.318328
       two     A   -0.578671
               B   -2.307406
dtype: float64

In [193]:
# and of course you can unstack it
# note, unstack() accepts a level (a number or a name) inside the parens to choose
# which index level to unstack; by default it is the last level
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.057274,1.373607
bar,two,-0.72837,-0.070684
baz,one,-0.256057,-0.318328
baz,two,-0.578671,-2.307406


##### Time Series

In [201]:
# create the DatetimeIndex object using pandas date_range() function
# 100 timestamps spaced one second apart, starting at midnight on 2012-01-01
# the lowercase 's' alias is used because the uppercase 'S' alias is
# deprecated since pandas 2.2
rng = pd.date_range('1/1/2012', periods=100, freq='s')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [202]:
# create a Series object filled with random numbers between 0 and 500
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()

2012-01-01 00:00:00    409
2012-01-01 00:00:01    459
2012-01-01 00:00:02    414
2012-01-01 00:00:03    170
2012-01-01 00:00:04    455
Freq: S, dtype: int64

In [203]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc.head()

2012-01-01 00:00:00+00:00    409
2012-01-01 00:00:01+00:00    459
2012-01-01 00:00:02+00:00    414
2012-01-01 00:00:03+00:00    170
2012-01-01 00:00:04+00:00    455
Freq: S, dtype: int64

In [204]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern').head()

2011-12-31 19:00:00-05:00    409
2011-12-31 19:00:01-05:00    459
2011-12-31 19:00:02-05:00    414
2011-12-31 19:00:03-05:00    170
2011-12-31 19:00:04-05:00    455
Freq: S, dtype: int64

# Stopped doing tutorial at 'Categoricals'