In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# create a Series from a plain list; the np.nan entry marks a missing value
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# date_range builds an immutable DatetimeIndex: six consecutive days from 2013-01-01
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# the dates become the row labels (index); list('ABCD') -> ['A', 'B', 'C', 'D'] gives the column labels
# note: np.random.randn(6,4) draws a 6x4 array of samples from the standard normal distribution
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [6]:
# build a DataFrame from a dict of column values: the scalars ('A', 'B', 'F')
# are repeated on every row, while the array-like values ('C', 'D', 'E')
# all have length 4
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# show the data types
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
# see top number of rows
df2.head(n=2)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo


In [9]:
# see bottom number of rows
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
# display the index (row labels)
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [11]:
# display columns
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [12]:
# show a quick statistical summary of the data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.151875,-0.329889,-0.372748,0.40268
std,0.996744,1.000478,0.531146,0.835715
min,-1.529953,-1.39022,-1.322253,-0.745599
25%,-0.925017,-1.028072,-0.481258,0.031228
50%,0.145516,-0.487959,-0.296,0.317226
75%,0.64357,0.091997,-0.050033,0.725664
max,0.799942,1.309599,0.18882,1.729523


In [13]:
# transpose the data
# original data
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [14]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.189493,-1.529953,0.422622,0.717219,-0.131589,0.799942
B,-1.39022,-0.047242,1.309599,-0.928676,-1.061204,0.13841
C,-1.322253,-0.013585,-0.432622,-0.497471,-0.159379,0.18882
D,0.253678,1.729523,0.840627,-0.745599,0.380774,-0.042922


In [15]:
# sort by an axis (here: by the column labels, in descending order)
df.sort_index(axis=1, ascending=False)
# axis=0 sorts by the row labels, ascending or descending (ascending=True or ascending=False)
# axis=1 sorts by the column labels

Unnamed: 0,D,C,B,A
2013-01-01,0.253678,-1.322253,-1.39022,-1.189493
2013-01-02,1.729523,-0.013585,-0.047242,-1.529953
2013-01-03,0.840627,-0.432622,1.309599,0.422622
2013-01-04,-0.745599,-0.497471,-0.928676,0.717219
2013-01-05,0.380774,-0.159379,-1.061204,-0.131589
2013-01-06,-0.042922,0.18882,0.13841,0.799942


In [16]:
# sort by values of a particular column
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-06,0.799942,0.13841,0.18882,-0.042922
2013-01-03,0.422622,1.309599,-0.432622,0.840627


In [17]:
# Python / Numpy expressions for selecting and setting are fine, but
# for production code use the pandas data access methods:
# .at, .iat, .loc and .iloc (.ix was deprecated in pandas 0.20 and removed in 1.0)

In [18]:
# show original df dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [19]:
# Selecting a single column, which yields a Series, equivalent to df.A
df['A']

2013-01-01   -1.189493
2013-01-02   -1.529953
2013-01-03    0.422622
2013-01-04    0.717219
2013-01-05   -0.131589
2013-01-06    0.799942
Freq: D, Name: A, dtype: float64

In [20]:
df.A

2013-01-01   -1.189493
2013-01-02   -1.529953
2013-01-03    0.422622
2013-01-04    0.717219
2013-01-05   -0.131589
2013-01-06    0.799942
Freq: D, Name: A, dtype: float64

In [21]:
# select via [] which slices at indices
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523


In [22]:
# slice using index keywords
# note endpoint is included
df['20130101':'20130102']

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523


In [23]:
# selection by label
df.loc[dates[0]]

A   -1.189493
B   -1.390220
C   -1.322253
D    0.253678
Name: 2013-01-01 00:00:00, dtype: float64

In [24]:
# show original set again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [25]:
# remember, that the index was made via the variable 'dates'
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [26]:
# get a cross section with the first label in dates
df.loc[dates[0]]

A   -1.189493
B   -1.390220
C   -1.322253
D    0.253678
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
# select all the rows and just columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.189493,-1.39022
2013-01-02,-1.529953,-0.047242
2013-01-03,0.422622,1.309599
2013-01-04,0.717219,-0.928676
2013-01-05,-0.131589,-1.061204
2013-01-06,0.799942,0.13841


In [28]:
# same selection but using index labels
df.loc['20130101':'20130106',['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.189493,-1.39022
2013-01-02,-1.529953,-0.047242
2013-01-03,0.422622,1.309599
2013-01-04,0.717219,-0.928676
2013-01-05,-0.131589,-1.061204
2013-01-06,0.799942,0.13841


In [29]:
# just from a single row and columns A and B
df.loc['20130102', ['A','B']]

A   -1.529953
B   -0.047242
Name: 2013-01-02 00:00:00, dtype: float64

In [30]:
# get a single value (scalar)
df.loc[dates[0],'A']

-1.1894929599149628

In [31]:
# note, can also use the label
df.loc['20130101','A']

-1.1894929599149628

In [32]:
# also, this is same as above two
df.at[dates[0],'A']

-1.1894929599149628

#### selection by Position

In [33]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [34]:
# show the row at integer position 3 (positions are zero-based, so this is the fourth row)
df.iloc[3]

A    0.717219
B   -0.928676
C   -0.497471
D   -0.745599
Name: 2013-01-04 00:00:00, dtype: float64

In [35]:
# slice rows and columns in numpy/python style
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.717219,-0.928676
2013-01-05,-0.131589,-1.061204


In [36]:
# select by lists of integer positions
# shows the rows at positions 1, 2 and 4 and the columns at positions 0 and 2
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-1.529953,-0.013585
2013-01-03,0.422622,-0.432622
2013-01-05,-0.131589,-0.159379


In [37]:
# slice columns explicitly by position
# all rows; columns from position 1 (inclusive) up to position 3 (exclusive)
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-1.39022,-1.322253
2013-01-02,-0.047242,-0.013585
2013-01-03,1.309599,-0.432622
2013-01-04,-0.928676,-0.497471
2013-01-05,-1.061204,-0.159379
2013-01-06,0.13841,0.18882


In [38]:
# explicitly get a value
df.iloc[1,1]

-0.047242397545546655

In [39]:
# fast access to a scalar(same effect as above method)
df.iat[1,1]

-0.047242397545546655

#### Boolean indexing

In [40]:
# show original dataframe
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [41]:
# using a single column's values to select data
# all rows where value in column A is greater than 0
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [42]:
# only cells greater than 0
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.253678
2013-01-02,,,,1.729523
2013-01-03,0.422622,1.309599,,0.840627
2013-01-04,0.717219,,,
2013-01-05,,,,0.380774
2013-01-06,0.799942,0.13841,0.18882,


In [43]:
# filter through the datatable using isin()

# make a copy of the dataframe
df2 = df.copy()
# add a new column
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
# select only rows with certain keywords that match
df2[df2['E'].isin(['one', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678,one
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523,one
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774,four


#### Setting

In [44]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523
2013-01-03,0.422622,1.309599,-0.432622,0.840627
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774
2013-01-06,0.799942,0.13841,0.18882,-0.042922


In [45]:
# build a Series that can be added to the dataframe as a new column
# !!! REMEMBER: a dataframe can be thought of as a dict of Series that share
# one index; a Series assigned as a column is lined up with that index by label
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [46]:
# now add it to df dataframe as column 'F'
# notice that column F didn't start from 20130101 so its value there is Nan
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.189493,-1.39022,-1.322253,0.253678,
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523,1.0
2013-01-03,0.422622,1.309599,-0.432622,0.840627,2.0
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774,4.0
2013-01-06,0.799942,0.13841,0.18882,-0.042922,5.0


In [47]:
# now set values by label index and column label
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-1.39022,-1.322253,0.253678,
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523,1.0
2013-01-03,0.422622,1.309599,-0.432622,0.840627,2.0
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774,4.0
2013-01-06,0.799942,0.13841,0.18882,-0.042922,5.0


In [48]:
# set values by position index
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,0.253678,
2013-01-02,-1.529953,-0.047242,-0.013585,1.729523,1.0
2013-01-03,0.422622,1.309599,-0.432622,0.840627,2.0
2013-01-04,0.717219,-0.928676,-0.497471,-0.745599,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,0.380774,4.0
2013-01-06,0.799942,0.13841,0.18882,-0.042922,5.0


In [49]:
# replaces all rows in column D with an ndarray of 5's of length of df dataframe
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,5,4.0
2013-01-06,0.799942,0.13841,0.18882,5,5.0


In [50]:
# conditional assignment with a boolean mask: replace every value greater than
# zero with its negation (done on a copy, so df itself is left untouched)
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,-5,
2013-01-02,-1.529953,-0.047242,-0.013585,-5,-1.0
2013-01-03,-0.422622,-1.309599,-0.432622,-5,-2.0
2013-01-04,-0.717219,-0.928676,-0.497471,-5,-3.0
2013-01-05,-0.131589,-1.061204,-0.159379,-5,-4.0
2013-01-06,-0.799942,-0.13841,-0.18882,-5,-5.0


#### Missing Data

In [51]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,5,4.0
2013-01-06,0.799942,0.13841,0.18882,5,5.0


In [52]:
# reindex lets you change/add/delete the labels along a specified axis

# keep only the first four index labels (the rows at positions 0 to 3)
# keep the same columns but add one called 'E', which is initially NaN in every row
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-1.322253,5,,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0,
2013-01-03,0.422622,1.309599,-0.432622,5,2.0,
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0,


In [53]:
# set column 'E' to 1 in the first two rows
df1.loc[dates[0]:dates[1], 'E'] = 1 # notice the endpoint of a label slice is inclusive
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-1.322253,5,,1.0
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0,
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0,


In [54]:
# drop any rows that have missing data
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0,1.0


In [55]:
# fill in missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-1.322253,5,5.0,1.0
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0,5.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0,5.0


In [56]:
# get boolean mask where values are nan
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### Operations

#### Stats

In [57]:
# show original dataframe
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,5,4.0
2013-01-06,0.799942,0.13841,0.18882,5,5.0


In [58]:
# the mean of each column
df.mean()

A    0.046373
B   -0.098186
C   -0.372748
D    5.000000
F    3.000000
dtype: float64

In [59]:
# mean of each row (axis=1 averages across the columns)
df.mean(axis=1)

2013-01-01    0.919437
2013-01-02    0.881844
2013-01-03    1.659920
2013-01-04    1.458215
2013-01-05    1.529566
2013-01-06    2.225434
Freq: D, dtype: float64

In [60]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates)
s

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [61]:
# shift all values down 2 (newly freed values are now NaN)
s.shift(2)

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [62]:
# show df dataframe again
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,5,4.0
2013-01-06,0.799942,0.13841,0.18882,5,5.0


In [63]:
# s is a Series indexed by the same dates as df. .sub with axis='index' lines s up
# with df on the row labels and subtracts it from every column (rows where s is NaN become NaN)
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,-1.0,-2.322253,4.0,
2013-01-02,-4.529953,-3.047242,-3.013585,2.0,-2.0
2013-01-03,-4.577378,-3.690401,-5.432622,0.0,-3.0
2013-01-04,,,,,
2013-01-05,-6.131589,-7.061204,-6.159379,-1.0,-2.0
2013-01-06,-7.200058,-7.86159,-7.81118,-3.0,-3.0


#### Apply

In [64]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-0.013585,5,1.0
2013-01-03,0.422622,1.309599,-0.432622,5,2.0
2013-01-04,0.717219,-0.928676,-0.497471,5,3.0
2013-01-05,-0.131589,-1.061204,-0.159379,5,4.0
2013-01-06,0.799942,0.13841,0.18882,5,5.0


In [65]:
# .apply applies a function along the input axis ('index' by default)
# np.cumsum is cumulative sum function from numpy, which adds up as it goes down
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.322253,5,
2013-01-02,-1.529953,-0.047242,-1.335837,10,1.0
2013-01-03,-1.107332,1.262356,-1.768459,15,3.0
2013-01-04,-0.390112,0.333681,-2.26593,20,6.0
2013-01-05,-0.521702,-0.727523,-2.425308,25,10.0
2013-01-06,0.27824,-0.589114,-2.236489,30,15.0


#### Histogramming

In [66]:
# numpy function np.random.randint(low inclusive, high exclusive, how many to make)
s = pd.Series(np.random.randint(0,7, size=10))
s

0    5
1    3
2    5
3    5
4    6
5    5
6    0
7    3
8    0
9    4
dtype: int64

In [67]:
# count frequency of each value
# value on left (x sub j), frequency on right
s.value_counts()

5    4
3    2
0    2
6    1
4    1
dtype: int64

#### String Methods

In [68]:
# Series can process strings. Uses regular expressions in many cases
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
# make all lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

#### Merge

In [69]:
# create a pandas dataframe with 10 x 4 random numbers (via numpy function)
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,0.258373,0.173749,0.308842,-0.880909
1,-1.043235,0.082614,-1.010609,0.552225
2,-0.337871,-0.307367,-0.120029,-1.178602
3,1.31701,-0.460084,-0.964466,-0.069482
4,-0.694961,2.107041,0.09154,0.448991
5,-0.597445,0.368627,0.516787,0.867636
6,1.828728,1.432103,0.647725,0.06328
7,0.451698,0.218438,-0.701768,0.189568
8,-2.316664,0.474668,-2.195772,-2.529044
9,-2.249526,-0.611861,0.116151,-0.410878


In [70]:
# break it into pieces - a list with slices
piece1 = df[:3]
piece2 = df[3:7]
piece3 = df[7:]

# throw the pieces into a list and add to the pd.concat function to 
# put it all together again
pd.concat([piece1, piece2, piece3])

Unnamed: 0,0,1,2,3
0,0.258373,0.173749,0.308842,-0.880909
1,-1.043235,0.082614,-1.010609,0.552225
2,-0.337871,-0.307367,-0.120029,-1.178602
3,1.31701,-0.460084,-0.964466,-0.069482
4,-0.694961,2.107041,0.09154,0.448991
5,-0.597445,0.368627,0.516787,0.867636
6,1.828728,1.432103,0.647725,0.06328
7,0.451698,0.218438,-0.701768,0.189568
8,-2.316664,0.474668,-2.195772,-2.529044
9,-2.249526,-0.611861,0.116151,-0.410878


#### Join

In [71]:
# SQL-style merges: two small frames that share a 'key' column (same key on every row)
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [72]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [73]:
# on='key' pairs each left row with every right row that has the same key
# so 1 merges with 4 and 5, and then 2 merges with 4 and 5 as well
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [74]:
# OR... the same merge setup, but with a different key on each row
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [75]:
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [76]:
# here, because there is only one matching 'foo', 1 merges with 4
# and 2 merges with 5
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


#### Append rows to a dataframe

In [77]:
# first remember that np.random.randn(8, 4) will create an 8x4 ndarray
# (8 rows with 4 values in each)
# contents will be randomly generated floats from the standard normal distribution
print(np.random.randn(8, 4))

[[-1.20714896 -1.14787762  1.52423189  0.40803505]
 [-0.5025024  -0.05242337  0.24856337 -0.10083979]
 [ 2.42772351  0.01565429  0.05720924 -0.29944175]
 [ 1.3619791   0.0195566  -0.03021882 -0.73249338]
 [ 0.32047919  0.33208083  0.55475544  0.62749437]
 [ 0.26082506 -2.4320144   0.47788609  0.54282325]
 [ 1.3950106  -1.93048357  0.67579633 -0.90666284]
 [-1.6392893   0.71432532 -1.49714045 -1.13197772]]


In [78]:
# create the dataframe
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,-2.288712,-1.436783,-0.464511,0.384231
1,0.666102,0.499951,1.790957,0.768327
2,-1.214533,0.853775,-0.769805,0.857925
3,-0.028744,0.696811,-1.042793,0.456059
4,-0.775098,-1.92273,0.621376,0.098636
5,-0.203385,-1.641765,-0.029536,0.85681
6,-0.380972,-0.329888,1.297703,-0.108151
7,-1.377262,0.502247,0.010635,1.044921


In [79]:
# retrieve the row at position 3 (the fourth row) as a Series
s = df.iloc[3]
s

A   -0.028744
B    0.696811
C   -1.042793
D    0.456059
Name: 3, dtype: float64

In [80]:
# copy the row into a new row appended at the end of the dataframe
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# Series is turned into a one-row frame (to_frame().T) and concatenated instead;
# ignore_index=True renumbers the rows of the result from 0
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-2.288712,-1.436783,-0.464511,0.384231
1,0.666102,0.499951,1.790957,0.768327
2,-1.214533,0.853775,-0.769805,0.857925
3,-0.028744,0.696811,-1.042793,0.456059
4,-0.775098,-1.92273,0.621376,0.098636
5,-0.203385,-1.641765,-0.029536,0.85681
6,-0.380972,-0.329888,1.297703,-0.108151
7,-1.377262,0.502247,0.010635,1.044921
8,-0.028744,0.696811,-1.042793,0.456059


#### Grouping

In [81]:
# create a new dataframe: two columns of string labels ('A', 'B') and
# two columns of random standard-normal values ('C', 'D')
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

Unnamed: 0,A,B,C,D
0,foo,one,0.132018,0.396835
1,bar,one,2.605155,-0.369765
2,foo,two,0.445866,2.778172
3,bar,three,-0.812548,1.173325
4,foo,two,0.908384,1.777824
5,bar,two,-1.072309,2.458881
6,foo,one,-0.468584,-0.246135
7,foo,three,-0.230137,0.934048


In [82]:
# group rows by the unique values in column 'A', then sum the numeric columns
# ('C' and 'D') within each group. The columns are selected explicitly because
# pandas >= 2.0 no longer silently drops the string column 'B' when summing
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.720298,3.262442
foo,0.787549,5.640744


In [83]:
# group hierarchically, then sum up results
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.605155,-0.369765
bar,three,-0.812548,1.173325
bar,two,-1.072309,2.458881
foo,one,-0.336565,0.1507
foo,three,-0.230137,0.934048
foo,two,1.354251,4.555996


### Reshaping

In [84]:
# zip(list_a, list_b) -- or equivalently zip(*[list_a, list_b]) -- pairs up the items of two lists of length n, position by position, as tuples.
# list() collects those tuples into a list

# Here my goal is to make a stack like this:
"""
                     A         B
first second                    
bar   one     0.029399 -0.542108
      two     0.282696 -0.087302
baz   one    -1.575170  1.771208
      two     0.816482  1.100230
"""

# first, build the list of (first, second) label tuples
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [85]:
# now I make the multi index object
my_multi_index = pd.MultiIndex.from_tuples(tuples, names=['first','second'])
my_multi_index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [86]:
# now I make a dataframe with random numbers and the multi index as its row labels
# I also give the columns of the dataframe labels
df = pd.DataFrame(np.random.randn(8,2), index=my_multi_index, columns=['A','B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.412608,0.127117
bar,two,0.65078,-0.743324
baz,one,1.308483,-1.156507
baz,two,0.770438,0.16886
foo,one,-0.152692,-1.159813
foo,two,-0.327089,0.536114
qux,one,0.765618,-0.431563
qux,two,1.327563,-0.410191


In [87]:
# I can slice it too if I want
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.412608,0.127117
bar,two,0.65078,-0.743324
baz,one,1.308483,-1.156507
baz,two,0.770438,0.16886


In [88]:
# stack() method "compresses" a level in the DataFrame's columns
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.412608
               B    0.127117
       two     A    0.650780
               B   -0.743324
baz    one     A    1.308483
               B   -1.156507
       two     A    0.770438
               B    0.168860
dtype: float64

In [90]:
# and of course you can unstack it
# note: by default unstack() moves the last index level back into the columns;
# pass a level number or name (e.g. unstack(0)) to unstack a different level
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.412608,0.127117
bar,two,0.65078,-0.743324
baz,one,1.308483,-1.156507
baz,two,0.770438,0.16886


#### Pivot tables

In [96]:
# create my dataframe using a dictionary; the short label lists are repeated
# so that every column ends up with 12 rows
df = pd.DataFrame(dict(
    A=['one', 'one', 'two', 'three'] * 3,
    B=['A', 'B', 'C'] * 4,
    C=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    D=list(range(1, 13)),
    E=list(range(13, 25)),
))
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1,13
1,one,B,foo,2,14
2,two,C,foo,3,15
3,three,A,bar,4,16
4,one,B,bar,5,17
5,one,C,bar,6,18
6,two,A,foo,7,19
7,three,B,foo,8,20
8,one,C,foo,9,21
9,one,A,bar,10,22


In [97]:
# make the pivot table
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,10.0,1.0
one,B,5.0,2.0
one,C,6.0,9.0
three,A,4.0,
three,B,,8.0
three,C,12.0,
two,A,,7.0
two,B,11.0,
two,C,,3.0


#### Time Series

In [99]:
# create the DatetimeIndex object using pandas date_range() function:
# 100 timestamps spaced one second apart, starting at midnight on 2012-01-01
# (lower-case 's' is the seconds alias; upper-case 'S' is deprecated since pandas 2.2)
rng = pd.date_range('1/1/2012', periods=100, freq='s')
rng

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07',
               '2012-01-01 00:00:08', '2012-01-01 00:00:09',
               '2012-01-01 00:00:10', '2012-01-01 00:00:11',
               '2012-01-01 00:00:12', '2012-01-01 00:00:13',
               '2012-01-01 00:00:14', '2012-01-01 00:00:15',
               '2012-01-01 00:00:16', '2012-01-01 00:00:17',
               '2012-01-01 00:00:18', '2012-01-01 00:00:19',
               '2012-01-01 00:00:20', '2012-01-01 00:00:21',
               '2012-01-01 00:00:22', '2012-01-01 00:00:23',
               '2012-01-01 00:00:24', '2012-01-01 00:00:25',
               '2012-01-01 00:00:26', '2012-01-01 00:00:27',
               '2012-01-01 00:00:28', '2012-01-01 00:00:29',
               '2012-01-01 00:00:30', '2012-01-01 00:00:31',
               '2012-01-

In [100]:
# create a Series object filled with random integers from 0 (inclusive) to 500 (exclusive)
# index is rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

2012-01-01 00:00:00    216
2012-01-01 00:00:01      0
2012-01-01 00:00:02    423
2012-01-01 00:00:03    251
2012-01-01 00:00:04    405
2012-01-01 00:00:05    371
2012-01-01 00:00:06    417
2012-01-01 00:00:07    273
2012-01-01 00:00:08    432
2012-01-01 00:00:09    444
2012-01-01 00:00:10    210
2012-01-01 00:00:11    124
2012-01-01 00:00:12    328
2012-01-01 00:00:13    382
2012-01-01 00:00:14    328
2012-01-01 00:00:15    387
2012-01-01 00:00:16    182
2012-01-01 00:00:17    151
2012-01-01 00:00:18    296
2012-01-01 00:00:19     52
2012-01-01 00:00:20    473
2012-01-01 00:00:21    339
2012-01-01 00:00:22     29
2012-01-01 00:00:23    138
2012-01-01 00:00:24    212
2012-01-01 00:00:25    383
2012-01-01 00:00:26    153
2012-01-01 00:00:27     95
2012-01-01 00:00:28    394
2012-01-01 00:00:29    371
                      ... 
2012-01-01 00:01:10    449
2012-01-01 00:01:11     71
2012-01-01 00:01:12    168
2012-01-01 00:01:13    474
2012-01-01 00:01:14    482
2012-01-01 00:01:15    259
2

In [101]:
# localize to timezone
ts_utc = ts.tz_localize('UTC')
ts_utc

2012-01-01 00:00:00+00:00    216
2012-01-01 00:00:01+00:00      0
2012-01-01 00:00:02+00:00    423
2012-01-01 00:00:03+00:00    251
2012-01-01 00:00:04+00:00    405
2012-01-01 00:00:05+00:00    371
2012-01-01 00:00:06+00:00    417
2012-01-01 00:00:07+00:00    273
2012-01-01 00:00:08+00:00    432
2012-01-01 00:00:09+00:00    444
2012-01-01 00:00:10+00:00    210
2012-01-01 00:00:11+00:00    124
2012-01-01 00:00:12+00:00    328
2012-01-01 00:00:13+00:00    382
2012-01-01 00:00:14+00:00    328
2012-01-01 00:00:15+00:00    387
2012-01-01 00:00:16+00:00    182
2012-01-01 00:00:17+00:00    151
2012-01-01 00:00:18+00:00    296
2012-01-01 00:00:19+00:00     52
2012-01-01 00:00:20+00:00    473
2012-01-01 00:00:21+00:00    339
2012-01-01 00:00:22+00:00     29
2012-01-01 00:00:23+00:00    138
2012-01-01 00:00:24+00:00    212
2012-01-01 00:00:25+00:00    383
2012-01-01 00:00:26+00:00    153
2012-01-01 00:00:27+00:00     95
2012-01-01 00:00:28+00:00    394
2012-01-01 00:00:29+00:00    371
          

In [102]:
# convert to another timezone
ts_utc.tz_convert('US/Eastern')

2011-12-31 19:00:00-05:00    216
2011-12-31 19:00:01-05:00      0
2011-12-31 19:00:02-05:00    423
2011-12-31 19:00:03-05:00    251
2011-12-31 19:00:04-05:00    405
2011-12-31 19:00:05-05:00    371
2011-12-31 19:00:06-05:00    417
2011-12-31 19:00:07-05:00    273
2011-12-31 19:00:08-05:00    432
2011-12-31 19:00:09-05:00    444
2011-12-31 19:00:10-05:00    210
2011-12-31 19:00:11-05:00    124
2011-12-31 19:00:12-05:00    328
2011-12-31 19:00:13-05:00    382
2011-12-31 19:00:14-05:00    328
2011-12-31 19:00:15-05:00    387
2011-12-31 19:00:16-05:00    182
2011-12-31 19:00:17-05:00    151
2011-12-31 19:00:18-05:00    296
2011-12-31 19:00:19-05:00     52
2011-12-31 19:00:20-05:00    473
2011-12-31 19:00:21-05:00    339
2011-12-31 19:00:22-05:00     29
2011-12-31 19:00:23-05:00    138
2011-12-31 19:00:24-05:00    212
2011-12-31 19:00:25-05:00    383
2011-12-31 19:00:26-05:00    153
2011-12-31 19:00:27-05:00     95
2011-12-31 19:00:28-05:00    394
2011-12-31 19:00:29-05:00    371
          

In [104]:
# make into month only

2012-01-01 00:00:00    216
2012-01-01 00:00:01      0
2012-01-01 00:00:02    423
2012-01-01 00:00:03    251
2012-01-01 00:00:04    405
2012-01-01 00:00:05    371
2012-01-01 00:00:06    417
2012-01-01 00:00:07    273
2012-01-01 00:00:08    432
2012-01-01 00:00:09    444
2012-01-01 00:00:10    210
2012-01-01 00:00:11    124
2012-01-01 00:00:12    328
2012-01-01 00:00:13    382
2012-01-01 00:00:14    328
2012-01-01 00:00:15    387
2012-01-01 00:00:16    182
2012-01-01 00:00:17    151
2012-01-01 00:00:18    296
2012-01-01 00:00:19     52
2012-01-01 00:00:20    473
2012-01-01 00:00:21    339
2012-01-01 00:00:22     29
2012-01-01 00:00:23    138
2012-01-01 00:00:24    212
2012-01-01 00:00:25    383
2012-01-01 00:00:26    153
2012-01-01 00:00:27     95
2012-01-01 00:00:28    394
2012-01-01 00:00:29    371
                      ... 
2012-01-01 00:01:10    449
2012-01-01 00:01:11     71
2012-01-01 00:01:12    168
2012-01-01 00:01:13    474
2012-01-01 00:01:14    482
2012-01-01 00:01:15    259
2

#### Stopped at 'Categoricals'