## Intro to Pandas
+ Object Creation
+ Viewing data
+ Selection
+ Missing data
+ Grouping
+ Reshaping
+ Time series
+ Plotting
+ i/o

pandas.pydata.org


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [6]:
demo_2d = np.arange(24)
print(type(demo_2d))
print(demo_2d.shape)
demo_2d

<class 'numpy.ndarray'>
(24,)


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [7]:
demo_2d = np.arange(24).reshape(-1,2)
print(type(demo_2d))
print(demo_2d.shape)
demo_2d

<class 'numpy.ndarray'>
(12, 2)


array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15],
       [16, 17],
       [18, 19],
       [20, 21],
       [22, 23]])

### Q: Why is -1 the same as 12?

In [8]:
pd.DataFrame(demo_2d)   # note row and col indexes are auto-generated

Unnamed: 0,0,1
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11
6,12,13
7,14,15
8,16,17
9,18,19


In [9]:
pd.date_range?

In [11]:
# creates a DateTime index
dates = pd.date_range('20140101', periods=6)
dates

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
list('ABCD')

['A', 'B', 'C', 'D']

In [13]:
# generates 6x4 dataframe with random numbers
# assigns dates (derined above) as indexes
df = pd.DataFrame(np.random.randn(6,4), 
                  index=dates, 
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2014-01-01,0.037203,1.448711,1.279602,-0.014906
2014-01-02,0.243059,0.890902,-0.331032,-2.085118
2014-01-03,-0.896884,-0.271741,0.457335,1.484917
2014-01-04,1.23744,1.748171,-0.380199,-0.050542
2014-01-05,-0.097982,-0.913569,1.004484,0.099598
2014-01-06,1.458151,-0.618865,-0.387347,-0.419083


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2014-01-01 to 2014-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [16]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.330165,0.380601,0.273807,-0.164189
std,0.880667,1.128793,0.74965,1.147107
min,-0.896884,-0.913569,-0.387347,-2.085118
25%,-0.064186,-0.532084,-0.367907,-0.326948
50%,0.140131,0.30958,0.063151,-0.032724
75%,0.988845,1.309259,0.867697,0.070972
max,1.458151,1.748171,1.279602,1.484917


In [17]:
df.describe

<bound method NDFrame.describe of                    A         B         C         D
2014-01-01  0.037203  1.448711  1.279602 -0.014906
2014-01-02  0.243059  0.890902 -0.331032 -2.085118
2014-01-03 -0.896884 -0.271741  0.457335  1.484917
2014-01-04  1.237440  1.748171 -0.380199 -0.050542
2014-01-05 -0.097982 -0.913569  1.004484  0.099598
2014-01-06  1.458151 -0.618865 -0.387347 -0.419083>

In [18]:
# index, columns, underlying numpy data
df

Unnamed: 0,A,B,C,D
2014-01-01,0.037203,1.448711,1.279602,-0.014906
2014-01-02,0.243059,0.890902,-0.331032,-2.085118
2014-01-03,-0.896884,-0.271741,0.457335,1.484917
2014-01-04,1.23744,1.748171,-0.380199,-0.050542
2014-01-05,-0.097982,-0.913569,1.004484,0.099598
2014-01-06,1.458151,-0.618865,-0.387347,-0.419083


In [19]:
df.transpose()

Unnamed: 0,2014-01-01 00:00:00,2014-01-02 00:00:00,2014-01-03 00:00:00,2014-01-04 00:00:00,2014-01-05 00:00:00,2014-01-06 00:00:00
A,0.037203,0.243059,-0.896884,1.23744,-0.097982,1.458151
B,1.448711,0.890902,-0.271741,1.748171,-0.913569,-0.618865
C,1.279602,-0.331032,0.457335,-0.380199,1.004484,-0.387347
D,-0.014906,-2.085118,1.484917,-0.050542,0.099598,-0.419083


In [21]:
# shorthand for transpose
df.T

Unnamed: 0,2014-01-01 00:00:00,2014-01-02 00:00:00,2014-01-03 00:00:00,2014-01-04 00:00:00,2014-01-05 00:00:00,2014-01-06 00:00:00
A,0.037203,0.243059,-0.896884,1.23744,-0.097982,1.458151
B,1.448711,0.890902,-0.271741,1.748171,-0.913569,-0.618865
C,1.279602,-0.331032,0.457335,-0.380199,1.004484,-0.387347
D,-0.014906,-2.085118,1.484917,-0.050542,0.099598,-0.419083


In [24]:
df.index

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06'],
              dtype='datetime64[ns]', freq='D')

In [27]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [28]:
df.values

array([[ 0.03720268,  1.44871125,  1.27960228, -0.01490582],
       [ 0.24305909,  0.89090212, -0.3310319 , -2.08511764],
       [-0.89688385, -0.27174126,  0.45733455,  1.48491711],
       [ 1.23743988,  1.74817057, -0.38019862, -0.05054186],
       [-0.0979819 , -0.91356948,  1.00448428,  0.09959807],
       [ 1.45815149, -0.61886513, -0.38734744, -0.41908314]])

### New Example

In [29]:
pd.Series(1, 
          index=list(range(4)), 
          dtype='float32')

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [39]:
df2 = pd.DataFrame(
    { 'A' : 1.,
      'B' : pd.Timestamp('20130102'),
      'C' : pd.Series(1, index=list(range(4)), dtype='float32'),
      'D' : np.array([3] * 4, dtype='int32'),
      'E' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E
0,1.0,2013-01-02,1.0,3,foo
1,1.0,2013-01-02,1.0,3,foo
2,1.0,2013-01-02,1.0,3,foo
3,1.0,2013-01-02,1.0,3,foo


In [36]:
# with specific dtypes
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

### Viewing Data

In [41]:
df.head(1)

Unnamed: 0,A,B,C,D
2014-01-01,0.037203,1.448711,1.279602,-0.014906


In [42]:
df.tail(2)

Unnamed: 0,A,B,C,D
2014-01-05,-0.097982,-0.913569,1.004484,0.099598
2014-01-06,1.458151,-0.618865,-0.387347,-0.419083


In [43]:
df.index

DatetimeIndex(['2014-01-01', '2014-01-02', '2014-01-03', '2014-01-04',
               '2014-01-05', '2014-01-06'],
              dtype='datetime64[ns]', freq='D')

In [46]:
# Note: columns are also of type 'Index'
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### Quick summary statistics

In [45]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.330165,0.380601,0.273807,-0.164189
std,0.880667,1.128793,0.74965,1.147107
min,-0.896884,-0.913569,-0.387347,-2.085118
25%,-0.064186,-0.532084,-0.367907,-0.326948
50%,0.140131,0.30958,0.063151,-0.032724
75%,0.988845,1.309259,0.867697,0.070972
max,1.458151,1.748171,1.279602,1.484917


In [53]:
percentiles = np.arange(0, 1, 0.1)
print(type(percentiles))
percentiles

<class 'numpy.ndarray'>


array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [54]:
# convert to list
percentiles=list(percentiles)
print(type(percentiles))
percentiles

<class 'list'>


[0.0,
 0.1,
 0.2,
 0.30000000000000004,
 0.4,
 0.5,
 0.6000000000000001,
 0.7000000000000001,
 0.8,
 0.9]

In [55]:
percentiles = np.arange(0, 1, 0.1)
df.describe(percentiles=list(percentiles))  # instead of 25%, 50%, 75% => 10%, 20%, 30%, ,,,

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.330165,0.380601,0.273807,-0.164189
std,0.880667,1.128793,0.74965,1.147107
min,-0.896884,-0.913569,-0.387347,-2.085118
0%,-0.896884,-0.913569,-0.387347,-2.085118
10%,-0.497433,-0.766217,-0.383773,-1.2521
20%,-0.097982,-0.618865,-0.380199,-0.419083
30.0%,-0.03039,-0.445303,-0.355615,-0.234813
40%,0.037203,-0.271741,-0.331032,-0.050542
50%,0.140131,0.30958,0.063151,-0.032724


In [56]:
df.sort_values(by=['B', 'D'], ascending=False)

Unnamed: 0,A,B,C,D
2014-01-04,1.23744,1.748171,-0.380199,-0.050542
2014-01-01,0.037203,1.448711,1.279602,-0.014906
2014-01-02,0.243059,0.890902,-0.331032,-2.085118
2014-01-03,-0.896884,-0.271741,0.457335,1.484917
2014-01-06,1.458151,-0.618865,-0.387347,-0.419083
2014-01-05,-0.097982,-0.913569,1.004484,0.099598


### Selection