# 10 Minutes to Pandas

A notebook that works through the Pandas tutorial [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html). Once completed, it can be used as a sandbox for trying out other keyword arguments.


# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Object Creation

In [2]:
# Build a Series from a plain list; the NaN entry makes the dtype float64
# and the index defaults to 0..3.

s = pd.Series(data=[7, 8, 5, np.nan])
s

0    7.0
1    8.0
2    5.0
3    NaN
dtype: float64

In [3]:
# Build a 6x4 frame of standard-normal draws indexed by six consecutive days.
# The draws are unseeded, so the values differ on every run.

dates = pd.date_range(start='20160520', periods=6)
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-05-20,-1.59534,1.282374,1.08735,-0.281389
2016-05-21,1.907173,1.207056,1.055957,-0.576032
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922
2016-05-23,-1.785685,0.151009,0.089607,0.575147
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576
2016-05-25,-0.61333,-0.355182,0.892233,0.030227


In [4]:
# Build a DataFrame from a mapping of column name -> values.
# The scalars ('A', 'B', 'F') are repeated on every row, while the
# four-element array-likes ('C', 'D', 'E') supply one value per row.
kv = dict(
    A=2.,
    B=pd.Timestamp('20160605'),
    C=s,
    D=np.array([4, 3, 5, 2], dtype='int32'),
    E=pd.Categorical(['foo', 'bar', 'fizz', 'buzz']),
    F='lorem ipsum'
)
dict_df = pd.DataFrame(kv)
dict_df

Unnamed: 0,A,B,C,D,E,F
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [5]:
# Get data types
dict_df.dtypes

A           float64
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [6]:
# Head of dataframe

dict_df.head()

Unnamed: 0,A,B,C,D,E,F
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [7]:
# Tail of dataframe

dict_df.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [8]:
# Display columns

dict_df.columns

Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')

In [9]:
# Display data

dict_df.values

array([[2.0, Timestamp('2016-06-05 00:00:00'), 7.0, 4, 'foo', 'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), 8.0, 3, 'bar', 'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), 5.0, 5, 'fizz',
        'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), nan, 2, 'buzz',
        'lorem ipsum']], dtype=object)

In [10]:
# Display row indices

dict_df.index

RangeIndex(start=0, stop=4, step=1)

In [11]:
# Describe

dict_df.describe()



Unnamed: 0,A,C,D
count,4.0,3.0,4.0
mean,2.0,6.666667,3.5
std,0.0,1.527525,1.290994
min,2.0,5.0,2.0
25%,2.0,,2.75
50%,2.0,,3.5
75%,2.0,,4.25
max,2.0,8.0,5.0


In [12]:
# Transpose

dict_df.T

Unnamed: 0,0,1,2,3
A,2,2,2,2
B,2016-06-05 00:00:00,2016-06-05 00:00:00,2016-06-05 00:00:00,2016-06-05 00:00:00
C,7,8,5,
D,4,3,5,2
E,foo,bar,fizz,buzz
F,lorem ipsum,lorem ipsum,lorem ipsum,lorem ipsum


In [13]:
# Sort by index labels: axis=0 orders the rows by their row labels (descending here);
# axis=1 would instead order the columns by column name

dict_df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,2.0,2016-06-05,,2,buzz,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum


In [14]:
# Sort rows by the values in column 'D' (ascending by default)
# sort_values was added in pandas 0.17; older versions use 'sort'

dict_df.sort_values(by='D')

Unnamed: 0,A,B,C,D,E,F
3,2.0,2016-06-05,,2,buzz,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum


# Selection

## Getting

In [15]:
# Selecting a single column by name yields a Series
dict_df['A']

0    2.0
1    2.0
2    2.0
3    2.0
Name: A, dtype: float64

In [16]:
# Attribute access (dict_df.A) returns the same column as dict_df['A'],
# so the element-wise comparison is True on every row

dict_df['A']==dict_df.A

0    True
1    True
2    True
3    True
Name: A, dtype: bool

In [17]:
# Row slicing

df[0:2]

Unnamed: 0,A,B,C,D
2016-05-20,-1.59534,1.282374,1.08735,-0.281389
2016-05-21,1.907173,1.207056,1.055957,-0.576032


In [18]:
# Slice rows by date label; both end points are included
df['20160520':'20160523']

Unnamed: 0,A,B,C,D
2016-05-20,-1.59534,1.282374,1.08735,-0.281389
2016-05-21,1.907173,1.207056,1.055957,-0.576032
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922
2016-05-23,-1.785685,0.151009,0.089607,0.575147


In [19]:
# Robust to dates that precede first recorded date in the dataframe

df['20130228':'20160520']

Unnamed: 0,A,B,C,D
2016-05-20,-1.59534,1.282374,1.08735,-0.281389


## Selection by Label

In [20]:
# .loc selects by label; here it returns the row whose index label is the first date.
# pd.Timestamp is the public constructor; the pd.tslib module path is a deprecated
# internal location that is absent from current pandas releases.
firstdate = pd.Timestamp('20160520')
df.loc[firstdate]

A   -1.595340
B    1.282374
C    1.087350
D   -0.281389
Name: 2016-05-20 00:00:00, dtype: float64

In [21]:
# Select two axes by label
df.loc[:,['D','C']]

Unnamed: 0,D,C
2016-05-20,-0.281389,1.08735
2016-05-21,-0.576032,1.055957
2016-05-22,1.491922,-0.214031
2016-05-23,0.575147,0.089607
2016-05-24,1.814576,-2.028828
2016-05-25,0.030227,0.892233


In [22]:
# Both endpoints are included in date slicing

df.loc['20160522':'20160524',['D','C']]

Unnamed: 0,D,C
2016-05-22,1.491922,-0.214031
2016-05-23,0.575147,0.089607
2016-05-24,1.814576,-2.028828


In [23]:
# Return scalar

df.loc[firstdate,'C']

1.0873499922531713

In [24]:
# Return a scalar via .at, the scalar-only label accessor: it takes exactly one row
# label and one column label, so it skips the slice/list/mask handling that .loc
# performs. Use .loc when selecting more than a single cell.

df.at[firstdate,'C']

1.0873499922531713

## Selection by Position

In [25]:
# By int position
df.iloc[2]

A   -0.035068
B   -1.632605
C   -0.214031
D    1.491922
Name: 2016-05-22 00:00:00, dtype: float64

In [26]:
# Slice by integer position as [rows, columns]; end points are excluded,
# so this selects rows 0-2 and columns 1-2
df.iloc[0:3,1:3]

Unnamed: 0,B,C
2016-05-20,1.282374,1.08735
2016-05-21,1.207056,1.055957
2016-05-22,-1.632605,-0.214031


In [27]:
# Using lists

df.iloc[[1,3,4],2:4]

Unnamed: 0,C,D
2016-05-21,1.055957,-0.576032
2016-05-23,0.089607,0.575147
2016-05-24,-2.028828,1.814576


In [28]:
# Row slicing

df.iloc[2:5,:]

Unnamed: 0,A,B,C,D
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922
2016-05-23,-1.785685,0.151009,0.089607,0.575147
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576


In [29]:
# Column slicing

df.iloc[:,0:3]

Unnamed: 0,A,B,C
2016-05-20,-1.59534,1.282374,1.08735
2016-05-21,1.907173,1.207056,1.055957
2016-05-22,-0.035068,-1.632605,-0.214031
2016-05-23,-1.785685,0.151009,0.089607
2016-05-24,-0.677438,-0.775961,-2.028828
2016-05-25,-0.61333,-0.355182,0.892233


In [30]:
# Value access

df.iloc[0,0]

-1.5953396784942171

In [31]:
# ... and assignment
df.iloc[0,0]=50
df

Unnamed: 0,A,B,C,D
2016-05-20,50.0,1.282374,1.08735,-0.281389
2016-05-21,1.907173,1.207056,1.055957,-0.576032
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922
2016-05-23,-1.785685,0.151009,0.089607,0.575147
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576
2016-05-25,-0.61333,-0.355182,0.892233,0.030227


In [32]:
# Fast access

df.iat[0,0]

50.0

## Boolean Indexing

In [33]:
# Keep only the rows where column A is positive

df[df.A > 0]

Unnamed: 0,A,B,C,D
2016-05-20,50.0,1.282374,1.08735,-0.281389
2016-05-21,1.907173,1.207056,1.055957,-0.576032


In [34]:
# Indexing by an element-wise condition keeps the frame's shape and puts NaN
# wherever the condition is False; no cell equals 5 here, so every entry is NaN

df[df == 5]

Unnamed: 0,A,B,C,D
2016-05-20,,,,
2016-05-21,,,,
2016-05-22,,,,
2016-05-23,,,,
2016-05-24,,,,
2016-05-25,,,,


In [35]:
# Filtering with isin()
df['E'] = [1,4,3,2,5,7]
df[df['E'].isin([1,4])]

Unnamed: 0,A,B,C,D,E
2016-05-20,50.0,1.282374,1.08735,-0.281389,1
2016-05-21,1.907173,1.207056,1.055957,-0.576032,4


In [36]:
# Filtering without isin()

df[(df.E == 1) | (df.E == 4)]

Unnamed: 0,A,B,C,D,E
2016-05-20,50.0,1.282374,1.08735,-0.281389,1
2016-05-21,1.907173,1.207056,1.055957,-0.576032,4


## Setting

In [37]:
# Adding a new column from a Series. The Series is aligned to df by index label,
# so s1 is built on the same six dates that df is indexed by.

s1 = pd.Series([3, 5, 3, 4, 2, 1], index=pd.date_range('20160520', periods=6))
df['F'] = s1
df

Unnamed: 0,A,B,C,D,E,F
2016-05-20,50.0,1.282374,1.08735,-0.281389,1,3
2016-05-21,1.907173,1.207056,1.055957,-0.576032,4,5
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922,3,3
2016-05-23,-1.785685,0.151009,0.089607,0.575147,2,4
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576,5,2
2016-05-25,-0.61333,-0.355182,0.892233,0.030227,7,1


In [38]:
# Setting by label
df.at[firstdate,'D']=0
df

Unnamed: 0,A,B,C,D,E,F
2016-05-20,50.0,1.282374,1.08735,0.0,1,3
2016-05-21,1.907173,1.207056,1.055957,-0.576032,4,5
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922,3,3
2016-05-23,-1.785685,0.151009,0.089607,0.575147,2,4
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576,5,2
2016-05-25,-0.61333,-0.355182,0.892233,0.030227,7,1


In [39]:
# Setting by position
df.iat[1,3]=50
df

Unnamed: 0,A,B,C,D,E,F
2016-05-20,50.0,1.282374,1.08735,0.0,1,3
2016-05-21,1.907173,1.207056,1.055957,50.0,4,5
2016-05-22,-0.035068,-1.632605,-0.214031,1.491922,3,3
2016-05-23,-1.785685,0.151009,0.089607,0.575147,2,4
2016-05-24,-0.677438,-0.775961,-2.028828,1.814576,5,2
2016-05-25,-0.61333,-0.355182,0.892233,0.030227,7,1


In [40]:
# Assign using numpy array

df.loc[:,'D']=np.array([4,2,3,1,1,1])
df

Unnamed: 0,A,B,C,D,E,F
2016-05-20,50.0,1.282374,1.08735,4,1,3
2016-05-21,1.907173,1.207056,1.055957,2,4,5
2016-05-22,-0.035068,-1.632605,-0.214031,3,3,3
2016-05-23,-1.785685,0.151009,0.089607,1,2,4
2016-05-24,-0.677438,-0.775961,-2.028828,1,5,2
2016-05-25,-0.61333,-0.355182,0.892233,1,7,1


In [41]:
# Assign using a regular list?

df.loc[:,'D']=[5,5,5,2,2,2]
df

# Yes!

Unnamed: 0,A,B,C,D,E,F
2016-05-20,50.0,1.282374,1.08735,5,1,3
2016-05-21,1.907173,1.207056,1.055957,5,4,5
2016-05-22,-0.035068,-1.632605,-0.214031,5,3,3
2016-05-23,-1.785685,0.151009,0.089607,2,2,4
2016-05-24,-0.677438,-0.775961,-2.028828,2,5,2
2016-05-25,-0.61333,-0.355182,0.892233,2,7,1


In [42]:
# Threshold the frame with a 'where' operation: keep the negative entries and
# replace everything else with NaN. where() returns a new frame, so df itself
# is left untouched.

df2 = df.where(df < 0)
df2

Unnamed: 0,A,B,C,D,E,F
2016-05-20,,,,,,
2016-05-21,,,,,,
2016-05-22,-0.035068,-1.632605,-0.214031,,,
2016-05-23,-1.785685,,,,,
2016-05-24,-0.677438,-0.775961,-2.028828,,,
2016-05-25,-0.61333,-0.355182,,,,


In [43]:
# Convert all the NaNs to zeros.

df2.fillna(0)

Unnamed: 0,A,B,C,D,E,F
2016-05-20,0.0,0.0,0.0,0.0,0.0,0.0
2016-05-21,0.0,0.0,0.0,0.0,0.0,0.0
2016-05-22,-0.035068,-1.632605,-0.214031,0.0,0.0,0.0
2016-05-23,-1.785685,0.0,0.0,0.0,0.0,0.0
2016-05-24,-0.677438,-0.775961,-2.028828,0.0,0.0,0.0
2016-05-25,-0.61333,-0.355182,0.0,0.0,0.0,0.0


## Missing Data

In [48]:
# Reindexing returns a new frame: keep the first four dates and add a column 'G',
# which has no data yet and therefore starts out as all NaN

df1 = df.reindex(dates[0:4], columns=list(df.columns) + ['G'])
# Label slices include both end points, so only the 2nd and 3rd rows are set
df1.loc[dates[1]:dates[2],'G'] = 1
df1

Unnamed: 0,A,B,C,D,E,F,G
2016-05-20,50.0,1.282374,1.08735,5,1,3,
2016-05-21,1.907173,1.207056,1.055957,5,4,5,1.0
2016-05-22,-0.035068,-1.632605,-0.214031,5,3,3,1.0
2016-05-23,-1.785685,0.151009,0.089607,2,2,4,


In [49]:
# Drop every row that contains at least one missing value (returns a new frame)
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E,F,G
2016-05-21,1.907173,1.207056,1.055957,5,4,5,1.0
2016-05-22,-0.035068,-1.632605,-0.214031,5,3,3,1.0


In [50]:
# Replace missing values with 100 (returns a new frame; df1 keeps its NaNs)
df1.fillna(value=100)

Unnamed: 0,A,B,C,D,E,F,G
2016-05-20,50.0,1.282374,1.08735,5,1,3,100.0
2016-05-21,1.907173,1.207056,1.055957,5,4,5,1.0
2016-05-22,-0.035068,-1.632605,-0.214031,5,3,3,1.0
2016-05-23,-1.785685,0.151009,0.089607,2,2,4,100.0


In [51]:
# Boolean mask marking which entries of df1 are missing
df1.isnull()

Unnamed: 0,A,B,C,D,E,F,G
2016-05-20,False,False,False,False,False,False,True
2016-05-21,False,False,False,False,False,False,False
2016-05-22,False,False,False,False,False,False,False
2016-05-23,False,False,False,False,False,False,True
