# 10 Minutes to Pandas

This notebook works through the official [10 Minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) tutorial. Once completed, it can be used as a sandbox for trying out other keyword arguments.


# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Object Creation

In [2]:
# Build a Series from a plain list; the NaN entry makes the result float64.

s = pd.Series(data=[7, 8, 5, np.nan])
s

0    7.0
1    8.0
2    5.0
3    NaN
dtype: float64

In [3]:
# DataFrame creation: six daily rows (a DatetimeIndex) by four named columns.
# The values come from a locally seeded generator so that Restart & Run All
# reproduces the same frame; an unseeded np.random.randn gives new numbers on
# every run. Outputs saved from an unseeded run will differ until re-executed.

dates = pd.date_range('20160520', periods=6)
rng = np.random.RandomState(42)  # local generator; the global NumPy RNG is left untouched
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2016-05-20,0.499541,1.208421,1.521087,-0.3325
2016-05-21,0.590088,1.858737,0.1421,0.076725
2016-05-22,0.337256,-0.56415,0.455163,1.388999
2016-05-23,1.814928,-0.400173,-0.18403,0.85876
2016-05-24,-0.201217,-0.601466,0.565065,-0.421012
2016-05-25,0.253561,-0.416451,0.130984,-0.000867


In [4]:
# Build a DataFrame from a dict: each key becomes a column, and the scalar
# entries (A, B, F) are repeated to match the four-element array-like entries.
kv = {
    'A': 2.,
    'B': pd.Timestamp('20160605'),
    'C': s,
    'D': np.array([4, 3, 5, 2], dtype='int32'),
    'E': pd.Categorical(['foo', 'bar', 'fizz', 'buzz']),
    'F': 'lorem ipsum',
}
dict_df = pd.DataFrame(data=kv)
dict_df

Unnamed: 0,A,B,C,D,E,F
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [5]:
# Get data types: one dtype per column (each column can have its own dtype)
dict_df.dtypes

A           float64
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [6]:
# Head of dataframe: head() shows the first rows (5 by default; this frame only has 4)

dict_df.head()

Unnamed: 0,A,B,C,D,E,F
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [7]:
# Tail of dataframe: the argument is the number of trailing rows to show

dict_df.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
3,2.0,2016-06-05,,2,buzz,lorem ipsum


In [8]:
# Display columns: the column labels, returned as an Index object

dict_df.columns

Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')

In [9]:
# Display data: the underlying values as a NumPy array. Because the columns
# have different dtypes, the combined array has dtype=object.

dict_df.values

array([[2.0, Timestamp('2016-06-05 00:00:00'), 7.0, 4, 'foo', 'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), 8.0, 3, 'bar', 'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), 5.0, 5, 'fizz',
        'lorem ipsum'],
       [2.0, Timestamp('2016-06-05 00:00:00'), nan, 2, 'buzz',
        'lorem ipsum']], dtype=object)

In [10]:
# Display row indices (the row labels)

dict_df.index

RangeIndex(start=0, stop=4, step=1)

In [11]:
# Describe: summary statistics (count, mean, std, min, quartiles, max).
# Only the numeric columns A, C and D appear in the result.

dict_df.describe()



Unnamed: 0,A,C,D
count,4.0,3.0,4.0
mean,2.0,6.666667,3.5
std,0.0,1.527525,1.290994
min,2.0,5.0,2.0
25%,2.0,,2.75
50%,2.0,,3.5
75%,2.0,,4.25
max,2.0,8.0,5.0


In [12]:
# Transpose: swap rows and columns

dict_df.T

Unnamed: 0,0,1,2,3
A,2,2,2,2
B,2016-06-05 00:00:00,2016-06-05 00:00:00,2016-06-05 00:00:00,2016-06-05 00:00:00
C,7,8,5,
D,4,3,5,2
E,foo,bar,fizz,buzz
F,lorem ipsum,lorem ipsum,lorem ipsum,lorem ipsum


In [13]:
# Sort by index labels: axis=0 sorts the rows by their index (descending here);
# axis=1 would sort the columns by column name instead.

dict_df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,2.0,2016-06-05,,2,buzz,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum


In [14]:
# Sort rows by the values in column 'D' (ascending by default)
# sort_values was added in pandas 0.17; older versions use 'sort'

dict_df.sort_values(by='D')

Unnamed: 0,A,B,C,D,E,F
3,2.0,2016-06-05,,2,buzz,lorem ipsum
1,2.0,2016-06-05,8.0,3,bar,lorem ipsum
0,2.0,2016-06-05,7.0,4,foo,lorem ipsum
2,2.0,2016-06-05,5.0,5,fizz,lorem ipsum


# Selection

## Getting

In [15]:
# Select a single column by label; the result is a Series
dict_df['A']

0    2.0
1    2.0
2    2.0
3    2.0
Name: A, dtype: float64

In [16]:
# Attribute access (dict_df.A) selects the same column as dict_df['A'],
# so the element-wise comparison is True in every row.

dict_df['A']==dict_df.A

0    True
1    True
2    True
3    True
Name: A, dtype: bool

In [18]:
# Row slicing: an integer slice inside [] selects rows by position (end is exclusive)

df[0:2]

Unnamed: 0,A,B,C,D
2016-05-20,0.499541,1.208421,1.521087,-0.3325
2016-05-21,0.590088,1.858737,0.1421,0.076725


In [21]:
# Slice rows by date strings on the DatetimeIndex; both endpoints are included
df['20160520':'20160523']

Unnamed: 0,A,B,C,D
2016-05-20,0.499541,1.208421,1.521087,-0.3325
2016-05-21,0.590088,1.858737,0.1421,0.076725
2016-05-22,0.337256,-0.56415,0.455163,1.388999
2016-05-23,1.814928,-0.400173,-0.18403,0.85876


In [23]:
# Robust to dates that precede first recorded date in the dataframe:
# the slice simply returns the rows that fall inside the requested range.

df['20130228':'20160520']

Unnamed: 0,A,B,C,D
2016-05-20,0.499541,1.208421,1.521087,-0.3325


## Selection by Label

In [37]:
# .loc is for labels
# Build the row label with the public pd.Timestamp constructor: the private
# pd.tslib module was deprecated in pandas 0.20 and is gone from later releases.
firstdate = pd.Timestamp('20160520')
df.loc[firstdate]

A    0.499541
B    1.208421
C    1.521087
D   -0.332500
Name: 2016-05-20 00:00:00, dtype: float64

In [38]:
# Select two axes by label: every row (:), and columns D and C in the order listed
df.loc[:,['D','C']]

Unnamed: 0,D,C
2016-05-20,-0.3325,1.521087
2016-05-21,0.076725,0.1421
2016-05-22,1.388999,0.455163
2016-05-23,0.85876,-0.18403
2016-05-24,-0.421012,0.565065
2016-05-25,-0.000867,0.130984


In [42]:
# Both endpoints are included in date slicing
# (unlike positional slices, label slices with .loc keep the end label)

df.loc['20160522':'20160524',['D','C']]

Unnamed: 0,D,C
2016-05-22,1.388999,0.455163
2016-05-23,0.85876,-0.18403
2016-05-24,-0.421012,0.565065


In [43]:
# Return scalar: one row label plus one column label yields a single value

df.loc[firstdate,'C']

1.5210867830963324

In [44]:
# Return scalar, but quickly: .at only handles a single row/column label pair,
# so it avoids the overhead .loc spends working out whether it was given
# labels, slices, lists or boolean masks. Use .loc for general selection and
# .at when exactly one value is needed.

df.at[firstdate,'C']

1.5210867830963324

## Selection by Position

## Boolean Indexing

## Setting

In [48]:
# Adding a new column: the Series is aligned to df on its index labels,
# so s1 is built on the same six dates that df uses.
# (Only the last expression of a cell is rendered, so a bare `s1` placed
# before the assignment would never be shown.)

s1 = pd.Series([3, 5, 3, 4, 2, 1], index=pd.date_range('20160520', periods=6))
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2016-05-20,0.499541,1.208421,1.521087,-0.3325,3
2016-05-21,0.590088,1.858737,0.1421,0.076725,5
2016-05-22,0.337256,-0.56415,0.455163,1.388999,3
2016-05-23,1.814928,-0.400173,-0.18403,0.85876,4
2016-05-24,-0.201217,-0.601466,0.565065,-0.421012,2
2016-05-25,0.253561,-0.416451,0.130984,-0.000867,1


In [49]:
# Setting by label: .at addresses one cell by (row label, column label)
df.at[firstdate,'D']=0
df

Unnamed: 0,A,B,C,D,F
2016-05-20,0.499541,1.208421,1.521087,0.0,3
2016-05-21,0.590088,1.858737,0.1421,0.076725,5
2016-05-22,0.337256,-0.56415,0.455163,1.388999,3
2016-05-23,1.814928,-0.400173,-0.18403,0.85876,4
2016-05-24,-0.201217,-0.601466,0.565065,-0.421012,2
2016-05-25,0.253561,-0.416451,0.130984,-0.000867,1


In [50]:
# Setting by position: .iat addresses one cell by zero-based (row, column)
# integers, so [1,3] is the second row, fourth column (D)
df.iat[1,3]=50
df

Unnamed: 0,A,B,C,D,F
2016-05-20,0.499541,1.208421,1.521087,0.0,3
2016-05-21,0.590088,1.858737,0.1421,50.0,5
2016-05-22,0.337256,-0.56415,0.455163,1.388999,3
2016-05-23,1.814928,-0.400173,-0.18403,0.85876,4
2016-05-24,-0.201217,-0.601466,0.565065,-0.421012,2
2016-05-25,0.253561,-0.416451,0.130984,-0.000867,1


In [51]:
# Assign using numpy array: one value per row (6 values for 6 rows)
# NOTE(review): the saved output shows D as integers; newer pandas versions may
# write into the existing float column instead -- confirm on your version.

df.loc[:,'D']=np.array([4,2,3,1,1,1])
df

Unnamed: 0,A,B,C,D,F
2016-05-20,0.499541,1.208421,1.521087,4,3
2016-05-21,0.590088,1.858737,0.1421,2,5
2016-05-22,0.337256,-0.56415,0.455163,3,3
2016-05-23,1.814928,-0.400173,-0.18403,1,4
2016-05-24,-0.201217,-0.601466,0.565065,1,2
2016-05-25,0.253561,-0.416451,0.130984,1,1


In [52]:
# Assign using a regular list? A plain list with one value per row is tried here.

df.loc[:,'D']=[5,5,5,2,2,2]
df

# Yes!

Unnamed: 0,A,B,C,D,F
2016-05-20,0.499541,1.208421,1.521087,5,3
2016-05-21,0.590088,1.858737,0.1421,5,5
2016-05-22,0.337256,-0.56415,0.455163,5,3
2016-05-23,1.814928,-0.400173,-0.18403,2,4
2016-05-24,-0.201217,-0.601466,0.565065,2,2
2016-05-25,0.253561,-0.416451,0.130984,2,1


In [53]:
# Work on a copy so df itself is untouched, then keep only the negative
# entries: where(mask) keeps values where the mask is True and puts NaN
# everywhere else (a simple threshold).

df2 = df.copy()
df2 = df2.where(df2 < 0)
df2

Unnamed: 0,A,B,C,D,F
2016-05-20,,,,,
2016-05-21,,,,,
2016-05-22,,-0.56415,,,
2016-05-23,,-0.400173,-0.18403,,
2016-05-24,-0.201217,-0.601466,,,
2016-05-25,,-0.416451,,,


In [54]:
# Convert all the NaNs to zeros.
# fillna returns a new frame; the result is not assigned here, so df2 keeps its NaNs.

df2.fillna(0)

Unnamed: 0,A,B,C,D,F
2016-05-20,0.0,0.0,0.0,0.0,0.0
2016-05-21,0.0,0.0,0.0,0.0,0.0
2016-05-22,0.0,-0.56415,0.0,0.0,0.0
2016-05-23,0.0,-0.400173,-0.18403,0.0,0.0
2016-05-24,-0.201217,-0.601466,0.0,0.0,0.0
2016-05-25,0.0,-0.416451,0.0,0.0,0.0
