# [10 Minutes To Pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

## Create Series

In [5]:
# A Series built from a plain list gets a default integer index (0..5);
# the np.nan entry marks a missing value, so the values are stored as floats.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [6]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## Create DataFrame

Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns

In [9]:
# Six consecutive calendar days starting on 2013-01-01 (daily frequency).
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# Seed NumPy's global RNG so that Restart & Run All reproduces the same frame;
# without a seed, np.random.randn yields different values on every run and
# none of the outputs shown in this notebook can be reproduced.
# NOTE: the outputs stored in this notebook come from an earlier unseeded
# run -- re-execute the notebook to refresh them.
np.random.seed(0)

# 6 rows (one per date) x 4 columns of standard-normal samples.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-05,0.379824,0.642109,-0.649285,-0.40232
2013-01-06,-0.391827,-0.210352,0.790347,0.398298


Create a DataFrame by passing a dict of objects that can be converted to a series-like structure

In [16]:
# Each keyword becomes one column. The scalar values (A, B, F) are repeated
# for all four rows, whose count is set by the array-like columns (C, D, E).
df2 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


The columns of the resulting DataFrame have specific dtypes

In [18]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

 See the [Basics section](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics)
 
 See the top & bottom rows of the frame

In [21]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-05,0.379824,0.642109,-0.649285,-0.40232


In [22]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-05,0.379824,0.642109,-0.649285,-0.40232
2013-01-06,-0.391827,-0.210352,0.790347,0.398298


In [23]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [24]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [25]:
df.values

array([[-0.77508258, -1.61101655,  0.23360112,  0.08727285],
       [ 0.30049781, -1.5021867 , -0.0039957 , -1.46674481],
       [ 0.10420817, -0.14037552,  0.75022229,  1.30861148],
       [-0.69295499, -2.72572986,  1.5793829 ,  0.94587426],
       [ 0.37982445,  0.6421091 , -0.64928549, -0.40232032],
       [-0.39182674, -0.21035184,  0.79034736,  0.39829793]])

`describe()` shows a quick statistical summary of your data

In [26]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.179222,-0.924592,0.450045,0.145165
std,0.507379,1.235199,0.767182,0.996008
min,-0.775083,-2.72573,-0.649285,-1.466745
25%,-0.617673,-1.583809,0.055404,-0.279922
50%,-0.143809,-0.856269,0.491912,0.242785
75%,0.251425,-0.15787,0.780316,0.80898
max,0.379824,0.642109,1.579383,1.308611


In [27]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.775083,0.300498,0.104208,-0.692955,0.379824,-0.391827
B,-1.611017,-1.502187,-0.140376,-2.72573,0.642109,-0.210352
C,0.233601,-0.003996,0.750222,1.579383,-0.649285,0.790347
D,0.087273,-1.466745,1.308611,0.945874,-0.40232,0.398298


In [30]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-05,0.379824,0.642109,-0.649285,-0.40232
2013-01-06,-0.391827,-0.210352,0.790347,0.398298


Sorting by an axis

In [36]:
# axis=0 sorts the rows by their index labels (in this example, the dates);
# axis=1 would instead sort the columns by their names (A, B, C, D).
# ascending=False reverses the order, so the latest date comes first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.391827,-0.210352,0.790347,0.398298
2013-01-05,0.379824,0.642109,-0.649285,-0.40232
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-01,-0.775083,-1.611017,0.233601,0.087273


Sorting by values

In [37]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-06,-0.391827,-0.210352,0.790347,0.398298
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-05,0.379824,0.642109,-0.649285,-0.40232


## Selection

> Note: While standard Python / NumPy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code we recommend the optimized pandas data access methods `.at`, `.iat`, `.loc` and `.iloc`. (The older `.ix` indexer is deprecated and was removed in pandas 1.0.)

See the indexing documentation: [Indexing and Selecting Data](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing) and [MultiIndex / Advanced Indexing](http://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced)

### Getting

In [39]:
df['A']
# Selects the single column 'A' and returns it as a Series.
# Attribute access (df.A) is equivalent here because 'A' is a valid Python
# identifier that does not clash with an existing DataFrame attribute.

2013-01-01   -0.775083
2013-01-02    0.300498
2013-01-03    0.104208
2013-01-04   -0.692955
2013-01-05    0.379824
2013-01-06   -0.391827
Freq: D, Name: A, dtype: float64

In [40]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611


In [41]:
df['20130102' : '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-04,-0.692955,-2.72573,1.579383,0.945874


### Selection by Label

See more in [Selection by Label](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-label)

For getting a cross section using a label

In [44]:
df.loc[dates[0]]

A   -0.775083
B   -1.611017
C    0.233601
D    0.087273
Name: 2013-01-01 00:00:00, dtype: float64

In [46]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.775083,-1.611017
2013-01-02,0.300498,-1.502187
2013-01-03,0.104208,-0.140376
2013-01-04,-0.692955,-2.72573
2013-01-05,0.379824,0.642109
2013-01-06,-0.391827,-0.210352


In [47]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.300498,-1.502187
2013-01-03,0.104208,-0.140376
2013-01-04,-0.692955,-2.72573


In [48]:
df.loc['20130102', ['A', 'B']]

A    0.300498
B   -1.502187
Name: 2013-01-02 00:00:00, dtype: float64

For getting a scalar value

In [49]:
df.loc[dates[0], 'A']

-0.77508258229514282

For getting **fast** access to a scalar (equivalent to the prior method)

In [50]:
df.at[dates[0], 'A']

-0.77508258229514282

### Selection by Position

See more in [Selection by Position](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-integer)

Select via the position of the passed integers

In [53]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.775083,-1.611017,0.233601,0.087273
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-04,-0.692955,-2.72573,1.579383,0.945874
2013-01-05,0.379824,0.642109,-0.649285,-0.40232
2013-01-06,-0.391827,-0.210352,0.790347,0.398298


In [52]:
df.iloc[3]

A   -0.692955
B   -2.725730
C    1.579383
D    0.945874
Name: 2013-01-04 00:00:00, dtype: float64

In [54]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,0.300498,-0.003996
2013-01-03,0.104208,0.750222
2013-01-05,0.379824,-0.649285


In [55]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-1.611017,0.233601
2013-01-02,-1.502187,-0.003996
2013-01-03,-0.140376,0.750222
2013-01-04,-2.72573,1.579383
2013-01-05,0.642109,-0.649285
2013-01-06,-0.210352,0.790347


In [56]:
df.iloc[1,1]

-1.5021866998378248

For getting fast access to a scalar (equivalent to the prior method)

In [57]:
df.iat[1,1]

-1.5021866998378248

### Boolean Indexing

In [58]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745
2013-01-03,0.104208,-0.140376,0.750222,1.308611
2013-01-05,0.379824,0.642109,-0.649285,-0.40232


In [60]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.233601,0.087273
2013-01-02,0.300498,,,
2013-01-03,0.104208,,0.750222,1.308611
2013-01-04,,,1.579383,0.945874
2013-01-05,0.379824,0.642109,,
2013-01-06,,,0.790347,0.398298


In [61]:
df2 = df.copy()

In [62]:
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [63]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.775083,-1.611017,0.233601,0.087273,one
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745,one
2013-01-03,0.104208,-0.140376,0.750222,1.308611,two
2013-01-04,-0.692955,-2.72573,1.579383,0.945874,three
2013-01-05,0.379824,0.642109,-0.649285,-0.40232,four
2013-01-06,-0.391827,-0.210352,0.790347,0.398298,three


In [64]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.104208,-0.140376,0.750222,1.308611,two
2013-01-05,0.379824,0.642109,-0.649285,-0.40232,four


### Setting

Setting a new column automatically aligns the data by the indexes

In [68]:
# Six integers labelled with six consecutive days starting on 2013-01-02.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [69]:
df['F'] = s1

In [70]:
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-1.611017,0.233601,0.087273,
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745,1.0
2013-01-03,0.104208,-0.140376,0.750222,1.308611,2.0
2013-01-04,-0.692955,-2.72573,1.579383,0.945874,3.0
2013-01-05,0.379824,0.642109,-0.649285,-0.40232,4.0
2013-01-06,-0.391827,-0.210352,0.790347,0.398298,5.0


In [72]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.233601,0.087273,
2013-01-02,0.300498,-1.502187,-0.003996,-1.466745,1.0
2013-01-03,0.104208,-0.140376,0.750222,1.308611,2.0
2013-01-04,-0.692955,-2.72573,1.579383,0.945874,3.0
2013-01-05,0.379824,0.642109,-0.649285,-0.40232,4.0
2013-01-06,-0.391827,-0.210352,0.790347,0.398298,5.0


In [76]:
# Overwrite every row of column 'D' with the constant 5, supplied as a
# NumPy array with one element per row of df.
df.loc[:, 'D'] = np.full(len(df), 5)
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.233601,5,
2013-01-02,0.300498,-1.502187,-0.003996,5,1.0
2013-01-03,0.104208,-0.140376,0.750222,5,2.0
2013-01-04,-0.692955,-2.72573,1.579383,5,3.0
2013-01-05,0.379824,0.642109,-0.649285,5,4.0
2013-01-06,-0.391827,-0.210352,0.790347,5,5.0


A *where* operation with setting.

In [78]:
df2 = df.copy()

In [82]:
# Boolean-mask assignment: wherever df2 > 0 is True the cell is replaced by
# the matching cell of -df2, i.e. positive values have their sign flipped.
# Cells where the mask is False (zeros, negatives, NaN) are left unchanged.
df2[df2 > 0] = -df2

In [83]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.233601,-5,
2013-01-02,-0.300498,-1.502187,-0.003996,-5,-1.0
2013-01-03,-0.104208,-0.140376,-0.750222,-5,-2.0
2013-01-04,-0.692955,-2.72573,-1.579383,-5,-3.0
2013-01-05,-0.379824,-0.642109,-0.649285,-5,-4.0
2013-01-06,-0.391827,-0.210352,-0.790347,-5,-5.0


## Missing Data