# Pandas Introduction

Please note: this code is written for Python 3. If you are using Python 2, please modify the code accordingly.

In [1]:
import pandas as pd
import numpy as np

# Show which pandas release this notebook is running against.
print('Pandas version:', pd.__version__)

Pandas version: 0.20.3


In [2]:
# A Series is a labelled one-dimensional array (comparable to a 1-D NumPy
# array). With no explicit index, pandas labels the entries 0..n-1, and the
# np.nan entry makes the whole Series float64.
s = pd.Series(
    data=[1, 3, 6, np.nan, 4, 1],
)
print(s)

0    1.0
1    3.0
2    6.0
3    NaN
4    4.0
5    1.0
dtype: float64


In [3]:
# Six consecutive calendar days starting 2016-01-01 become the row labels.
dates = pd.date_range('20160101', periods=6)

# 6x4 frame of standard-normal samples with columns labelled A-D.
# The values are random, so they differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

print(df)

                   A         B         C         D
2016-01-01  0.285769 -0.359904 -1.246052  0.274460
2016-01-02 -1.574403  0.467659  0.625964 -0.508490
2016-01-03  1.439421  2.873205 -0.732800 -0.497975
2016-01-04 -0.822470 -0.224765 -1.436297  0.234497
2016-01-05 -0.663748  1.314054 -0.435610  0.791698
2016-01-06  0.936635 -0.302569  0.406865  0.476138


In [4]:
# Selecting a single column by label yields a Series that keeps the frame's date index.
print(df['B'])

2016-01-01   -0.359904
2016-01-02    0.467659
2016-01-03    2.873205
2016-01-04   -0.224765
2016-01-05    1.314054
2016-01-06   -0.302569
Freq: D, Name: B, dtype: float64


In [5]:
# Build a frame from a dict mapping column name -> column content.
# The scalar entries (A, B, F) are repeated for each of the four rows
# supplied by the array-like entries (C, D, E).
df2 = pd.DataFrame({
    'A': 1.,                                                      # float scalar
    'B': pd.Timestamp('20130102'),                                # timestamp scalar
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),     # Series, float32
    'D': np.array([3] * 4, dtype='int32'),                        # NumPy array, int32
    'E': pd.Categorical(["test", "train", "test", "train"]),      # categorical values
    'F': 'foo',                                                   # string scalar
})
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [6]:
# Each column of a DataFrame carries its own dtype; .dtypes lists them per column.
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [7]:
# .index holds the row labels: here the daily DatetimeIndex the frame was built with.
print(df.index)

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')


In [8]:
# .columns holds the column labels as an Index object.
print(df.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')


In [9]:
# .values exposes the data as a plain NumPy array, without row or column labels.
print(df.values)

[[ 0.28576855 -0.35990389 -1.24605178  0.27446033]
 [-1.57440336  0.46765878  0.62596423 -0.50848971]
 [ 1.43942083  2.87320517 -0.73279975 -0.49797459]
 [-0.82246962 -0.2247646  -1.43629717  0.23449696]
 [-0.66374828  1.31405446 -0.43561035  0.79169848]
 [ 0.93663548 -0.30256894  0.40686507  0.47613826]]


In [10]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.066466  0.627947 -0.469655  0.128388
std    1.148925  1.273299  0.845726  0.527532
min   -1.574403 -0.359904 -1.436297 -0.508490
25%   -0.782789 -0.283118 -1.117739 -0.314857
50%   -0.188990  0.121447 -0.584205  0.254479
75%    0.773919  1.102456  0.196246  0.425719
max    1.439421  2.873205  0.625964  0.791698


In [11]:
# .T is the transpose: the dates become column labels and A-D become row labels.
print(df.T)

   2016-01-01  2016-01-02  2016-01-03  2016-01-04  2016-01-05  2016-01-06
A    0.285769   -1.574403    1.439421   -0.822470   -0.663748    0.936635
B   -0.359904    0.467659    2.873205   -0.224765    1.314054   -0.302569
C   -1.246052    0.625964   -0.732800   -1.436297   -0.435610    0.406865
D    0.274460   -0.508490   -0.497975    0.234497    0.791698    0.476138


In [12]:
# axis=1 sorts by the column labels (not the rows); ascending=False reverses
# the order, giving D, C, B, A. A new frame is returned; df itself is unchanged.
print(df.sort_index(axis=1, ascending=False))

                   D         C         B         A
2016-01-01  0.274460 -1.246052 -0.359904  0.285769
2016-01-02 -0.508490  0.625964  0.467659 -1.574403
2016-01-03 -0.497975 -0.732800  2.873205  1.439421
2016-01-04  0.234497 -1.436297 -0.224765 -0.822470
2016-01-05  0.791698 -0.435610  1.314054 -0.663748
2016-01-06  0.476138  0.406865 -0.302569  0.936635


In [13]:
# Reorder the rows by the values in column 'B', smallest first (the default).
print(df.sort_values(by='B'))

                   A         B         C         D
2016-01-01  0.285769 -0.359904 -1.246052  0.274460
2016-01-06  0.936635 -0.302569  0.406865  0.476138
2016-01-04 -0.822470 -0.224765 -1.436297  0.234497
2016-01-02 -1.574403  0.467659  0.625964 -0.508490
2016-01-05 -0.663748  1.314054 -0.435610  0.791698
2016-01-03  1.439421  2.873205 -0.732800 -0.497975
