# PANDAS

pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# build a one-dimensional Series from a plain list; np.nan stands in for a missing value
s = pd.Series(data=[1, 3, 5, np.nan, 6, 7])

In [3]:
s # display the Series: the integer inputs are shown as floats because the NaN entry makes the dtype float64

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    7.0
dtype: float64

In [4]:
s.shape # dimensions as a tuple; a Series has a single axis, so this is (number of elements,)

(6,)

In [5]:
# build a DatetimeIndex of 6 consecutive days, beginning on the given start date
dates = pd.date_range(start='20130101', periods=6)
print(dates)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [6]:
# create a dataframe from random data using the dates variable created earlier as the index
# seed NumPy's global RNG first so that Restart & Run All reproduces the same values every time
# NOTE: outputs saved from an earlier, unseeded run will not match until the notebook is re-run
np.random.seed(0)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

                   A         B         C         D
2013-01-01  0.816414  0.241281  1.259946  0.563317
2013-01-02 -1.004928  1.039068 -1.062491  0.495044
2013-01-03 -0.011139 -1.235460  0.463615 -0.019130
2013-01-04 -1.209340  0.320708 -1.750841  0.288654
2013-01-05  0.510095 -0.840999 -2.061217  0.881860
2013-01-06  0.028280 -2.540471 -0.537396 -0.118283


In [7]:
df.head(100) # get up to the first 100 records (the frame only has 6, so all rows are returned); default is 5

Unnamed: 0,A,B,C,D
2013-01-01,0.816414,0.241281,1.259946,0.563317
2013-01-02,-1.004928,1.039068,-1.062491,0.495044
2013-01-03,-0.011139,-1.23546,0.463615,-0.01913
2013-01-04,-1.20934,0.320708,-1.750841,0.288654
2013-01-05,0.510095,-0.840999,-2.061217,0.88186
2013-01-06,0.02828,-2.540471,-0.537396,-0.118283


In [8]:
df.tail(2) # get the last 2 records; with no argument the default is 5

Unnamed: 0,A,B,C,D
2013-01-05,0.510095,-0.840999,-2.061217,0.88186
2013-01-06,0.02828,-2.540471,-0.537396,-0.118283


In [9]:
df.index # view the index (row labels) - here the DatetimeIndex built from dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns # view the column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
df['B'] # select a single column by its label; the result is a Series named 'B'

2013-01-01    0.241281
2013-01-02    1.039068
2013-01-03   -1.235460
2013-01-04    0.320708
2013-01-05   -0.840999
2013-01-06   -2.540471
Freq: D, Name: B, dtype: float64

In [12]:
df.describe() # summary statistics per column: count, mean, std, min, quartiles and max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.145103,-0.502646,-0.614731,0.348577
std,0.809161,1.297109,1.285829,0.376487
min,-1.20934,-2.540471,-2.061217,-0.118283
25%,-0.75648,-1.136845,-1.578754,0.057816
50%,0.008571,-0.299859,-0.799944,0.391849
75%,0.389641,0.300851,0.213362,0.546248
max,0.816414,1.039068,1.259946,0.88186


In [13]:
df.T # transpose: the column labels become the row labels and the dates become the columns

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.816414,-1.004928,-0.011139,-1.20934,0.510095,0.02828
B,0.241281,1.039068,-1.23546,0.320708,-0.840999,-2.540471
C,1.259946,-1.062491,0.463615,-1.750841,-2.061217,-0.537396
D,0.563317,0.495044,-0.01913,0.288654,0.88186,-0.118283


In [14]:
df.sort_index(axis=1, ascending=False) # sort the columns (axis=1) by label in descending order: D, C, B, A

Unnamed: 0,D,C,B,A
2013-01-01,0.563317,1.259946,0.241281,0.816414
2013-01-02,0.495044,-1.062491,1.039068,-1.004928
2013-01-03,-0.01913,0.463615,-1.23546,-0.011139
2013-01-04,0.288654,-1.750841,0.320708,-1.20934
2013-01-05,0.88186,-2.061217,-0.840999,0.510095
2013-01-06,-0.118283,-0.537396,-2.540471,0.02828


# Ingest Excel files

In [15]:
# load the example workbook into a DataFrame; the path is relative to the current working directory
xl = pd.read_excel("./docs/exampleexcel.xlsx")

In [16]:
xl # display the whole frame - acceptable here because it only has 9 rows

Unnamed: 0,Example column 1,Example column 2,Example column three,Example column 4
0,Text value 1,4,4.6,2019-09-03
1,Text value 2,5,5.2,2019-03-05
2,Text value 3,2,7.8,2018-05-04
3,Text value 4,8,5.9,2017-05-07
4,Text value 5,9,3.5,2016-12-09
5,Text value 6,4,2.5,2019-02-01
6,Text value 7,8,7.8,2019-04-03
7,Text value 8,0,9.6,2019-05-02
8,Text value 9,3,10.5,2018-02-01


In [17]:
xl.head() # first 5 rows (the default when no count is given)

Unnamed: 0,Example column 1,Example column 2,Example column three,Example column 4
0,Text value 1,4,4.6,2019-09-03
1,Text value 2,5,5.2,2019-03-05
2,Text value 3,2,7.8,2018-05-04
3,Text value 4,8,5.9,2017-05-07
4,Text value 5,9,3.5,2016-12-09


In [18]:
xl['Example column three'].describe() # summary statistics for a single numeric column

count     9.000000
mean      6.377778
std       2.727534
min       2.500000
25%       4.600000
50%       5.900000
75%       7.800000
max      10.500000
Name: Example column three, dtype: float64

In [19]:
# double every value in 'Example column 2', overwriting the column on the same frame
# NOTE: not idempotent - re-running this cell doubles the values again
xl['Example column 2'] = xl['Example column 2'] * 2

In [20]:
xl.head() # first 5 rows again, to confirm that 'Example column 2' now holds the doubled values

Unnamed: 0,Example column 1,Example column 2,Example column three,Example column 4
0,Text value 1,8,4.6,2019-09-03
1,Text value 2,10,5.2,2019-03-05
2,Text value 3,4,7.8,2018-05-04
3,Text value 4,16,5.9,2017-05-07
4,Text value 5,18,3.5,2016-12-09
