# An Introduction to the Pandas Library

## Python Coffee February 18th, 2016


In [None]:
import datetime as dt

import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
%matplotlib notebook

In [None]:
# Cap how much of a frame is printed: at most 20 rows and 100 columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)

## Reading Data

In [None]:
# Load the PWV table from CSV; the string 'NaN' in the file is read as a
# missing value.
pwv = pd.read_csv('pwv_APEX_3h.csv', na_values='NaN')

In [None]:
# Load the measurements and sources tables with read_csv's default settings.
measurements = pd.read_csv('sc_measurements.csv')
sources = pd.read_csv('sc_sources.csv')

In [None]:
# Load the targets table with read_csv's default settings.
targets = pd.read_csv('sg_targets.csv')

In [None]:
# Load the execution blocks with the first CSV column as the index, then give
# the 'SB_UID.1' column the plain name 'SB_UID'.
execblocks = pd.read_csv('aqua_exeblock.csv', index_col=0).rename(
    columns={'SB_UID.1': 'SB_UID'})

## Inspecting Data

In [None]:
# Show the frame's underlying data as a plain array, without index or column
# labels.
pwv.values

In [None]:
# Display the frame itself (truncated according to the display options).
pwv

In [None]:
# Get a quick summary of the dataframe: index, columns, dtypes and
# non-null counts.
pwv.info()

In [None]:
# Get the first rows (head() returns rows, five by default)
pwv.head()

In [None]:
# Get the last rows (five by default)
pwv.tail()

In [None]:
# Extract/Slice data (rows): a plain slice on the frame selects rows, here
# the first three.
pwv[:3]

In [None]:
# Slice rows and columns in one step.  The original used ``.ix``, which has
# been deprecated and removed from pandas.  On the default integer index that
# read_csv produced, ``.ix`` selected rows by label (so the end label 3 is
# included) and, because the column labels are strings, columns by position.
# The same selection is spelled out explicitly here.
pwv.loc[:3, pwv.columns[1:3]]

In [None]:
# Select two columns by name (a list of labels), then take the first three rows.
pwv[['Month', 'Day']][:3]

In [None]:
# Select a single column by name; the result is a Series.
pwv['PWV']

In [None]:
# IPython help syntax: a leading '?' shows the documentation of read_csv.
?pd.read_csv

In [None]:
# Basic statistics of the PWV column.  print is called as a function so the
# cell runs on Python 3 as well as Python 2 (the statement form is a
# SyntaxError on Python 3).
print(pwv.PWV.min())
print(pwv.PWV.max())
print(pwv.PWV.mean())
print(pwv.PWV.median())
print(pwv.PWV.std())

In [None]:
# Summary statistics of the PWV column in a single call.
pwv.PWV.describe()

In [None]:
# The same summary, computed for the columns of the whole frame.
pwv.describe()

In [None]:
# Select matplotlib figure 1 and draw a 40-bin histogram of PWV into it.
pl.figure(1)
pwv.PWV.hist(bins=40)

In [None]:
# Select figure 2 and plot the PWV Series against its index.
pl.figure(2)
pwv.PWV.plot()

In [None]:
# Select figure 3 and plot from the frame instead, naming the column to use
# for the y axis.
pl.figure(3)
pwv.plot(y='PWV')

## Selecting/Querying Data

In [None]:
# Quick summary of the execution-block frame.
execblocks.info()

In [None]:
# First rows of the execution-block frame.
execblocks.head()

In [None]:
# The two status columns for the first seven rows.
execblocks[['QA0STATUS', 'SE_STATUS']][:7]

In [None]:
# Distinct values that appear in QA0STATUS.
execblocks['QA0STATUS'].unique()

In [None]:
# Number of rows for each distinct QA0STATUS value.
execblocks['QA0STATUS'].value_counts()

In [None]:
# Select figure 7 and draw the QA0STATUS counts as a bar chart.
pl.figure(7)
execblocks['QA0STATUS'].value_counts().plot(kind='bar')

In [None]:
# Number of rows for each SB_UID.
execblocks['SB_UID'].value_counts()

In [None]:
# Open a new figure and draw the first ten entries of the SB_UID counts as a
# bar chart.
pl.figure()
execblocks['SB_UID'].value_counts()[:10].plot(kind='bar')

In [None]:
# Comparing a column with a value gives a boolean Series, one entry per row.
execblocks['QA0STATUS'] == "Pass"

In [None]:
# Indexing the frame with that boolean Series keeps only the matching rows.
execblocks[execblocks['QA0STATUS'] == "Pass"]

In [None]:
# The same selection written as a query string.
execblocks.query('QA0STATUS == "Pass"')

In [None]:
# Conditions on several columns can be combined inside one query string.
# NOTE(review): the two literals differ in case ("Pass" vs "FAIL") -- confirm
# that this matches how each column spells its values in the data.
execblocks.query('QA0STATUS == "Pass" and SE_STATUS == "FAIL"')

In [None]:
# Take the first ten entries of the SB_UID counts and keep their labels (the
# SB_UID values themselves) as an array.
top_counts = execblocks['SB_UID'].value_counts().head(10)
sblist = top_counts.index.values
sblist

In [None]:
# Keep the rows whose SB_UID is in sblist; '@' lets the query string refer to
# a Python variable.
mostobserved = execblocks.query('SB_UID in @sblist')
mostobserved.head()

In [None]:
# For each (SB_UID, QA0STATUS) group, count the non-zero EXECBLOCKUID entries
# and average 'delta'.  numpy is reached through its own import because the
# ``pd.np`` alias has been deprecated and removed from pandas; the mean is
# requested by name, which selects the groupby's built-in implementation.
mostobserved.groupby(['SB_UID', 'QA0STATUS']).aggregate(
    {'EXECBLOCKUID': np.count_nonzero, 'delta': 'mean'})

In [None]:
# Store the per-(SB_UID, QA0STATUS) aggregation, then unstack() moves the
# inner index level (QA0STATUS) into the columns.  numpy is used directly
# because the ``pd.np`` alias has been deprecated and removed from pandas;
# the mean is requested by name, which selects the groupby's built-in
# implementation.
table1 = mostobserved.groupby(['SB_UID', 'QA0STATUS']).aggregate(
    {'EXECBLOCKUID': np.count_nonzero, 'delta': 'mean'})
table1.unstack()

In [None]:
# Write the aggregated table to an Excel file in the working directory.
# NOTE(review): the legacy '.xls' format needs an xls writer engine that
# recent pandas releases no longer provide -- confirm against the installed
# pandas version, and consider '.xlsx' if this cell fails.
table1.to_excel('table1.xls')

In [None]:
# Row-wise apply: drop every row that has a missing value, then compute
# delta * 45 for each remaining row (axis=1 passes one row at a time).
complete_rows = execblocks.dropna()
complete_rows.apply(lambda row: row['delta'] * 45., axis=1)

In [None]:
# Keep the first ten characters of STARTTIME as the 'day' key (it is parsed
# with the '%Y-%m-%d' format further down).  The vectorised .str accessor
# replaces a row-by-row apply: it is faster and leaves a missing STARTTIME as
# NaN instead of raising on it.
execblocks['day'] = execblocks['STARTTIME'].str[:10]

In [None]:
# Display the frame, now including the new 'day' column.
execblocks

In [None]:
# Count the non-zero EXECBLOCKUID entries for each (day, QA0STATUS) pair.
# numpy is used directly because the ``pd.np`` alias has been deprecated and
# removed from pandas.
effi = execblocks.groupby(['day', 'QA0STATUS']).aggregate(
    {'EXECBLOCKUID': np.count_nonzero})
effi

In [None]:
# Move QA0STATUS from the index into the columns, keep the EXECBLOCKUID
# counts, and turn 'day' back into an ordinary column.
counts_by_status = effi.unstack()
effipl = counts_by_status['EXECBLOCKUID'].reset_index()

In [None]:
# Parse the 'day' strings into timestamps with a single vectorised call
# instead of running strptime once per row; the format is unchanged.
effipl['Date'] = pd.to_datetime(effipl['day'], format='%Y-%m-%d')

In [None]:
# Display the per-day table, now including the parsed 'Date' column.
effipl

In [None]:
# Plot the 'Pass' and 'Fail' columns against Date on shared axes: the first
# call creates the axes, the second draws into the same ones.
ax = None
for status in ('Pass', 'Fail'):
    ax = effipl.plot(x='Date', y=status, ax=ax)
ax

In [None]:
def _sample_time(row):
    """Build a datetime from one row's Year, Month, Day and Hour values."""
    parts = [int(row[field]) for field in ('Year', 'Month', 'Day', 'Hour')]
    return dt.datetime(*parts)


# Add a 'Date' column by converting every row (axis=1) to a datetime.
pwv['Date'] = pwv.apply(_sample_time, axis=1)

In [None]:
# Use 'Date' as the index; drop=False also keeps it as a regular column.
pwv = pwv.set_index('Date', drop=False)