# Non-Mini Intro to Pandas 2

Official Pandas Doc: [pandas.pydata.org](https://pandas.pydata.org/)

In [None]:
import numpy as np
import pandas as pd

### Pandas -- Continued...
1. Data Transformation
2. Data Grouping & Aggregation
3. Time Series

## 1. Data Transformation

### Element-wise transformation using map()

In [None]:
# map() is a Series method, so create a small Series of random numbers first
sr = pd.Series(np.random.randn(2))
sr

In [None]:
def function1(paras):
    """Return ``paras`` rounded to two decimal places."""
    decimals = 2
    return round(paras, decimals)

In [None]:
# apply function1 to every element of the Series
sr.map(function1)

In [None]:
# map() also accepts a lambda, which is applied to each data point;
# this one rounds every value to 2 decimals
func = lambda x: round(x, 2)
sr.map(func)

### Element-wise Transformation using applymap()

In [None]:
# applymap() calls a function that accepts and returns a scalar on every
# element of a DataFrame, so build a 3x3 DataFrame of random floats to try it on
df = pd.DataFrame(
    data=np.random.randn(3, 3),
    index=['row_1', 'row_2', 'row_3'],
    columns=['col_1', 'col_2', 'col_3'],
)
df

In [None]:
# round every element of the DataFrame to 2 decimals with function1
# NOTE(review): newer pandas releases (2.1+) deprecate applymap() in favour of
# DataFrame.map() -- confirm against the installed version
df.applymap(function1)

In [None]:
# applymap() is a DataFrame method (a Series uses map() instead)
# round each data point in the DataFrame to 2 decimals
func = lambda x: round(x, 2)
df.applymap(func)

### Array-wise Transformation using apply()

In [None]:
# axis=0 ==> the function is called once per column (working from top to bottom),
# giving the max - min range of each column
df.apply(lambda x: x.max() - x.min(), axis=0)

In [None]:
# axis=1 ==> the function is called once per row (working from left to right),
# giving the max - min range of each row
df.apply(lambda x: x.max() - x.min(), axis=1)

## 2. Data Grouping & Aggregation

<img src="img/split-apply-combine.svg">

In [None]:
# rebuild the dataset shown in the figure above: the keys A, B, C repeated
# twice, alongside the values 1 to 6
data = {
    'key': list('ABC') * 2,
    'data': list(range(1, 7)),
}
df = pd.DataFrame(data, columns=['key', 'data'])
df

In [None]:
# groupby() returns a pandas GroupBy object.
# It has not actually computed anything yet; it only sets up the intermediate
# per-'key' groups. as_index=False keeps 'key' as a regular column in the results.
grouped = df.groupby(by='key', as_index=False)
grouped

In [None]:
# we can access one group's rows by passing its 'key' value to get_group()
grouped.get_group('A')

In [None]:
# a GroupBy object can be iterated over to inspect what it holds:
# each step yields the group's key value together with that group's rows
for name, group in grouped:
    print(f"Sub-group: {name}")
    print(group)
    print("\n")

In [None]:
# Now let's apply some functions and/or methods
# Note: sum() is applied to each group separately and
# the results are then combined into a single DataFrame object
grouped.sum()
# alternatively, the more flexible apply() method can be used (next cell)

In [None]:
# apply() calls the lambda once per group, passing that group's 'data' values
grouped['data'].apply(lambda x: sum(x))

In [None]:
# More importantly, we can use the aggregate() / agg() method
# to apply several functions at once


grouped.agg(['sum', 'mean', 'std'] )

## 3. Time Series

### Datetime Object

In [None]:
# `datetime` and `timedelta` come from the standard-library `datetime` module
from datetime import datetime, timedelta
# `parse` comes from the third-party `dateutil` package
from dateutil.parser import parse

# datetime stores both the date and the time, down to the microsecond
datetime.now()

In [None]:
# subtracting two datetime objects gives the time difference between them
delta = datetime.now() - datetime(1949, 10, 1)
delta

In [None]:
# shift a datetime forward by adding timedelta objects
start = datetime.now()
print(start)
print(start + timedelta(days=12))  # the first positional argument of timedelta is days
print(start + timedelta(hours=1, days=12))

### Converting between string and datetime

In [None]:
# convert a datetime object to a specific "human friendly" string format
datetime.now().strftime('%d/%m/%Y') # <= str 'f'ormat time

In [None]:
# convert a string to a datetime object; the format must describe the string's layout
datetime.strptime('2018-05-09', '%Y-%m-%d') # <= str 'p'arse time

In [None]:
# dateutil's parse() works out the format on its own, so none is passed
parse('May 09, 2018, 23:59')

In [None]:
# pandas's to_datetime function converts a whole list at once;
# missing values such as None become NaT (not-a-time)
pd.to_datetime(['2018-05-09 23:59', None], format="%Y-%m-%d %H:%M")

### Use datetime object as Index

In [None]:
# manually create three consecutive daily timestamps
dates = [datetime(2018, 5, day) for day in (10, 11, 12)]
# the list of datetimes is passed as the Series index
ts1 = pd.Series(data=np.random.randn(len(dates)), index=dates)
ts1

In [None]:
# to get a fixed-frequency date index object
# Pandas's date_range, by default, generates daily timestamps
pd.date_range('2018-05-10', '2018, May, 12') # accepts different formats...

In [None]:
# specify a start (or end) date together with the number of periods
pd.date_range(start='2018-05-10', periods=3)

In [None]:
# specify frequency: pd.offsets.MonthEnd() is the month-end frequency. It is
# passed as an offset object because the string alias changed between pandas
# versions ('M' originally, 'ME' from pandas 2.2); other aliases such as 'D'
# (daily) can still be given as plain strings.
ts = pd.date_range('2018-05-01', '2018-08-30', freq=pd.offsets.MonthEnd())
# size the random data from the index so the row count always matches it
df = pd.DataFrame(np.random.rand(len(ts), 3), index=ts)
df


### Shift method

In [None]:
# build a Series of random values indexed by three month-start dates ('MS')
index = pd.date_range(start='1/1/2000', periods=3, freq='MS')
ts2 = pd.Series(data=np.random.randn(len(index)), index=index)

What if we want to create `lead` or `lag` data?

In [None]:
# shift() moves the data points forward (positive) or backward (negative)
# and leaves the datetime index unmodified; vacated positions become NaN
ts2.shift(-1) 

In [None]:
# when a freq argument is passed, shift() moves the timestamps
# instead of the data (here: 9 month-starts forward)
ts2.shift(9, freq='MS')

In [None]:
# Another handy tool for shifting datetimes,
# especially helpful when merging databases
from pandas.tseries.offsets import MonthEnd

# adding MonthEnd(-1) rolls the datetime back to the previous month end
datetime.now() + MonthEnd(-1)