# Chapter 12: Advanced pandas

In [33]:
import pandas as pd
import numpy as np
import sys

## 12.1 Categorical Data

In [6]:
# dimension table stores unique values and keys refercning primary observations 
df = pd.DataFrame({'dogs' : ['big', 'bark', 'bite'] * 3,
                  'data' : np.random.randint(1,11, size=9)})
df

Unnamed: 0,dogs,data
0,big,8
1,bark,1
2,bite,6
3,big,1
4,bark,5
5,bite,7
6,big,7
7,bark,1
8,bite,10


In [23]:
df['dogs'] = df.dogs.astype('category')
print(df.dogs)
print(df.dogs.values)
print()
cats = df.dogs.values.categories # The unique values
print(cats)
values = df.dogs.values.codes # The codes
print(values)
# putting them back together
print(cats.take(values)) 
# or alternativly
print(pd.Categorical.from_codes(values, cats, ordered=True)) # Maintain order

0     big
1    bark
2    bite
3     big
4    bark
5    bite
6     big
7    bark
8    bite
Name: dogs, dtype: category
Categories (3, object): ['bark', 'big', 'bite']
['big', 'bark', 'bite', 'big', 'bark', 'bite', 'big', 'bark', 'bite']
Categories (3, object): ['bark', 'big', 'bite']

Index(['bark', 'big', 'bite'], dtype='object')
[1 0 2 1 0 2 1 0 2]
Index(['big', 'bark', 'bite', 'big', 'bark', 'bite', 'big', 'bark', 'bite'], dtype='object')
['big', 'bark', 'bite', 'big', 'bark', 'bite', 'big', 'bark', 'bite']
Categories (3, object): ['bark' < 'big' < 'bite']


In [35]:
series = df.drop('data', axis=1)
cat = pd.Categorical.from_codes(values, cats)
print(f'Series size {sys.getsizeof(series)} v cat size {sys.getsizeof(cat)}') # Returns obj size in bytes

Series size 443 v cat size 315


In [53]:
# Slick way to get quantile stats
data = np.random.randn(1000)
qs = pd.Series(pd.qcut(data,
                       4,
                       labels=['Q1', 'Q2', 'Q3', 'Q4']),
               name='quartile')
res = (pd.Series(data)
       .groupby(qs)
       .agg(['count', 'min', 'max'])
       .reset_index()
      )
res

Unnamed: 0,quartile,count,min,max
0,Q1,250,-3.259249,-0.761498
1,Q2,250,-0.759203,-0.084734
2,Q3,250,-0.080998,0.621179
3,Q4,250,0.641343,2.816702


In [52]:
# Converting columns to categorical results in less data, faster compute times bc interacting with
# integer codes instead of strings                                                                                

['Q4', 'Q1', 'Q3', 'Q2', 'Q4', ..., 'Q1', 'Q1', 'Q1', 'Q3', 'Q2']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [67]:
# use category.cat to access category methods
this = pd.Series('this old hound is red'.split(' ') * 2,).astype('category')

this.cat.rename_categories(('red road in the morn'.split(' ')))

0    morn
1      in
2     red
3    road
4     the
5    morn
6      in
7     red
8    road
9     the
dtype: category
Categories (5, object): ['red', 'road', 'in', 'the', 'morn']

In [72]:
# .transform() on groupby like .apply(), can produce scalar values to be broadcast across groups
df = pd.DataFrame({'key' : list('abc')*4,
                  'value' : np.arange(12.)})
df.groupby('key').transform('mean')


Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [81]:
# Nvm TimeGrouper is depriciated
# # Use timegrouper() if multiple values per timestamp
# df = pd.DataFrame({'key' : list('abc') * 3,
#                   'value' : np.arange(9),
#                   'times' : pd.date_range('10-02-1992',
#                                           periods=3,
#                                           freq='min'
#                                          ).repeat(3)})
# print(df)

# print(df.groupby['key', pd.Grouper('5min')])


## 12.3 Techniques for Method Chaining

In [95]:
# Use df.assign to copy df and add new columns, can chain methods
df2 = (df.assign(new = df.key + "_new")
       .groupby('key')
       .count()
      )

In [98]:
# Use .pipe(fun, arg1) to chain together functinos on df; function must return df or series
def f(df):
    return df

def g(df):
    return df

def h(df):
    return df
    

result = (df.pipe(f)
          .pipe(g)
          .pipe(h)
         )
result

Unnamed: 0,key,value,times
0,a,0,1992-10-02 00:00:00
1,b,1,1992-10-02 00:00:00
2,c,2,1992-10-02 00:00:00
3,a,3,1992-10-02 00:01:00
4,b,4,1992-10-02 00:01:00
5,c,5,1992-10-02 00:01:00
6,a,6,1992-10-02 00:02:00
7,b,7,1992-10-02 00:02:00
8,c,8,1992-10-02 00:02:00
