In [1]:
import pandas as pd
import numpy as np

In [2]:
# Helper function: don't mind me :)
def masker(size, prob):
    """Return a random boolean array whose entries are True with probability ``prob``.

    Parameters
    ----------
    size : int or tuple of ints
        Output shape, forwarded unchanged to ``np.random.choice``.
    prob : float
        Probability of drawing True; False is drawn with probability ``1 - prob``.

    Returns
    -------
    numpy.ndarray
        Boolean array of the requested shape.
    """
    outcomes = [True, False]
    weights = [prob, 1 - prob]
    return np.random.choice(outcomes, size=size, p=weights)

def read_gsheet(gid):
    """Load one tab of the Google Sheet hard-coded below into a DataFrame.

    Parameters
    ----------
    gid : str or int
        Identifier of the tab, i.e. the value that follows ``gid=`` in the
        sheet's CSV-export URL.

    Returns
    -------
    pandas.DataFrame
        The tab's contents as parsed by ``pd.read_csv``.
    """
    url = 'https://docs.google.com/spreadsheets/d/1ATeuU3wYnFwdn3UOQ703893zFomqQBpv9HDoWGatbDo/export?format=csv&gid='
    # str() lets callers pass the gid as an int as well as a string;
    # plain `url + gid` raised TypeError for ints.
    return pd.read_csv(url + str(gid), encoding='utf-8')

## In and out
- pd.read_csv
- df.to_csv

In [52]:
# Each tab of the sheet is addressed by its gid (the value after `gid=` in the export URL).
ppl = read_gsheet('0')
acads = read_gsheet('567561206')
sb = read_gsheet('174628450')

# Preview the first rows of every frame from a single cell.
display(*(table.head() for table in (acads, ppl, sb)))

Unnamed: 0,nickname,LT,love_subj,hate_subj
0,gil,6,DVS,LOB
1,moskie,6,DVS,LOB
2,toby,11,MC,
3,jacq,6,DVS,PDS
4,agnes,15,MDS,LOB


Unnamed: 0,name,nickname,bdate,size_household,civil_status
0,Gilian Uy,gil,02 Sep 1991,6,S
1,Amos Changcoco,moskie,10 Feb 1995,4,S
2,Antonio Rafael Fernando,Toby,15 Jun 1994,4,S
3,Jacqueline Yu Cabrera,Jacq,06 Mar 1983,2,M
4,Agnes Lazo,agnes,22 Jan 1998,6,S


Unnamed: 0,nickname,MBTI,size_household,civil_status,num_sis,num_bro,num_pet,num_jowa,single,date_last_left_house,hours_outside_per_week
0,gil,INTP,6,S,1.0,2.0,3.0,3.0,Y,2021/06/05,2.0
1,moskie,INFJ,4,S,2.0,0.0,1.0,1.0,Y,2021/06/08,3.0
2,toby,ENTP,4,S,1.0,0.0,2.0,1.0,N,2021/06/09,6.0
3,jacq,INTJ,2,M,1.0,0.0,1.0,3.0,N,2021/05/27,1.0
4,agnes,ENFP,6,S,2.0,1.0,2.0,2.0,N,2021/06/06,1.0


# Confusing functions

## df.loc vs df.iloc
- df.loc selects by label: index values and column names (slices include the end label)
- df.iloc selects by integer position, like numpy indexing (slices exclude the end position)

In [53]:
# Move 'nickname' out of the columns and make it the row index.
# NOTE: this rebinds `acads`, so the cell cannot simply be run twice in a row --
# a second run raises KeyError because 'nickname' is no longer a column.
acads = acads.set_index('nickname')
acads

Unnamed: 0_level_0,LT,love_subj,hate_subj
nickname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gil,6,DVS,LOB
moskie,6,DVS,LOB
toby,11,MC,
jacq,6,DVS,PDS
agnes,15,MDS,LOB
kash,9,IDS,PE
bonnie,1,DVS,LOB
mimay,1,MDS,LOB
mason,1,MDS,
joaqs,1,MDS,LOB


In [31]:
# Rows by position with .iloc (first six), then columns by name with .loc.
acads.iloc[:6].loc[:, ['LT', 'hate_subj']]

Unnamed: 0_level_0,LT,hate_subj
nickname,Unnamed: 1_level_1,Unnamed: 2_level_1
gil,6,LOB
moskie,6,LOB
toby,11,
jacq,6,PDS
agnes,15,LOB
kash,9,PE


## df.append vs pd.concat
Append:
- one at a time
- only add rows (`pd.concat` with `axis=0`)
- deprecated in pandas 1.4 and removed in pandas 2.0, so prefer `pd.concat`

Concat:
- more general, has `axis` and `join` type

## pd.pivot vs pd.pivot_table 
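Pivot is a "simplified" version of pivot_table: it only reshapes, so it raises an error when an index/column pair appears more than once, while pivot_table aggregates the duplicates (`aggfunc`, mean by default).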

In [54]:
# Undo the earlier set_index: 'nickname' becomes a regular column again and the
# rows get a fresh 0..N-1 index (the pd.pivot call below refers to 'nickname' by name).
# NOTE: this rebinds `acads`; running it again on an already-reset frame adds
# the old integer index as an extra 'index' column.
acads = acads.reset_index()
acads

Unnamed: 0,nickname,LT,love_subj,hate_subj
0,gil,6,DVS,LOB
1,moskie,6,DVS,LOB
2,toby,11,MC,
3,jacq,6,DVS,PDS
4,agnes,15,MDS,LOB
5,kash,9,IDS,PE
6,bonnie,1,DVS,LOB
7,mimay,1,MDS,LOB
8,mason,1,MDS,
9,joaqs,1,MDS,LOB


In [46]:
# pd.pivot only reshapes -- it has no aggregation step -- so it raises when an
# (index, columns) pair occurs more than once. That failure is the point of this
# cell, so catch it and print it instead of letting it stop a "Run All";
# pd.pivot_table, which aggregates duplicates, is the alternative.
try:
    display(pd.pivot(acads, values='love_subj', index='nickname', columns='hate_subj'))
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Index contains duplicate entries, cannot reshape

## pd.pivot_table vs pd.crosstab
- Pivot_table accepts a df. Your parameters are column **names**.
- Crosstab doesn't need a df. It accepts **arrays/series**. 


## pd.pivot_table vs pd.melt
They are opposites!
- pivot table goes from long to wide
- melt goes from wide to long

## pd.merge vs df.join

`join` is just `merge` with `right_index=True` (it matches on the other frame's index and defaults to a left join)

## df.groupby vs pd.Grouper
- Grouper only describes how to group; with `freq` it bins temporal data
- groupby does the actual grouping-by!

df.groupby(pd.Grouper(key=..., freq=...))

## normal groupby vs groupby()[col].transform()
- a normal groupby aggregation returns one row per group (N groups → N rows)
- transform returns as many rows as your original df, aligned to its index

In [84]:
# Most frequently loved subject within each LT group: take the top entry of
# the group's value_counts.
(
    acads
    .groupby('LT')['love_subj']
    .apply(lambda subjects: subjects.value_counts().idxmax())
)

LT
1     MDS
2     MDS
3     MDS
4     PDS
5     MDS
6     DVS
7     MDS
8     MDS
9     MDS
10    MDS
11    PDS
13    MDS
14    MDS
15    MDS
Name: love_subj, dtype: object

## datetime vs period
- datetime: one point in time
- period: one chunk of time

`.dt` accessor: has properties like `is_month_start`

pd.date_range('January 2020', 'March 2021', freq='M')
pd.period_range('January 2020', 'March 2021', freq='M')

In [86]:
# Month-END timestamps between the two dates. 'March 2021' parses as 2021-03-01,
# which falls before March's month end, so the last stamp is 2021-02-28.
# The offset object replaces the 'M' string alias, which pandas 2.2 deprecated
# for date_range (renamed 'ME'); MonthEnd() works on old and new versions alike.
pd.date_range('January 2020', 'March 2021', freq=pd.offsets.MonthEnd())  # returns a DatetimeIndex


DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31',
               '2020-09-30', '2020-10-31', '2020-11-30', '2020-12-31',
               '2021-01-31', '2021-02-28'],
              dtype='datetime64[ns]', freq='M')

In [87]:
# Same endpoints expressed as monthly periods: every element is a whole month,
# and the month containing the end date (2021-03) is included.
pd.period_range(start='January 2020', end='March 2021', freq='M')  # returns a PeriodIndex


PeriodIndex(['2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
             '2020-07', '2020-08', '2020-09', '2020-10', '2020-11', '2020-12',
             '2021-01', '2021-02', '2021-03'],
            dtype='period[M]', freq='M')

In [94]:
# Hourly stamps from 2021-01-01 00:00 to 2021-01-05 00:00, both ends included.
# pd.offsets.Hour() stands in for the 'H' alias, which pandas 2.2 deprecated in favour of 'h'.
dates = pd.date_range('January 1 2021', 'January 5 2021', freq=pd.offsets.Hour())
# Boolean mask on the index's .hour attribute: keep only stamps from 09:00 onwards.
dates[dates.hour > 8]


DatetimeIndex(['2021-01-01 09:00:00', '2021-01-01 10:00:00',
               '2021-01-01 11:00:00', '2021-01-01 12:00:00',
               '2021-01-01 13:00:00', '2021-01-01 14:00:00',
               '2021-01-01 15:00:00', '2021-01-01 16:00:00',
               '2021-01-01 17:00:00', '2021-01-01 18:00:00',
               '2021-01-01 19:00:00', '2021-01-01 20:00:00',
               '2021-01-01 21:00:00', '2021-01-01 22:00:00',
               '2021-01-01 23:00:00', '2021-01-02 09:00:00',
               '2021-01-02 10:00:00', '2021-01-02 11:00:00',
               '2021-01-02 12:00:00', '2021-01-02 13:00:00',
               '2021-01-02 14:00:00', '2021-01-02 15:00:00',
               '2021-01-02 16:00:00', '2021-01-02 17:00:00',
               '2021-01-02 18:00:00', '2021-01-02 19:00:00',
               '2021-01-02 20:00:00', '2021-01-02 21:00:00',
               '2021-01-02 22:00:00', '2021-01-02 23:00:00',
               '2021-01-03 09:00:00', '2021-01-03 10:00:00',
               '2021-01-

In [102]:
# Number the hourly stamps 0..N-1 so each value shows its position in `dates`.
dates_series = pd.Series(data=range(len(dates)), index=dates)
# Label-based slice on a DatetimeIndex: both endpoints are included.
dates_series.loc['2021-01-04 20:00:00':'2021-01-05 00:00:00']

2021-01-04 20:00:00    92
2021-01-04 21:00:00    93
2021-01-04 22:00:00    94
2021-01-04 23:00:00    95
2021-01-05 00:00:00    96
Freq: H, dtype: int64

## asfreq vs resample
- resample bins data - you can use it like groupby
- I usually use asfreq to add nans to missing dates

df.asfreq(freq='S')
df.resample('S').sum()

In [108]:
# Build a birthday-indexed copy of `ppl`: parse the 'bdate' strings into
# datetimes, then use them as the index. `ppl` itself is left untouched.
peeps = (
    ppl
    .assign(bdate=lambda frame: pd.to_datetime(frame['bdate']))
    .set_index('bdate')
)
peeps.head()

Unnamed: 0_level_0,name,nickname,size_household,civil_status
bdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1991-09-02,Gilian Uy,gil,6,S
1995-02-10,Amos Changcoco,moskie,4,S
1994-06-15,Antonio Rafael Fernando,Toby,4,S
1983-03-06,Jacqueline Yu Cabrera,Jacq,2,M
1998-01-22,Agnes Lazo,agnes,6,S


<pandas.core.resample.DatetimeIndexResampler object at 0x00000211E9CC4608>

# Other common functions
## Getting subsets / masking 
- df[mask1 & mask2]   
- `&` (and), `|` (or), `^` (xor), `~` (not)
- s.isin  
- duplicated
- isnull
- query

## Dropping bad rows/columns
- dropna
- drop_duplicates

## Sorting
- sort_values
- sort_index

## Row-wise operations
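- map (element-wise on a Series)
- apply (row- or column-wise on a DataFrame, element-wise on a Series)
- applymap (element-wise on a DataFrame; renamed `DataFrame.map` in pandas 2.1)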

## Miscellaneous
- value_counts
- unique
- nunique
- head
- tail

In [118]:
# Peek at the first five rows of the `sb` frame.
sb.head(n=5)

Unnamed: 0,nickname,MBTI,size_household,civil_status,num_sis,num_bro,num_pet,num_jowa,single,date_last_left_house,hours_outside_per_week
0,gil,INTP,6,S,1.0,2.0,3.0,3.0,Y,2021/06/05,2.0
1,moskie,INFJ,4,S,2.0,0.0,1.0,1.0,Y,2021/06/08,3.0
2,toby,ENTP,4,S,1.0,0.0,2.0,1.0,N,2021/06/09,6.0
3,jacq,INTJ,2,M,1.0,0.0,1.0,3.0,N,2021/05/27,1.0
4,agnes,ENFP,6,S,2.0,1.0,2.0,2.0,N,2021/06/06,1.0


In [119]:
# Peek at the first five rows of the `ppl` frame.
ppl.head(n=5)

Unnamed: 0,name,nickname,bdate,size_household,civil_status
0,Gilian Uy,gil,02 Sep 1991,6,S
1,Amos Changcoco,moskie,10 Feb 1995,4,S
2,Antonio Rafael Fernando,Toby,15 Jun 1994,4,S
3,Jacqueline Yu Cabrera,Jacq,06 Mar 1983,2,M
4,Agnes Lazo,agnes,22 Jan 1998,6,S


Unnamed: 0,nickname,MBTI,single,bdate
0,gil,INTP,Y,1991-09-02
1,moskie,INFJ,Y,1995-02-10
2,toby,ENTP,N,1994-06-15
3,jacq,INTJ,N,1983-03-06
4,agnes,ENFP,N,1998-01-22
5,kash,INTJ-T,N,1982-04-19
6,joaqs,INFP,N,1998-12-29
7,mimay,ENFP-T,Y,1995-03-10
8,bonnie,INTJ,N,1986-11-29
9,mason,ENFP-T,Y,1991-02-06


# Fun question times!

**What's the most common love-hate subject combo?**

**What's the average age of the cohort?**

**Who's the most eligible person for X?**

**Who are the risky people in the batch? (with respect to the number of people in the household and the number of hours spent outside)**

## Summary info
- df.info
- df.describe