# Reshaping DataFrames with Dummies

#### Loading Libraries

In [34]:
# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Category Boost
import catboost as cb

#### Loading Data - Dummy Columns

In [35]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/'\
      '2020-jetbrains-python-survey.csv'
jb = pd.read_csv(url)

  jb = pd.read_csv(url)


In [36]:
jb.filter(like='job.role')

Unnamed: 0,job.role.DBA,job.role.Architect,job.role.QA engineer,job.role.Developer / Programmer,job.role.Technical writer,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other
0,,,,,,,,Business analyst,,,,,
1,,,,Developer / Programmer,,,,,,,,,
2,,,,Developer / Programmer,,Technical support,Data analyst,,Team lead,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,,,,,,,,,,,,Systems analyst,
54458,,,,,,,,,,,,,
54459,,,,,,,,,,,CIO / CEO / CTO,,
54460,,,,Developer / Programmer,,,Data analyst,,,,,,


In [37]:
(jb
 .filter(like=r'job.role.*t')
 .where(jb.isna(), 1)
)

0
1
2
3
4
...
54457
54458
54459
54460
54461


In [38]:
(jb
 .filter(like=r'job.role/*t')
 .where(jb.isna(), 1)
 .fillna(0)
)

0
1
2
3
4
...
54457
54458
54459
54460
54461


In [39]:
(jb
 .filter(like=r'job.role.*t')
 .where(jb.isna(), 1)
 .fillna(0)
)

0
1
2
3
4
...
54457
54458
54459
54460
54461


In [40]:
(jb
 .filter(like=r'job.role')
 .where(jb.isna(), 1)
 .fillna(0)
 .idxmax(axis='columns')
)

  .fillna(0)


0              job.role.Business analyst
1        job.role.Developer / Programmer
2        job.role.Developer / Programmer
3                           job.role.DBA
4                           job.role.DBA
                      ...               
54457           job.role.Systems analyst
54458                       job.role.DBA
54459           job.role.CIO / CEO / CTO
54460    job.role.Developer / Programmer
54461                 job.role.Architect
Length: 54462, dtype: object

In [41]:
job = (jb
 .filter(like=r'job.role')
 .where(jb.isna(), 1)
 .fillna(0)
 .idxmax(axis='columns')
 .str.replace('job.role.', '', regex=False)
)
job

  .fillna(0)


0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object

In [42]:
dum = pd.get_dummies(job)
dum

Unnamed: 0,Architect,Business analyst,CIO / CEO / CTO,DBA,Data analyst,Developer / Programmer,Other,Product manager,QA engineer,Systems analyst,Team lead,Technical support,Technical writer
0,False,True,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,False,False,False,False,False,False,False,False,False,True,False,False,False
54458,False,False,False,True,False,False,False,False,False,False,False,False,False
54459,False,False,True,False,False,False,False,False,False,False,False,False,False
54460,False,False,False,False,False,True,False,False,False,False,False,False,False


#### Undoing Dummy Columns

In [43]:
dum.idxmax(axis='columns')

0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object

In [44]:
i, j = np.where(dum)
pd.Series(dum.columns[j], i)

0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object

# Reshaping by Pivoting & Grouping

## Retrieving from Chapter 21

In [45]:
jb = pd.read_csv(url)
jb    

  jb = pd.read_csv(url)


Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,...,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,...,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,...,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,...,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,...,,,,,,,,,21–29,Italy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,Yes,,,,C/C++,,,,Bash / Shell,Objective-C,...,,,,,,,Systems analyst,,21–29,Russian Federation
54458,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
54459,Yes,,,JavaScript,,PHP,,,Bash / Shell,,...,,,,,,CIO / CEO / CTO,,,21–29,Russian Federation
54460,Yes,,,JavaScript,C/C++,PHP,,,Bash / Shell,,...,,Data analyst,,,,,,,30–39,Spain


In [46]:
import collections
counter = collections.defaultdict(list)
for col in sorted(jb.columns):
    period_count = col.count('.')
    if period_count >= 2:
        part_end = 2
    else:
        part_end = 1
    parts = col.split('.')[:part_end]
    counter['.'.join(parts)].append(col)
uniq_cols = []
for cols in counter.values():
    if len(cols) == 1:
        uniq_cols.extend(cols)

In [47]:
(jb
 [uniq_cols]
 .rename(columns=lambda c: c.replace('.', '_'))
 .age
 .str.slice(0,2)
 .astype(float)
 .astype('Int64')
)

0          30
1          21
2          30
3        <NA>
4          21
         ... 
54457      21
54458    <NA>
54459      21
54460      30
54461      21
Name: age, Length: 54462, dtype: Int64

In [48]:
jb2 = jb[uniq_cols]
age_slice = jb.age.str.slice(0, 2)
age_float = age_slice.astype(float)
age_int = age_float.astype('Int64')
jb2['age'] = age_int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jb2['age'] = age_int


In [49]:
jb2 = jb[uniq_cols]
age_slice = jb.age.str.slice(0, 2)
age_float = age_slice.astype(float)
age_int = age_float.astype('Int64')
jb2['age'] = age_int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jb2['age'] = age_int


In [50]:
(jb
 [uniq_cols]
 .rename(columns=lambda c: c.replace('.', '_'))
 .assign(age=lambda df_:df_.age.str.slice(0,2)
                           .astype(float).astype('Int64'),
         are_you_datascientist=lambda df_: df_.are_you_datascientist
            .replace({'Yes': True, 'No': False, np.nan: False})
        )
 .are_you_datascientist
)

0        False
1         True
2        False
3        False
4        False
         ...  
54457    False
54458    False
54459    False
54460     True
54461    False
Name: are_you_datascientist, Length: 54462, dtype: object

In [51]:
(jb
 [uniq_cols]
 .rename(columns=lambda c: c.replace('.', '_'))
 .assign(age=lambda df_:df_.age.str.slice(0,2)
                           .astype(float).astype('Int64'),
        are_you_datascientist=lambda df_: df_.are_you_datascientist
            .replace({'Yes': True, 'No': False, np.nan: False})
        )
 .company_size
 .value_counts(dropna=False)
)

company_size
NaN                35037
51–500              4608
More than 5,000     3635
11–50               3507
2–10                2558
1,001–5,000         1934
Just me             1492
501–1,000           1165
Not sure             526
Name: count, dtype: int64

In [52]:
jb2 = (jb
 [uniq_cols]
 .rename(columns=lambda c: c.replace('.', '_'))
 .assign(age=lambda df_:df_.age.str.slice(0,2).astype(float)
             .astype('Int64'),
         are_you_datascientist=lambda df_:df_.are_you_datascientist
             .replace({'Yes': True, 'No': False, np.nan: False}),
         company_size=lambda df_:df_.company_size.replace({
             'Just me': 1, 'Not sure': np.nan, 
             'More than 5,000': 5000, '2–10': 2, '11–50':11,
             '51–500': 51, '501–1,000':501,
             '1,001–5,000':1001}).astype('Int64'),
         country_live=lambda df_:df_.country_live.astype('category'),
         employment_status=lambda df_:df_.employment_status
              .fillna('Other').astype('category'),
         is_python_main=lambda df_:df_.is_python_main
              .astype('category'),
         team_size=lambda df_:df_.team_size
             .str.split(r'-', n=1, expand=True)
             .iloc[:,0].replace('More than 40 people', 41)
             .where(df_.company_size!=1, 1).astype(float),
         years_of_coding=lambda df_:df_.years_of_coding
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python_years=lambda df_:df_.python_years
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python3_ver=lambda df_:df_.python3_version_most
              .str.replace('_', '.').str.extract(r'(\d\.\d)')
              .astype(float),
         use_python_most=lambda df_:df_.use_python_most
              .fillna('Unknown')
        )
    .drop(columns=['python2_version_most'])
)

  company_size=lambda df_:df_.company_size.replace({


In [53]:
jb2

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
0,30,False,1,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Unknown,1.0,3.7
1,21,True,5000,India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2.0,Software prototyping,3.0,3.6
2,30,False,5000,United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3.0,3.6
3,,False,,,Other,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11.0,Python 3_8,"Yes, I work on many different projects",1.0,Web development,11.0,3.8
4,21,False,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",1.0,Web development,,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21,False,2,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",,Data analysis,1.0,3.6
54458,,False,,,Other,,,,Yes,,Both for work and personal,,,3.0,Python 3_7,,1.0,Web development,1.0,3.7
54459,21,False,1,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Web development,6.0,3.7
54460,30,True,51,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",,Data analysis,3.0,3.7


In [54]:
(jb2
 .query('team_size.isna()')
 .employment_status
 .value_counts(dropna=False)
)

employment_status
Fully employed by a company / organization                                                        5279
Working student                                                                                    696
Partially employed by a company / organization                                                     482
Self-employed (a person earning income directly from one's own business, trade, or profession)     430
Freelancer (a person pursuing a profession without a long-term commitment to any one employer)       0
Other                                                                                                0
Retired                                                                                              0
Student                                                                                              0
Name: count, dtype: int64

In [55]:
jb2

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
0,30,False,1,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Unknown,1.0,3.7
1,21,True,5000,India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2.0,Software prototyping,3.0,3.6
2,30,False,5000,United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3.0,3.6
3,,False,,,Other,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11.0,Python 3_8,"Yes, I work on many different projects",1.0,Web development,11.0,3.8
4,21,False,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",1.0,Web development,,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21,False,2,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",,Data analysis,1.0,3.6
54458,,False,,,Other,,,,Yes,,Both for work and personal,,,3.0,Python 3_7,,1.0,Web development,1.0,3.7
54459,21,False,1,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Web development,6.0,3.7
54460,30,True,51,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",,Data analysis,3.0,3.7


## Now! Let's delve in

#### A Basic Example

In [59]:
# Pivot Table Procedure
(jb2
 .pivot_table(index='country_live', columns='employment_status',
              values='age', aggfunc='mean', observed=False)
)

employment_status,Freelancer (a person pursuing a profession without a long-term commitment to any one employer),Fully employed by a company / organization,Other,Partially employed by a company / organization,Retired,"Self-employed (a person earning income directly from one's own business, trade, or profession)",Student,Working student
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Algeria,33.285714,28.344828,43.333333,25.181818,60.0,28.411765,20.4375,21.0
Argentina,34.954545,30.209524,40.0,30.25,55.0,29.571429,22.913043,23.2
Armenia,30.0,23.842105,60.0,24.0,40.0,24.0,,
Australia,30.818182,33.074468,33.1,32.0,48.285714,39.21875,24.828571,22.470588
Austria,35.857143,31.318182,30.0,29.0,60.0,34.615385,20.842105,23.8125
...,...,...,...,...,...,...,...,...
United States,34.318471,32.343223,35.534884,25.894737,54.920635,38.446512,21.947791,22.89375
Uruguay,30.0,28.909091,,36.2,,30.0,20.0,
Uzbekistan,36.0,22.125,29.0,21.0,,30.5,19.0,21.0
Venezuela,29.888889,27.826087,30.5,26.8,55.0,28.75,20.454545,28.833333


In [60]:
# CrossTab Procedure
pd.crosstab(index=jb2.country_live, columns=jb2.employment_status,
    values=jb2.age, aggfunc='mean')

employment_status,Freelancer (a person pursuing a profession without a long-term commitment to any one employer),Fully employed by a company / organization,Other,Partially employed by a company / organization,Retired,"Self-employed (a person earning income directly from one's own business, trade, or profession)",Student,Working student
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Algeria,33.285714,28.344828,43.333333,25.181818,60.0,28.411765,20.4375,21.0
Argentina,34.954545,30.209524,40.0,30.25,55.0,29.571429,22.913043,23.2
Armenia,30.0,23.842105,60.0,24.0,40.0,24.0,,
Australia,30.818182,33.074468,33.1,32.0,48.285714,39.21875,24.828571,22.470588
Austria,35.857143,31.318182,30.0,29.0,60.0,34.615385,20.842105,23.8125
...,...,...,...,...,...,...,...,...
United States,34.318471,32.343223,35.534884,25.894737,54.920635,38.446512,21.947791,22.89375
Uruguay,30.0,28.909091,,36.2,,30.0,20.0,
Uzbekistan,36.0,22.125,29.0,21.0,,30.5,19.0,21.0
Venezuela,29.888889,27.826087,30.5,26.8,55.0,28.75,20.454545,28.833333


In [62]:
# Group By Procedure 
(jb2
 .groupby(['country_live', 'employment_status'], observed=False)
 .age
 .mean()
 .unstack()
)

employment_status,Freelancer (a person pursuing a profession without a long-term commitment to any one employer),Fully employed by a company / organization,Other,Partially employed by a company / organization,Retired,"Self-employed (a person earning income directly from one's own business, trade, or profession)",Student,Working student
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Algeria,33.285714,28.344828,43.333333,25.181818,60.0,28.411765,20.4375,21.0
Argentina,34.954545,30.209524,40.0,30.25,55.0,29.571429,22.913043,23.2
Armenia,30.0,23.842105,60.0,24.0,40.0,24.0,,
Australia,30.818182,33.074468,33.1,32.0,48.285714,39.21875,24.828571,22.470588
Austria,35.857143,31.318182,30.0,29.0,60.0,34.615385,20.842105,23.8125
...,...,...,...,...,...,...,...,...
United States,34.318471,32.343223,35.534884,25.894737,54.920635,38.446512,21.947791,22.89375
Uruguay,30.0,28.909091,,36.2,,30.0,20.0,
Uzbekistan,36.0,22.125,29.0,21.0,,30.5,19.0,21.0
Venezuela,29.888889,27.826087,30.5,26.8,55.0,28.75,20.454545,28.833333


#### Using a Custom Aggregation Function

In [63]:
# Setting up Function
def per_emacs(ser):
    return ser.str.containts('Emacs').sum() / len(ser) * 100

In [64]:
def per_emacs(ser):
    return ser.str.contains('Emacs').mean() * 100

In [66]:
(jb2
 .pivot_table(index='country_live', values='ide_main', aggfunc=per_emacs, observed=False)
)

Unnamed: 0_level_0,ide_main
country_live,Unnamed: 1_level_1
Algeria,0.000000
Argentina,4.347826
Armenia,0.000000
Australia,3.000000
Austria,1.648352
...,...
United States,4.187386
Uruguay,0.000000
Uzbekistan,0.000000
Venezuela,0.000000


In [68]:
(jb2
 .pivot_table(index='country_live', values='ide_main', aggfunc=per_emacs, observed=False)
)

Unnamed: 0_level_0,ide_main
country_live,Unnamed: 1_level_1
Algeria,0.000000
Argentina,4.347826
Armenia,0.000000
Australia,3.000000
Austria,1.648352
...,...
United States,4.187386
Uruguay,0.000000
Uzbekistan,0.000000
Venezuela,0.000000


In [69]:
pd.crosstab(index=jb2.country_live, 
    columns=jb2.assign(iden='emacs_per').iden,
    values=jb2.ide_main, aggfunc=per_emacs)

iden,emacs_per
country_live,Unnamed: 1_level_1
Algeria,0.000000
Argentina,4.347826
Armenia,0.000000
Australia,3.000000
Austria,1.648352
...,...
United States,4.187386
Uruguay,0.000000
Uzbekistan,0.000000
Venezuela,0.000000


In [73]:
(jb2
 .groupby('country_live', observed=False)
 [['ide_main']]
 .agg(per_emacs)
)

Unnamed: 0_level_0,ide_main
country_live,Unnamed: 1_level_1
Algeria,0.000000
Argentina,4.347826
Armenia,0.000000
Australia,3.000000
Austria,1.648352
...,...
United States,4.187386
Uruguay,0.000000
Uzbekistan,0.000000
Venezuela,0.000000


#### Multiple Aggregations

In [76]:
(jb2
    .pivot_table(index='country_live', values='age',
                 aggfunc=('min', 'max'), observed=False)
)

Unnamed: 0_level_0,max,min
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,60,18
Argentina,60,18
Armenia,60,18
Australia,60,18
Austria,60,18
...,...,...
United States,60,18
Uruguay,60,18
Uzbekistan,60,18
Venezuela,60,18


In [79]:
(jb2
 .groupby('country_live', observed=True)
 .age
 .agg(['min', 'max'])
)

Unnamed: 0_level_0,min,max
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,18,60
Argentina,18,60
Armenia,18,60
Australia,18,60
Austria,18,60
...,...,...
United States,18,60
Uruguay,18,60
Uzbekistan,18,60
Venezuela,18,60


In [81]:
pd.crosstab(jb2.country_live, values=jb2.age, aggfunc=('min', 'max'),
    columns=jb2.assign(val='age').val)

Unnamed: 0_level_0,max,min
val,age,age
country_live,Unnamed: 1_level_2,Unnamed: 2_level_2
Algeria,60,18
Argentina,60,18
Armenia,60,18
Australia,60,18
Austria,60,18
...,...,...
United States,60,18
Uruguay,60,18
Uzbekistan,60,18
Venezuela,60,18


#### Per Column Aggregations

In [89]:
# (jb2
#  .pivot_table(index='country_live', 
#               aggfunc=(min, max))
# )

In [91]:
# (jb2
#  .groupby('country_live')
#  .agg([min, max])
# )

In [94]:
(jb2
 .pivot_table(index='country_live', observed=False,
              aggfunc={'age': ['min', 'max'],
                       'team_size': 'mean'})
)

Unnamed: 0_level_0,age,age,team_size
Unnamed: 0_level_1,max,min,mean
country_live,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Algeria,60,18,2.428571
Argentina,60,18,3.192053
Armenia,60,18,6.076923
Australia,60,18,2.710884
Austria,60,18,2.448000
...,...,...,...
United States,60,18,3.391337
Uruguay,60,18,4.692308
Uzbekistan,60,18,1.160000
Venezuela,60,18,1.812500


In [96]:
(jb2
 .groupby('country_live', observed=False)
 .agg({'age': ['min', 'max'],
      'team_size': 'mean'})
)

Unnamed: 0_level_0,age,age,team_size
Unnamed: 0_level_1,min,max,mean
country_live,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Algeria,18,60,2.428571
Argentina,18,60,3.192053
Armenia,18,60,6.076923
Australia,18,60,2.710884
Austria,18,60,2.448000
...,...,...,...
United States,18,60,3.391337
Uruguay,18,60,4.692308
Uzbekistan,18,60,1.160000
Venezuela,18,60,1.812500


In [99]:
(jb2
 .groupby('country_live', observed=False)
 .agg(age_min=('age', 'min'),
      age_max=('age', 'max'),
      team_size_mean=('team_size', 'mean')
     )
)

Unnamed: 0_level_0,age_min,age_max,team_size_mean
country_live,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,18,60,2.428571
Argentina,18,60,3.192053
Armenia,18,60,6.076923
Australia,18,60,2.710884
Austria,18,60,2.448000
...,...,...,...
United States,18,60,3.391337
Uruguay,18,60,4.692308
Uzbekistan,18,60,1.160000
Venezuela,18,60,1.812500


#### Grouping by Hierarchy

In [103]:
(jb2.pivot_table(index=['country_live', 'ide_main'], observed=True, 
  values='age', aggfunc=['min', 'max']))

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,age
country_live,ide_main,Unnamed: 2_level_2,Unnamed: 3_level_2
Algeria,Atom,18,60
Algeria,Eclipse + Pydev,18,30
Algeria,IDLE,18,50
Algeria,IntelliJ IDEA,21,21
Algeria,Jupyter Notebook,21,30
...,...,...,...
Viet Nam,PyCharm Community Edition,21,30
Viet Nam,PyCharm Professional Edition,18,30
Viet Nam,Spyder,21,21
Viet Nam,VS Code,18,30


In [111]:
(jb2
 .groupby(by=['country_live', 'ide_main'], observed=True,)
 [['age']]
 .agg(['min', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max
country_live,ide_main,Unnamed: 2_level_2,Unnamed: 3_level_2
Algeria,Atom,18,60
Algeria,Eclipse + Pydev,18,30
Algeria,IDLE,18,50
Algeria,IntelliJ IDEA,21,21
Algeria,Jupyter Notebook,21,30
...,...,...,...
Viet Nam,PyCharm Community Edition,21,30
Viet Nam,PyCharm Professional Edition,18,30
Viet Nam,Spyder,21,21
Viet Nam,VS Code,18,30


In [113]:
(jb2
 .groupby(by=['country_live', 'ide_main'], observed=False)
 [['age']]
 .agg(['min', 'max'])
 .swaplevel(axis='columns')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,age
country_live,ide_main,Unnamed: 2_level_2,Unnamed: 3_level_2
Algeria,Atom,18,60
Algeria,Eclipse + Pydev,18,30
Algeria,Emacs,,
Algeria,IDLE,18,50
Algeria,IntelliJ IDEA,21,21
...,...,...,...
Viet Nam,Python Tools for Visual Studio (PTVS),,
Viet Nam,Spyder,21,21
Viet Nam,Sublime Text,,
Viet Nam,VS Code,18,30


In [115]:
(jb2
 .groupby(by=['country_live', 'ide_main'], observed=False()
 .agg(age_min=('age', 'min'), age_max=('age', 'max'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,age_min,age_max
country_live,ide_main,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,Atom,18,60
Algeria,Eclipse + Pydev,18,30
Algeria,Emacs,,
Algeria,IDLE,18,50
Algeria,IntelliJ IDEA,21,21
...,...,...,...
Viet Nam,Python Tools for Visual Studio (PTVS),,
Viet Nam,Spyder,21,21
Viet Nam,Sublime Text,,
Viet Nam,VS Code,18,30


In [117]:
(jb2
 .groupby(by=['country_live', 'ide_main'], observed=True)
 .agg(age_min=('age', 'min'), age_max=('age', 'max'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,age_min,age_max
country_live,ide_main,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,Atom,18,60
Algeria,Eclipse + Pydev,18,30
Algeria,IDLE,18,50
Algeria,IntelliJ IDEA,21,21
Algeria,Jupyter Notebook,21,30
...,...,...,...
Viet Nam,PyCharm Community Edition,21,30
Viet Nam,PyCharm Professional Edition,18,30
Viet Nam,Spyder,21,21
Viet Nam,VS Code,18,30


#### Grouping with Functions

In [118]:
def even_grouper(idx):
    return 'odd' if idx % 2 else 'even'

In [119]:
jb2.pivot_table(index=even_grouper, aggfunc='size')

even    27231
odd     27231
dtype: int64

In [120]:
(jb2
 .groupby(even_grouper)
 .size()
)

even    27231
odd     27231
dtype: int64