In [1]:
import pandas as pd
import numpy as np

### Tidying when multiple variables are stored as column names

In [2]:
# Load the men's weightlifting table. The path is relative, so it is resolved
# against the kernel's current working directory.
weightlifting = pd.read_csv(
    filepath_or_buffer="../python_cookbook/data/weightlifting_men.csv"
)
# Bare trailing expression so the notebook renders the frame.
weightlifting

Unnamed: 0,Weight Category,M35 35-39,M40 40-44,M45 45-49,M50 50-54,M55 55-59,M60 60-64,M65 65-69,M70 70-74,M75 75-79,M80 80+
0,56,137,130,125,115,102,92,80,67,62,55
1,62,152,145,137,127,112,102,90,75,67,57
2,69,167,160,150,140,125,112,97,82,75,60
3,77,182,172,165,150,135,122,107,90,82,65
4,85,192,182,175,160,142,130,112,95,87,70
5,94,202,192,182,167,150,137,120,100,90,75
6,105,210,200,190,175,157,142,122,102,95,80
7,105+,217,207,197,182,165,150,127,107,100,85


In [3]:
# Wide -> long: "Weight Category" stays as the identifier column, every other
# column is stacked; the former column labels land in "sex_age" and the cell
# values in "Qual Total".
pd.melt(
    weightlifting,
    id_vars="Weight Category",
    var_name="sex_age",
    value_name="Qual Total",
)

Unnamed: 0,Weight Category,sex_age,Qual Total
0,56,M35 35-39,137
1,62,M35 35-39,152
2,69,M35 35-39,167
3,77,M35 35-39,182
4,85,M35 35-39,192
...,...,...,...
75,77,M80 80+,65
76,85,M80 80+,70
77,94,M80 80+,75
78,105,M80 80+,80


In [4]:
# Melt to long form, keep only the combined "sex_age" labels, and break each
# label apart on whitespace. expand=True spreads the pieces over separate
# columns (labelled 0 and 1 here).
(
    weightlifting
    .melt(id_vars="Weight Category", var_name="sex_age", value_name="Qual Total")
    .loc[:, "sex_age"]
    .str.split(pat=None, expand=True)
)

Unnamed: 0,0,1
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39
...,...,...
75,M80,80+
76,M80,80+
77,M80,80+
78,M80,80+


In [5]:
# Melt, split the "sex_age" labels on whitespace, then give the two resulting
# positional columns (0 and 1) descriptive names.
(
    weightlifting
    .melt(id_vars="Weight Category", var_name="sex_age", value_name="Qual Total")
    .sex_age
    .str.split(expand=True)
    .rename({0: "Sex", 1: "Age Group"}, axis="columns")
)

Unnamed: 0,Sex,Age Group
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39
...,...,...
75,M80,80+
76,M80,80+
77,M80,80+
78,M80,80+


In [6]:
# Melt, split "sex_age" into "Sex" / "Age Group", then reduce "Sex" to its
# first character (e.g. "M35" -> "M").
(
    weightlifting
    .melt(id_vars="Weight Category", var_name="sex_age", value_name="Qual Total")
    .loc[:, "sex_age"]
    .str.split(expand=True)
    .rename(columns={0: "Sex", 1: "Age Group"})
    # The lambda receives the renamed frame, so "Sex" already exists here.
    .assign(Sex=lambda parts: parts["Sex"].str.get(0))
)

Unnamed: 0,Sex,Age Group
0,M,35-39
1,M,35-39
2,M,35-39
3,M,35-39
4,M,35-39
...,...,...
75,M,80+
76,M,80+
77,M,80+
78,M,80+


In [7]:
# Long-form table: one row per (weight category, original column label) pair.
melted = pd.melt(
    weightlifting,
    id_vars="Weight Category",
    var_name="sex_age",
    value_name="Qual Total",
)

# Glue two pieces together side by side (they share melted's row index):
#   1. "sex_age" split into "Sex" (first character only) and "Age Group";
#   2. the untouched "Weight Category" and "Qual Total" columns.
tidy = pd.concat(
    [
        (
            melted
            .loc[:, "sex_age"]
            .str.split(expand=True)
            .rename(columns={0: "Sex", 1: "Age Group"})
            .assign(Sex=lambda parts: parts["Sex"].str.get(0))
        ),
        melted.loc[:, ["Weight Category", "Qual Total"]],
    ],
    axis=1,
)
tidy

Unnamed: 0,Sex,Age Group,Weight Category,Qual Total
0,M,35-39,56,137
1,M,35-39,62,152
2,M,35-39,69,167
3,M,35-39,77,182
4,M,35-39,85,192
...,...,...,...,...
75,M,80+,77,65
76,M,80+,85,70
77,M,80+,94,75
78,M,80+,105,80


In [8]:
# Long-form table: one row per (weight category, original column label) pair.
melted = pd.melt(
    weightlifting,
    id_vars="Weight Category",
    var_name="sex_age",
    value_name="Qual Total",
)

# Alternative to concatenation: start from the split "sex_age" pieces and
# attach the remaining columns with assign, which aligns them on the row index
# and lets them be renamed ("Category", "Total") in the same step.
(
    melted
    .loc[:, "sex_age"]
    .str.split(expand=True)
    .rename(columns={0: "Sex", 1: "Age Group"})
    .assign(
        Sex=lambda parts: parts["Sex"].str.get(0),
        Category=melted["Weight Category"],
        Total=melted["Qual Total"],
    )
)

Unnamed: 0,Sex,Age Group,Category,Total
0,M,35-39,56,137
1,M,35-39,62,152
2,M,35-39,69,167
3,M,35-39,77,182
4,M,35-39,85,192
...,...,...,...,...
75,M,80+,77,65
76,M,80+,85,70
77,M,80+,94,75
78,M,80+,105,80


In [9]:
# Single-chain variant: derive both new columns directly from "sex_age" and
# then discard it.
#   * "Sex"       -> first character of the label.
#   * "Age Group" -> regex capture: two digits, a "-" or "+", and optionally
#                    two more digits (covers "35-39" as well as "80+").
# "Age Group" contains a space and so cannot be written as a keyword argument;
# the columns are passed through a dict unpacked into assign instead.
(
    weightlifting
    .melt(id_vars="Weight Category", var_name="sex_age", value_name="Qual Total")
    .assign(**{
        "Sex": lambda long_: long_["sex_age"].str.get(0),
        "Age Group": lambda long_: long_["sex_age"].str.extract(
            r'(\d{2}[-+](?:\d{2})?)', expand=False
        ),
    })
    .drop("sex_age", axis="columns")
)

Unnamed: 0,Weight Category,Qual Total,Sex,Age Group
0,56,137,M,35-39
1,62,152,M,35-39
2,69,167,M,35-39
3,77,182,M,35-39
4,85,192,M,35-39
...,...,...,...,...
75,77,65,M,80+
76,85,70,M,80+
77,94,75,M,80+
78,105,80,M,80+


In [10]:
# Same single-chain approach, but the age column is named "Age_Group" (a valid
# Python identifier) so it can be passed to assign as a plain keyword.
# The regex captures two digits, a "-" or "+", and optionally two more digits.
(
    weightlifting
    .melt(id_vars="Weight Category", var_name="sex_age", value_name="Qual Total")
    .assign(
        Sex=lambda long_: long_["sex_age"].str.get(0),
        Age_Group=lambda long_: long_["sex_age"].str.extract(
            r'(\d{2}[-+](?:\d{2})?)', expand=False
        ),
    )
    .drop("sex_age", axis="columns")
)

Unnamed: 0,Weight Category,Qual Total,Sex,Age_Group
0,56,137,M,35-39
1,62,152,M,35-39
2,69,167,M,35-39
3,77,182,M,35-39
4,85,192,M,35-39
...,...,...,...,...
75,77,65,M,80+
76,85,70,M,80+
77,94,75,M,80+
78,105,80,M,80+
