**Basics of data analysis: Part 5**
# Summarise the data - Pivot Like Excel
- Using `groupby` - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html
- Using `pivot_table` - https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

In [1]:
import pandas as pd

In [2]:
# Relative path: resolved against the kernel's current working directory.
SLEEP_CSV = "Data_From_Kaggle - Sleep_health_and_lifestyle_dataset.csv"
sleep = pd.read_csv(SLEEP_CSV)

**Shape of the data**

In [3]:
# (number of rows, number of columns) of the loaded frame.
sleep.shape

(374, 13)

**Column names**

In [4]:
# Index holding the column labels of the frame.
sleep.columns

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')

In [5]:
# Prints, per column, the non-null count and dtype, followed by memory usage.
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           374 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


**Check the first few rows**

In [6]:
# Preview the first 10 rows.
sleep.head(10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia
6,7,Male,29,Teacher,6.3,6,40,7,Obese,140/90,82,3500,Insomnia
7,8,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
8,9,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
9,10,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,


In [7]:
# Cast Person ID from integer to string (presumably so it is treated as a
# label rather than a quantity - confirm intent). All other columns are untouched.
sleep = sleep.astype({'Person ID': 'str'})

# Pivot Like Excel

## Group by on multiple columns and pivot

In [8]:
# Mean sleep duration for every (Occupation, BMI Category) pair, rounded to
# 2 decimals. The result is in long format: one row per pair, with the group
# keys restored as ordinary columns by reset_index().
df_grp = (
    sleep
    .groupby(['Occupation', 'BMI Category'])
    .agg(sleep_duration_mean=('Sleep Duration', 'mean'))
    .round(2)
    .reset_index()
)

df_grp

Unnamed: 0,Occupation,BMI Category,sleep_duration_mean
0,Accountant,Normal,7.16
1,Accountant,Normal Weight,7.52
2,Accountant,Overweight,6.57
3,Doctor,Normal,6.89
4,Doctor,Normal Weight,8.2
5,Doctor,Obese,7.7
6,Engineer,Normal,8.08
7,Engineer,Normal Weight,7.45
8,Engineer,Overweight,6.97
9,Lawyer,Normal,7.44


In [9]:
# Reshape long -> wide: one row per occupation, one column per BMI category.
# pivot() only reshapes (it does not aggregate); df_grp comes from a groupby
# on exactly these two keys, so each (row, column) cell maps to a single value.
df_grp.pivot(
    values='sleep_duration_mean',
    index='Occupation',
    columns='BMI Category',
)

BMI Category,Normal,Normal Weight,Obese,Overweight
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accountant,7.16,7.52,,6.57
Doctor,6.89,8.2,7.7,
Engineer,8.08,7.45,,6.97
Lawyer,7.44,6.9,7.4,7.1
Manager,,,,6.9
Nurse,,6.9,,7.08
Sales Representative,,,5.9,
Salesperson,,,,6.4
Scientist,,,,6.0
Software Engineer,,7.5,5.9,6.1


## Pivot table

In [12]:
# pivot_table groups, aggregates and reshapes in a single call, starting from
# the raw frame. Combinations with no rows show up as NaN.
(
    sleep
    .pivot_table(
        values='Sleep Duration',
        index='Occupation',
        columns='BMI Category',
        aggfunc='mean',
    )
    .round(2)
)

BMI Category,Normal,Normal Weight,Obese,Overweight
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accountant,7.16,7.52,,6.57
Doctor,6.89,8.2,7.7,
Engineer,8.08,7.45,,6.97
Lawyer,7.44,6.9,7.4,7.1
Manager,,,,6.9
Nurse,,6.9,,7.08
Sales Representative,,,5.9,
Salesperson,,,,6.4
Scientist,,,,6.0
Software Engineer,,7.5,5.9,6.1


## Pivot table with multiple aggregation functions

In [13]:
# Two aggregations (mean and count) of the same value column.
# `values` is deliberately passed as a one-element list here: that keeps the
# value-column name as its own level in the resulting column MultiIndex.
sleep_pivot = (
    sleep
    .pivot_table(
        values=['Sleep Duration'],
        index='Occupation',
        columns='BMI Category',
        aggfunc=['mean', 'count'],
    )
    .round(2)
    .reset_index()
)

sleep_pivot

Unnamed: 0_level_0,Occupation,mean,mean,mean,mean,count,count,count,count
Unnamed: 0_level_1,Unnamed: 1_level_1,Sleep Duration,Sleep Duration,Sleep Duration,Sleep Duration,Sleep Duration,Sleep Duration,Sleep Duration,Sleep Duration
BMI Category,Unnamed: 1_level_2,Normal,Normal Weight,Obese,Overweight,Normal,Normal Weight,Obese,Overweight
0,Accountant,7.16,7.52,,6.57,26.0,5.0,,6.0
1,Doctor,6.89,8.2,7.7,,65.0,2.0,4.0,
2,Engineer,8.08,7.45,,6.97,56.0,4.0,,3.0
3,Lawyer,7.44,6.9,7.4,7.1,42.0,1.0,2.0,2.0
4,Manager,,,,6.9,,,,1.0
5,Nurse,,6.9,,7.08,,7.0,,66.0
6,Sales Representative,,,5.9,,,,2.0,
7,Salesperson,,,,6.4,,,,32.0
8,Scientist,,,,6.0,,,,4.0
9,Software Engineer,,7.5,5.9,6.1,,2.0,1.0,1.0


In [14]:
# Inspect the column MultiIndex: three levels
# (aggregation function, value column, BMI Category).
sleep_pivot.columns

MultiIndex([('Occupation',               '',              ''),
            (      'mean', 'Sleep Duration',        'Normal'),
            (      'mean', 'Sleep Duration', 'Normal Weight'),
            (      'mean', 'Sleep Duration',         'Obese'),
            (      'mean', 'Sleep Duration',    'Overweight'),
            (     'count', 'Sleep Duration',        'Normal'),
            (     'count', 'Sleep Duration', 'Normal Weight'),
            (     'count', 'Sleep Duration',         'Obese'),
            (     'count', 'Sleep Duration',    'Overweight')],
           names=[None, None, 'BMI Category'])

In [15]:
# Same mean/count pivot, but `values` is a plain string this time and the
# missing combinations are replaced with 0.
# NOTE(review): fillna(0) is applied to the whole frame, so a 0.0 in a 'mean'
# column means "no rows for this combination", not zero hours of sleep.
sleep_pivot2 = (
    sleep
    .pivot_table(
        values='Sleep Duration',
        index='Occupation',
        columns='BMI Category',
        aggfunc=['mean', 'count'],
    )
    .round(2)
    .reset_index()
    .fillna(0)
)

sleep_pivot2

Unnamed: 0_level_0,Occupation,mean,mean,mean,mean,count,count,count,count
BMI Category,Unnamed: 1_level_1,Normal,Normal Weight,Obese,Overweight,Normal,Normal Weight,Obese,Overweight
0,Accountant,7.16,7.52,0.0,6.57,26.0,5.0,0.0,6.0
1,Doctor,6.89,8.2,7.7,0.0,65.0,2.0,4.0,0.0
2,Engineer,8.08,7.45,0.0,6.97,56.0,4.0,0.0,3.0
3,Lawyer,7.44,6.9,7.4,7.1,42.0,1.0,2.0,2.0
4,Manager,0.0,0.0,0.0,6.9,0.0,0.0,0.0,1.0
5,Nurse,0.0,6.9,0.0,7.08,0.0,7.0,0.0,66.0
6,Sales Representative,0.0,0.0,5.9,0.0,0.0,0.0,2.0,0.0
7,Salesperson,0.0,0.0,0.0,6.4,0.0,0.0,0.0,32.0
8,Scientist,0.0,0.0,0.0,6.0,0.0,0.0,0.0,4.0
9,Software Engineer,0.0,7.5,5.9,6.1,0.0,2.0,1.0,1.0


In [16]:
# Inspect the column MultiIndex: two levels (aggregation function, BMI Category).
sleep_pivot2.columns

MultiIndex([('Occupation',              ''),
            (      'mean',        'Normal'),
            (      'mean', 'Normal Weight'),
            (      'mean',         'Obese'),
            (      'mean',    'Overweight'),
            (     'count',        'Normal'),
            (     'count', 'Normal Weight'),
            (     'count',         'Obese'),
            (     'count',    'Overweight')],
           names=[None, 'BMI Category'])

In [17]:
# Build the mean/count pivot, then collapse the two-level column labels
# (aggregation function, BMI Category) into single strings such as
# "mean of Normal", giving a table with one flat header row.
# Everything is done in one non-mutating chain: no `inplace=True` and no
# reassignment of `.columns` on an already-created frame.
pivot_table = (
    sleep
    .pivot_table(
        values='Sleep Duration',
        index='Occupation',
        columns='BMI Category',
        aggfunc=['mean', 'count'],
    )
    .round(2)
    # Flatten while 'Occupation' is still the index, so every column label is
    # a (stat, category) pair that unpacks cleanly in the comprehension.
    .pipe(lambda wide: wide.set_axis(
        [f'{stat} of {col}' for stat, col in wide.columns],
        axis='columns',
    ))
    # Bring 'Occupation' back as a regular column.
    .reset_index()
)

pivot_table


Unnamed: 0,Occupation,mean of Normal,mean of Normal Weight,mean of Obese,mean of Overweight,count of Normal,count of Normal Weight,count of Obese,count of Overweight
0,Accountant,7.16,7.52,,6.57,26.0,5.0,,6.0
1,Doctor,6.89,8.2,7.7,,65.0,2.0,4.0,
2,Engineer,8.08,7.45,,6.97,56.0,4.0,,3.0
3,Lawyer,7.44,6.9,7.4,7.1,42.0,1.0,2.0,2.0
4,Manager,,,,6.9,,,,1.0
5,Nurse,,6.9,,7.08,,7.0,,66.0
6,Sales Representative,,,5.9,,,,2.0,
7,Salesperson,,,,6.4,,,,32.0
8,Scientist,,,,6.0,,,,4.0
9,Software Engineer,,7.5,5.9,6.1,,2.0,1.0,1.0
