**Basics of data analysis : Part 2**

- columns - get the column names of the dataframe
- value_counts - Counts how many times each distinct value occurs in a column - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html
- sort_values - Sorts the dataframe - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

In [1]:
import pandas as pd

In [2]:
# Load the Kaggle sleep-health-and-lifestyle CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
# NOTE(review): info() below reports 'Sleep Disorder' as fully non-null, yet the
# displayed rows look empty in that column — confirm how "no disorder" is encoded
# in the file and whether the installed pandas version parses it as NaN.
sleep = pd.read_csv("Data_From_Kaggle - Sleep_health_and_lifestyle_dataset.csv")

**Shape of the data**

In [3]:
# (number of rows, number of columns) — a quick size check right after loading.
sleep.shape

(374, 13)

**Column names**

In [4]:
# Column labels, returned as a pandas Index object.
sleep.columns

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')

In [5]:
# Same labels as a plain Python list (handy for copy-pasting column names).
sleep.columns.to_list()

['Person ID',
 'Gender',
 'Age',
 'Occupation',
 'Sleep Duration',
 'Quality of Sleep',
 'Physical Activity Level',
 'Stress Level',
 'BMI Category',
 'Blood Pressure',
 'Heart Rate',
 'Daily Steps',
 'Sleep Disorder']

**Check the first few rows**

In [6]:
# Preview the first ten rows to sanity-check the load.
sleep.head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia
6,7,Male,29,Teacher,6.3,6,40,7,Obese,140/90,82,3500,Insomnia
7,8,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
8,9,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
9,10,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,


In [7]:
# Per-column dtype and non-null count, plus overall memory usage.
sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           374 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [8]:
# Overwrite 'Person ID' with its string form (it was loaded as int64).
# Presumably so the identifier is treated as a label rather than a quantity
# in later summaries — confirm intent.
sleep['Person ID'] = sleep['Person ID'].astype('str')

# Value Counts

- normalize : default False
- sort : default True
- ascending : default False
- bins : int, optional,  works with numeric data
- dropna :  default True (Don't include NaN counts)

In [9]:
# Default preview: the first five rows.
sleep.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


## Explore the data with value counts
- What is the distribution by Gender?
- Which is the most common occupation in the data?
- What % of records are Overweight?
- Which age group has the most records?

In [10]:
# Distribution by Gender: number of records per distinct value,
# largest count first (value_counts sorts descending by default).
gender_distribution = sleep['Gender'].value_counts()
gender_distribution

Male      189
Female    185
Name: Gender, dtype: int64

In [11]:
# Distribution by Occupation. Because the result is sorted by count in
# descending order, the first row is the most common occupation.
occupation_distribution = sleep['Occupation'].value_counts()
occupation_distribution

Nurse                   73
Doctor                  71
Engineer                63
Lawyer                  47
Teacher                 40
Accountant              37
Salesperson             32
Software Engineer        4
Scientist                4
Sales Representative     2
Manager                  1
Name: Occupation, dtype: int64

In [12]:
# Distribution by BMI Category as a percentage of all records.
# normalize=True returns fractions; scale to percent first and round last so
# the rounding applies to the value that is actually displayed (rounding the
# fraction and then multiplying can reintroduce floating-point noise).
# NOTE(review): the data carries both 'Normal' and 'Normal Weight' labels —
# presumably the same group; confirm and merge before quoting percentages.
bmi_distribution = (sleep['BMI Category'].value_counts(normalize=True) * 100).round(1)
bmi_distribution

Normal           52.1
Overweight       39.6
Normal Weight     5.6
Obese             2.7
Name: BMI Category, dtype: float64

In [14]:
# Distribution by BMI Category as a percentage, smallest share first
# (ascending=True reverses the default largest-first ordering).
# Scale to percent before rounding so the rounding applies to the displayed
# value and no floating-point noise is reintroduced by the multiplication.
bmi_distribution = (
    sleep['BMI Category'].value_counts(normalize=True, ascending=True) * 100
).round(1)
bmi_distribution

Obese             2.7
Normal Weight     5.6
Overweight       39.6
Normal           52.1
Name: BMI Category, dtype: float64

In [15]:
# Age distribution: bins=5 splits the observed Age range into five
# equal-width intervals and counts the records in each.
# The result is ordered by count, not by age; the top row is the busiest group.
age_distribution =  sleep['Age'].value_counts(bins = 5)
age_distribution

(39.8, 46.2]      103
(33.4, 39.8]       81
(26.967, 33.4]     80
(52.6, 59.0]       59
(46.2, 52.6]       51
Name: Age, dtype: int64

# Sort values
- by : column name or list of column names to sort by
- axis : default 0; 0 - index, 1 - columns
- ascending : default True
- inplace :  default False
- kind : default 'quicksort'
- na_position : default 'last'
- ignore_index : default False
- key : key function

In [16]:
# Re-list the column labels so the sort keys used below are easy to look up.
sleep.columns

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')

## Sort the data
- by Age
- by Sleep Duration
- by Quality of Sleep
- by Stress Level
- by Age, Stress Level, Sleep Duration

In [20]:
# Ascending sort on a single column. To try the other exercises, swap the
# `by` value for 'Age', 'Sleep Duration' or 'Quality of Sleep'.
# Rows with an equal key come back in no guaranteed order (the default
# kind='quicksort' is not stable); pass kind='stable' to keep the original
# row order among ties.
sleep.sort_values(by = 'Stress Level')

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
373,374,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
322,323,Female,53,Engineer,8.4,9,30,3,Normal,125/80,65,5000,
51,52,Male,32,Engineer,7.5,8,45,3,Normal,120/80,70,8000,
50,51,Male,32,Engineer,7.5,8,45,3,Normal,120/80,70,8000,
323,324,Female,53,Engineer,8.5,9,30,3,Normal,125/80,65,5000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,58,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
58,59,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
285,286,Female,50,Nurse,6.0,6,90,8,Overweight,140/95,75,10000,Sleep Apnea
52,53,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,


In [21]:
# Descending sort on a single column (ascending=False). To try the other
# exercises, swap the `by` value for 'Age', 'Sleep Duration' or
# 'Quality of Sleep'.
sleep.sort_values(by = 'Stress Level', ascending = False)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
64,65,Male,32,Doctor,6.2,6,30,8,Normal,125/80,72,5000,
62,63,Male,32,Doctor,6.2,6,30,8,Normal,125/80,72,5000,
60,61,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
270,271,Female,49,Nurse,6.1,6,90,8,Overweight,140/95,75,10000,Sleep Apnea
58,59,Male,32,Doctor,6.0,6,30,8,Normal,125/80,72,5000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,325,Female,53,Engineer,8.3,9,30,3,Normal,125/80,65,5000,
323,324,Female,53,Engineer,8.5,9,30,3,Normal,125/80,65,5000,
322,323,Female,53,Engineer,8.4,9,30,3,Normal,125/80,65,5000,
321,322,Female,53,Engineer,8.4,9,30,3,Normal,125/80,65,5000,


In [22]:
# Multi-key sort: rows are ordered by Age first; Stress Level breaks ties
# within an age, and Sleep Duration breaks ties within both. All ascending.
sleep.sort_values(by = ['Age', 'Stress Level', 'Sleep Duration'])

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,374,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
360,361,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
361,362,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
362,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [23]:
# Per-key direction: the `ascending` list lines up position-by-position with
# `by` — Age descending, Stress Level ascending, Sleep Duration descending.
sleep.sort_values(by = ['Age', 'Stress Level', 'Sleep Duration'], ascending = [False, True, False])

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
360,361,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
361,362,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
362,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
363,364,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
359,360,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia


## Storing the sorted dataframe

In [24]:
# Keep the sorted result under its own name; `sleep` itself is left untouched.
# A single bool applies to every sort key, so this is descending on all three.
sleep_sorted = sleep.sort_values(
    by=['Age', 'Stress Level', 'Sleep Duration'],
    ascending=False,
)

sleep_sorted

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
360,361,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
361,362,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
362,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
363,364,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
359,360,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia


In [25]:
# inplace=True reorders `sleep` itself and returns None, which is why the
# frame is displayed on a separate line. Every later cell sees this order
# until the index is sorted back; re-running earlier cells now shows
# different rows than their saved output.
sleep.sort_values(by = ['Age', 'Stress Level', 'Sleep Duration'], ascending = [False, False, False], inplace = True)

sleep

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
360,361,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
361,362,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
362,363,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
363,364,Female,59,Nurse,8.2,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
359,360,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
5,6,Male,28,Software Engineer,5.9,4,30,8,Obese,140/90,85,3000,Insomnia


## Sort Index : Sort the dataframe back to original order
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_index.html
- axis :  default 0
- ascending : default True
- inplace : default False
- ignore_index : default False; if True - the axis will be labeled again from 0

In [26]:
# The row labels travelled with their rows during the in-place sort, so the
# index is no longer in 0..n-1 order.
sleep.index

Int64Index([360, 361, 362, 363, 359, 366, 368, 369, 371, 372,
            ...
              7,   8,   9,  11,   1,   2,   3,   4,   5,   0],
           dtype='int64', length=374)

In [27]:
# Returns a new frame ordered by index label; `sleep` itself is not modified
# by this call.
sleep.sort_index()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [28]:
# Restore the original row order on `sleep` itself by sorting its index
# labels ascending in place (the call returns None, hence the separate
# display line).
sleep.sort_index(ascending = True, inplace = True)
sleep

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [29]:
# Sort along the column axis: columns are reordered alphabetically by label,
# rows keep their order. Returns a new frame; `sleep` is unchanged.
sleep.sort_index(axis='columns')

Unnamed: 0,Age,BMI Category,Blood Pressure,Daily Steps,Gender,Heart Rate,Occupation,Person ID,Physical Activity Level,Quality of Sleep,Sleep Disorder,Sleep Duration,Stress Level
0,27,Overweight,126/83,4200,Male,77,Software Engineer,1,42,6,,6.1,6
1,28,Normal,125/80,10000,Male,75,Doctor,2,60,6,,6.2,8
2,28,Normal,125/80,10000,Male,75,Doctor,3,60,6,,6.2,8
3,28,Obese,140/90,3000,Male,85,Sales Representative,4,30,4,Sleep Apnea,5.9,8
4,28,Obese,140/90,3000,Male,85,Sales Representative,5,30,4,Sleep Apnea,5.9,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,59,Overweight,140/95,7000,Female,68,Nurse,370,75,9,Sleep Apnea,8.1,3
370,59,Overweight,140/95,7000,Female,68,Nurse,371,75,9,Sleep Apnea,8.0,3
371,59,Overweight,140/95,7000,Female,68,Nurse,372,75,9,Sleep Apnea,8.1,3
372,59,Overweight,140/95,7000,Female,68,Nurse,373,75,9,Sleep Apnea,8.1,3
