# Sleep time analysis data cleaning

## Load libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
data1 = pd.read_csv('data/sleep_analysis_data.csv')
data2 = pd.read_csv('data/sleep_analysis_2_data.csv')

In [3]:
data = pd.concat([data1,data2])
data

Unnamed: 0,Age,Gender,meals/day,physical illness,screen time,bluelight filter,sleep direction,exercise,smoke/drink,beverage,sleep time
0,22,Male,two,no,2hrs,yes,west,sometimes,no,Tea,6.7575
1,22,Female,three,no,3-4 hrs,no,south,no,no,Coffee,8.0000
2,23,Male,three,no,3-4 hrs,no,south,no,no,Tea,8.0000
3,23,Female,two,no,1-2 hrs,no,east,sometimes,no,Coffee,6.5000
4,22,Male,three,no,more than 5,yes,east,sometimes,yes,Tea and Coffee both,6.0000
...,...,...,...,...,...,...,...,...,...,...,...
41,22,Male,three,no,3-4 hrs,no,east,sometimes,no,Tea,6.0000
42,21,Male,two,no,more than 5,no,east,sometimes,no,Tea,7.0000
43,24,Male,three,no,more than 5,yes,east,sometimes,no,none of the above,4.0000
44,25,Prefer not to say,three,yes,more than 5,yes,east,yes,yes,Tea and Coffee both,8.0000


## Checking for data type and null values

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92 entries, 0 to 45
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               92 non-null     int64  
 1   Gender            92 non-null     object 
 2   meals/day         92 non-null     object 
 3   physical illness  92 non-null     object 
 4   screen time       92 non-null     object 
 5   bluelight filter  92 non-null     object 
 6   sleep direction   92 non-null     object 
 7   exercise          92 non-null     object 
 8   smoke/drink       92 non-null     object 
 9   beverage          92 non-null     object 
 10  sleep time        92 non-null     float64
dtypes: float64(1), int64(1), object(9)
memory usage: 8.6+ KB


Null values are not present in our dataset.

## All the different values in each column

In [5]:
labels = list(data.columns)
def values_type(data,labels):
    for label in labels:
        print(label ,": ",data[label].unique())

In [6]:
values_type(data,labels)

Age :  [22 23 24 28 59 25 39 50 20 21]
Gender :  ['Male' 'Female' 'Prefer not to say']
meals/day :  ['two' 'three' 'four' 'more than 5' 'five' 'one']
physical illness :  ['no' 'yes']
screen time :  ['2hrs' '3-4 hrs' '1-2 hrs' 'more than 5' '2-3 hrs' '4-5 hrs' '0-1 hrs']
bluelight filter :  ['yes' 'no']
sleep direction :  ['west' 'south' 'east' 'north']
exercise :  ['sometimes' 'no' 'yes']
smoke/drink :  ['no' 'yes']
beverage :  ['Tea' 'Coffee' 'Tea and Coffee both' 'none of the above']
sleep time :  [6.7575 8.     6.5    6.     7.     1.5    5.     3.33   9.     4.    ]


## Round sleep time value to 2 decimals

In [7]:
data['sleep time'] = data['sleep time'].round(2)

## Adding dummy variables for categorical data

In [8]:
# Change Gender into dummy varables
columns = pd.get_dummies(data['Gender'])
data = pd.concat([data,columns],axis = 1)

In [9]:
# Change sleep direction into dummy varables
columns = pd.get_dummies(data['sleep direction'])
data = pd.concat([data,columns],axis = 1)

In [10]:
# Change exercise into dummy varables
columns = pd.get_dummies(data['exercise'])
data = pd.concat([data,columns],axis = 1)

In [11]:
# Change beverage into dummy varables
columns = pd.get_dummies(data['beverage'])
data = pd.concat([data,columns],axis = 1)

In [12]:
data['Tea'] = data['Tea'] | data['Tea and Coffee both']
data['Coffee'] = data['Coffee'] | data['Tea and Coffee both']

## Change column type from string to numerical

As of Q3 2023, on average users aged 16 to 64 worldwide spent 6 hours and 40 minutes per day on screens across various devices. That equals to 46 hours and 40 minutes for average screen time per week among worldwide internet users.

In [13]:
data['smoke/drink'] = data['smoke/drink'].apply(lambda x : 1 if x=='yes' else 0)
data['bluelight filter'] = data['bluelight filter'].apply(lambda x : 1 if x=='yes' else 0)
data['physical illness'] = data['physical illness'].apply(lambda x : 1 if x=='yes' else 0)
data['screen time'] = data['screen time'].apply(lambda x: 
    2 if x == '2hrs' else
    3.5 if x == '3-4hrs' else
    1.5 if x == '1-2 hrs' else
    6.67 if x == 'more than 5' else
    2.5 if x == '2-3 hrs' else
    4.5 if x == '4-5 hrs' else 
    0.5
)
data['meals/day'] = data['meals/day'].apply(lambda x: 
    1 if x == 'one' else
    2 if x == 'two' else
    3 if x == 'three' else
    4 if x == 'four' else
    5 if x == 'five' else
    6
)

## Drop the unnecessary columns

In [14]:
data.drop(labels=['Gender', 'sleep direction', 'exercise', 'beverage', 'Tea and Coffee both'], axis=1, inplace=True)

## Rename the unnecessary columns

In [15]:
data.rename(columns={
    'Age' : 'age',
    'Male' : 'male',
    'Female' : 'female',
    'none of the above': 'no_beverage',
    'sometimes':'seldom_exercise',
    'yes' : 'frequent_exercise',
    'no' : 'no_exercise',
    'Prefer not to say' : 'other_gender',
    'Tea' : 'tea',
    'Coffee' : 'coffee',
    'physical illness' : 'physical_illness',
    'bluelight filter' : 'bluelight_filter',
    'sleep time' : 'sleep_time',
    'screen time' : 'screen_time'
}, inplace=True)

In [16]:
data

Unnamed: 0,age,meals/day,physical_illness,screen_time,bluelight_filter,smoke/drink,sleep_time,female,male,other_gender,east,north,south,west,no_exercise,seldom_exercise,frequent_exercise,coffee,tea,no_beverage
0,22,2,0,2.00,1,0,6.76,False,True,False,False,False,False,True,False,True,False,False,True,False
1,22,3,0,0.50,0,0,8.00,True,False,False,False,False,True,False,True,False,False,True,False,False
2,23,3,0,0.50,0,0,8.00,False,True,False,False,False,True,False,True,False,False,False,True,False
3,23,2,0,1.50,0,0,6.50,True,False,False,True,False,False,False,False,True,False,True,False,False
4,22,3,0,6.67,1,1,6.00,False,True,False,True,False,False,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,22,3,0,0.50,0,0,6.00,False,True,False,True,False,False,False,False,True,False,False,True,False
42,21,2,0,6.67,0,0,7.00,False,True,False,True,False,False,False,False,True,False,False,True,False
43,24,3,0,6.67,1,0,4.00,False,True,False,True,False,False,False,False,True,False,False,False,True
44,25,3,1,6.67,1,1,8.00,False,False,True,True,False,False,False,False,False,True,True,True,False


## Rearrange columns

In [17]:
data.columns

Index(['age', 'meals/day', 'physical_illness', 'screen_time',
       'bluelight_filter', 'smoke/drink', 'sleep_time', 'female', 'male',
       'other_gender', 'east', 'north', 'south', 'west', 'no_exercise',
       'seldom_exercise', 'frequent_exercise', 'coffee', 'tea', 'no_beverage'],
      dtype='object')

In [22]:
data = data[['age', 'meals/day', 'physical_illness', 'screen_time',
       'bluelight_filter', 'smoke/drink', 'female', 'male',
       'other_gender', 'east', 'north', 'south', 'west', 'no_exercise',
       'seldom_exercise', 'frequent_exercise', 'coffee', 'tea',
       'no_beverage', 'sleep_time']]
data

Unnamed: 0,age,meals/day,physical_illness,screen_time,bluelight_filter,smoke/drink,female,male,other_gender,east,north,south,west,no_exercise,seldom_exercise,frequent_exercise,coffee,tea,no_beverage,sleep_time
0,22,2,0,2.00,1,0,False,True,False,False,False,False,True,False,True,False,False,True,False,6.76
1,22,3,0,0.50,0,0,True,False,False,False,False,True,False,True,False,False,True,False,False,8.00
2,23,3,0,0.50,0,0,False,True,False,False,False,True,False,True,False,False,False,True,False,8.00
3,23,2,0,1.50,0,0,True,False,False,True,False,False,False,False,True,False,True,False,False,6.50
4,22,3,0,6.67,1,1,False,True,False,True,False,False,False,False,True,False,True,True,False,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,22,3,0,0.50,0,0,False,True,False,True,False,False,False,False,True,False,False,True,False,6.00
42,21,2,0,6.67,0,0,False,True,False,True,False,False,False,False,True,False,False,True,False,7.00
43,24,3,0,6.67,1,0,False,True,False,True,False,False,False,False,True,False,False,False,True,4.00
44,25,3,1,6.67,1,1,False,False,True,True,False,False,False,False,False,True,True,True,False,8.00


## Save the data as csv file

In [19]:
data.to_csv('data/clean_data.csv',index = False)
print("Data cleaning successful")

Data cleaning successful
