# Link to the dataset 
https://www.kaggle.com/datasets/siamaktahmasbi/insights-into-sleep-patterns-and-daily-habits

# The imports 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the dataset 

In [None]:
# Load the dataset CSV (path is relative to the working directory) into `df`,
# the DataFrame every later cell operates on.
df = pd.read_csv(filepath_or_buffer='sleep_health_lifestyle_dataset.csv')

# Inspection and cleaning

In [None]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Preview the first three rows to check column names and value formats.
df.head(n=3)

#### Cleaning

In [None]:
# Label missing sleep-disorder entries explicitly, in place.
# The fill is restricted to the 'Sleep Disorder' column: a blanket
# df.fillna(...) would write this text label into *every* column that has
# gaps, silently turning numeric columns into object dtype.
# NOTE(review): assumes the missing values reported by isnull().sum() above
# belong to 'Sleep Disorder' -- confirm, and handle other columns separately.
df.fillna(value={'Sleep Disorder': 'No sleep disorder'}, inplace=True)

In [None]:
# Break the combined "systolic/diastolic" text column into two integer
# columns, then remove the combined column from the frame.
bp_column = 'Blood Pressure (systolic/diastolic)'
df[['Systolic BP', 'Diastolic BP']] = (
    df[bp_column]
    .str.split('/', expand=True)
    .astype(int)
)
df.drop(columns=bp_column, inplace=True)


#### Datatypes of columns

In [None]:
# Column names and their dtypes, kept as two parallel lists.
columns = list(df.columns)
dtypes = list(df.dtypes)

# One "name - dtype" line per column.
for position, col in enumerate(columns):
    print(f"{col} - {dtypes[position]}")

#### Printing unique values for object type columns

In [None]:
# Summarise every object-dtype column: columns with fewer than 10 distinct
# values get a per-value frequency listing, all others only report how many
# distinct values they hold. The column dtype is printed in both cases.
for col_name, col_series in df.items():
    if not pd.api.types.is_object_dtype(col_series):
        continue

    print('##############################')
    print(f"Column: {col_name}")

    # Compute the distinct values once (order of first appearance).
    unique_values = col_series.unique()

    if len(unique_values) < 10:
        # Count all values in a single pass instead of filtering the whole
        # frame once per value. value_counts() skips NaN, and NaN never
        # compares equal to itself, so missing values are counted via isna().
        counts = col_series.value_counts().to_dict()
        missing_count = int(col_series.isna().sum())
        for value in unique_values:
            count = missing_count if pd.isna(value) else int(counts.get(value, 0))
            print(f"  Value: {value}, Count: {count}")
    else:
        print('  Number of unique values in the column', col_name, 'are', len(unique_values))

    print(f"  Type: {col_series.dtype}")

# Plots 
#### Distribution of 'Age'

In [None]:
# Histogram of the 'Age' column in 10 bins, drawn on the current axes.
ax = plt.gca()
ax.hist(df['Age'], bins=10, edgecolor='black')
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

#### Boxplot of 'Sleep Duration' by 'Gender'


In [None]:
# One box per 'Gender' value showing the spread of sleep duration.
ax = sns.boxplot(data=df, x='Gender', y='Sleep Duration (hours)')
ax.set_title('Sleep Duration by Gender')
plt.show()

#### Correlation matrix for numerical variables

In [None]:
# Pick every numeric column, whatever its width. Listing only
# 'int64'/'float64' would silently drop columns stored as e.g. int32 or
# float32 (the platform-default integer produced by astype(int) is not
# guaranteed to be int64).
numerical_cols = df.select_dtypes(include='number').columns

# Pairwise correlation between the numeric columns, rounded to two decimals
# so the heatmap annotations stay readable.
corr = df[numerical_cols].corr().round(2)
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

#### Bar plot of 'Sleep Disorder' distribution

In [None]:
# Bars are ordered by frequency (value_counts sorts descending by default).
ax = sns.countplot(
    data=df,
    x='Sleep Disorder',
    order=df['Sleep Disorder'].value_counts().index,
)
ax.set_title('Distribution of Sleep Disorders')
ax.set_xlabel('Sleep Disorder')
ax.set_ylabel('Count')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()

#### Compare 'Quality of Sleep' across 'BMI Category'

In [None]:
# One box per 'BMI Category' value showing the spread of sleep-quality scores.
ax = sns.boxplot(
    data=df,
    x='BMI Category',
    y='Quality of Sleep (scale: 1-10)',
)
ax.set_title('Quality of Sleep by BMI Category')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()

#### Average 'Stress Level' by 'Occupation'

In [None]:
# Mean stress level per occupation, highest first, shown as a bar chart.
avg_stress = (
    df.groupby('Occupation')['Stress Level (scale: 1-10)']
    .mean()
    .sort_values(ascending=False)
)
ax = avg_stress.plot.bar()
ax.set_title('Average Stress Level by Occupation')
ax.set_xlabel('Occupation')
ax.set_ylabel('Average Stress Level')
plt.show()