# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Introduction</p></div>

This dataset provides a comprehensive overview of behavioral classes among Brazilian adolescents. It encompasses crucial information across various domains, including physical activity levels, sedentary behavior, common mental disorders, and lifestyle habits such as sleep duration and alcohol consumption. The dataset serves as a valuable resource for understanding and analyzing the intricate interplay between these factors, offering insights into the behavioral patterns and health outcomes of this demographic group.


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Data Overview</p></div>


<html>
<head>
<style>
  table {
    border-collapse: collapse;
    width: 100%;
  }

  th, td {
    border: 1px solid #dddddd;
    text-align: left;
    padding: 8px;
  }

  th {
    background-color: #f2f2f2;
  }
</style>
</head>
<body>

<table>
  <tr>
    <th>Column Name</th>
    <th>Definition</th>
  </tr>
  <tr>
    <td>IC</td>
    <td>Identification code</td>
  </tr>
  <tr>
    <td>Sex</td>
    <td>Sex (1 - boys; 2 - girls)</td>
  </tr>
  <tr>
    <td>Age</td>
    <td>Age in years</td>
  </tr>
  <tr>
    <td>Grade</td>
    <td>High school grade (1 - 10th; 2 - 11th; 3 - 12th)</td>
  </tr>
  <tr>
    <td>Class_SES</td>
    <td>Socioeconomic status classification (1 - wealthy; 2 - middle; 3 - lower)</td>
  </tr>
  <tr>
    <td>LPA/day(min)</td>
    <td>Light physical activity per day (min)</td>
  </tr>
  <tr>
    <td>MVPA/day(min)</td>
    <td>Moderate-to-vigorous physical activity per day (min)</td>
  </tr>
  <tr>
    <td>Steps/day</td>
    <td>Number of steps per day</td>
  </tr>
  <tr>
    <td>SB/day(min)</td>
    <td>Sedentary behavior per day (min)</td>
  </tr>
  <tr>
    <td>ST</td>
    <td>Screen time per day (hours)</td>
  </tr>
  <tr>
    <td>SD</td>
    <td>Sleep duration (h:min)</td>
  </tr>
  <tr>
    <td>Weight (kg)</td>
    <td>Body weight (kg)</td>
  </tr>
  <tr>
    <td>Height (m)</td>
    <td>Height (m)</td>
  </tr>
  <tr>
    <td>BMI</td>
    <td>Body mass index (kg·m<sup>-2</sup>)</td>
  </tr>
  <tr>
    <td>SBP</td>
    <td>Systolic blood pressure (mmHg)</td>
  </tr>
  <tr>
    <td>DBP</td>
    <td>Diastolic blood pressure (mmHg)</td>
  </tr>
  <tr>
    <td>WC</td>
    <td>Waist circumference (cm)</td>
  </tr>
  <tr>
    <td>HW</td>
    <td>Hip circumference (cm)</td>
  </tr>
  <tr>
    <td>WHtR</td>
    <td>Waist-to-height ratio</td>
  </tr>
  <tr>
    <td>Alcohol</td>
    <td>Alcohol use (1 - no; 2 - yes)</td>
  </tr>
  <tr>
    <td>Tobacco</td>
    <td>Tobacco use (1 - no; 2 - yes)</td>
  </tr>
  <tr>
    <td>CMD</td>
    <td>Common mental disorder score</td>
  </tr>
  <tr>
    <td>LCA</td>
    <td>Latent classes</td>
  </tr>
</table>

</body>
</html>


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b> Import Libraries</p></div>


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
import gc
import time

import warnings
warnings.filterwarnings("ignore")

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Load the Data</p></div>


In [None]:
# Location of the Excel workbook inside the Kaggle input directory
dataset_path = '/kaggle/input/behavioral-classes-brazilian-adolescents/Behavioral classes dataset of Brazilian adolescents/Dataset.xlsx'
df = pd.read_excel(dataset_path)

# Styled preview of the first five rows
preview_style = {
    'background-color': 'lightgreen',
    'color': 'royalblue',
    'border-color': '#8b8c8c',
}
df.head().style.set_properties(**preview_style)

In [None]:
# Translate the integer codes of the categorical columns into readable labels.
# Alcohol and Tobacco share the same 1 = No / 2 = Yes coding.
yes_no_labels = {1: 'No', 2: 'Yes'}
value_labels = {
    'Sex': {1: 'Boys', 2: 'Girls'},
    'Grade': {1: '10th', 2: '11th', 3: '12th'},
    'Class_SES': {1: 'Wealthy', 2: 'Middle', 3: 'Lower'},
    'Alcohol': yes_no_labels,
    'Tobacco': yes_no_labels,
    'LCA': {code: f'Class {code}' for code in range(1, 6)},
}

# `replace` returns a new frame, so the raw coded `df` stays untouched
df2 = df.replace(value_labels)

df2.head()

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Data Cleaning and Preprocessing</p></div>


In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple
df.shape

In [None]:
# Column labels of the raw dataset, i.e. the available features
df.columns

In [None]:
# Descriptive statistics of the raw frame from describe(),
# shaded with the 'summer' colour map
df.describe().style.background_gradient(cmap='summer')

### <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Summary Table</p></div>

* The summary table reports, for each variable, the data type, the number and percentage of missing values, the number of unique values, and the minimum and maximum values. It also lists the first three values of each column, which give an initial feel for the dataset as a whole.

In [None]:
# summary table function
def summary(df):
    """Build a one-row-per-column overview of a DataFrame.

    Prints the shape of ``df`` and returns a table indexed by column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        - ``data type``: dtype of the column
        - ``#missing``: number of missing entries
        - ``%missing``: share of missing entries, in percent (0-100)
        - ``#unique``: number of distinct non-missing values
        - ``min`` / ``max``: taken from ``df.describe(include='all')``;
          NaN when ``describe`` reports no such statistic
        - ``first value`` / ``second value`` / ``third value``: the first
          three rows by position; NaN when the frame has fewer rows
    """
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])

    # Count first, then derive the percentage from it.
    missing_counts = df.isnull().sum().values
    summ['#missing'] = missing_counts
    # max(..., 1) avoids a division by zero on a frame without rows
    summ['%missing'] = missing_counts / max(len(df), 1) * 100
    summ['#unique'] = df.nunique().values

    desc = pd.DataFrame(df.describe(include='all').transpose())
    # describe() only emits min/max when at least one column supports them
    for stat in ('min', 'max'):
        summ[stat] = desc[stat].values if stat in desc.columns else float('nan')

    # Positional access: works for any index and for frames shorter than three rows
    for position, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[position].values if position < len(df) else float('nan')

    return summ

In [None]:
# Per-column overview of the raw frame, shaded with the 'summer' colour map
summary(df).style.background_gradient(cmap='summer')

In [None]:
# Substitute 0 for every missing entry in the labelled frame
df2 = df2.fillna(0)

In [None]:
# Keep only the first occurrence of each fully identical row
df2 = df2.loc[~df2.duplicated()]

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#007FFF;font-family:Nexa;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;font-size:90%;letter-spacing:0.5px;margin:0"><b></b>Exploratory Data Analysis (EDA)📊</p></div>


In [None]:
# Number of adolescents in each latent class
LCA_type_counts = df2['LCA'].value_counts()

# One figure, two panels: absolute counts (left) and shares (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart
sns.barplot(x=LCA_type_counts.index, y=LCA_type_counts.values, ax=ax1, palette=['#3b5998','#8b9dc3','#dfe3ee'])
ax1.set_xlabel('Distribution of Latent Classes', fontsize=12,fontweight = 'bold', color = 'darkgreen')
ax1.set_ylabel('Count', fontsize=12,fontweight = 'bold', color = 'darkblue')
ax1.set_xticklabels(LCA_type_counts.index,  fontsize=11,fontweight = 'bold', color = 'darkblue')
ax1.yaxis.grid(True, linestyle='--', alpha=0.7)
sns.despine(ax=ax1)

# Pie chart; the white wedge borders are configured once through wedgeprops
colors = ['#ee4035','#f37736','#fdf498']
wedges, texts, autotexts = ax2.pie(LCA_type_counts, labels=LCA_type_counts.index, autopct='%1.1f%%', colors=colors, startangle=0, wedgeprops={'edgecolor': 'white', 'linewidth': 1.5})
ax2.axis('equal')

# Bold class labels and percentage annotations
for label in list(texts) + list(autotexts):
    label.set_fontsize(11)
    label.set_fontweight('bold')

# Slightly translucent wedges (this is not a shadow effect)
for wedge in wedges:
    wedge.set_alpha(0.9)

# Add the figure-level title BEFORE tight_layout so the layout pass can
# account for it instead of the title being placed over the panels afterwards
fig.suptitle('Distribution of Latent Classes', fontsize=16, fontweight='bold', color = 'darkgreen')

# Adjust layout spacing between plots
plt.tight_layout()

plt.savefig('Distribution of Latent Classes.png')

# Show the enhanced visualization
plt.show()

In [None]:
# Columns drawn as count plots, paired by position with the wording used in each title
categorical_variables = ['Sex', 'Grade', 'Class_SES', 'Alcohol', 'Tobacco', 'CMD', 'LCA']
category_names = [
    'Sex (1 - boys; 2 - girls)',
    'High School Grade',
    'Socioeconomic Status',
    'Alcohol Use (1 - no; 2 - yes)',
    'Tobacco Use (1 - no; 2 - yes)',
    'Common Mental Disorder Score',
    'Latent Classes',
]

# One stacked panel per categorical variable
fig, axes = plt.subplots(nrows=len(categorical_variables), ncols=1, figsize=(10, 15))

# Colours handed to every count plot
palette = ['#a8e6cf','#ff8b94','#4d648d']

label_style = dict(fontsize=12, fontweight='bold', color='darkorange')

for idx, var in enumerate(categorical_variables):
    category_name = category_names[idx]
    panel = axes[idx]
    sns.countplot(x=var, data=df2, palette=palette, ax=panel)
    panel.set_title(f'Distribution of {category_name}', fontsize=14, fontweight='bold', color='darkgreen')
    panel.set_xlabel(var, **label_style)
    panel.set_ylabel('Count', **label_style)

plt.tight_layout()
plt.savefig('Distribution.png')
plt.show()

In [None]:
# Continuous measurements whose distributions are compared across latent classes
numerical_var = ['Age', 'LPA/day(min)', 'MVPA/day(min)', 'Steps/day', 'SB/day(min)', 
                 'ST', 'Weight (kg)','Height (m)', 'BMI', 'SBP', 'DBP', 
                 'WC', 'HW', 'WHtR']


# One stacked histogram per variable, all in a single tall figure
fig, axes = plt.subplots(nrows=len(numerical_var), ncols=1, figsize=(12, 45)) 

# Base colours, sized to the number of latent classes actually present so the
# palette length always matches the hue levels (colours repeat beyond three)
palette = sns.color_palette(['#3d1e6d','#ff5588','#00b159'], n_colors=df2['LCA'].nunique())

for ax, column in zip(axes, numerical_var):
    # Draw on the axes created above rather than re-selecting them via plt.subplot
    sns.histplot(x=column, hue="LCA", data=df2, bins=30, kde=True, multiple="stack", palette=palette, ax=ax)
    ax.set_title(f"{column} Distribution for Latent Classes", fontsize=14, fontweight='bold', color = 'darkred')
    ax.set_xlabel(column, fontsize=12, fontweight='bold', color = 'darkblue')
    ax.set_ylabel("Count", fontsize=12, fontweight='bold', color = 'darkblue')
    # The y-axis is left to autoscale: the frequency of the most common exact
    # value says nothing about the height of a stacked histogram bin, so using
    # it as an upper limit would cut the bars off.
    # Keep the legend seaborn generated for the hue (its entries are tied to
    # the plotted colours) and only reposition it; rebuilding it from a bare
    # list of labels could pair class names with the wrong colours.
    sns.move_legend(ax, 'upper right', title="LCA", title_fontsize=11)

plt.tight_layout()
plt.savefig('Distribution for Latent Classes.png')
plt.show()

In [None]:
figsize = (6*6, 20)
fig = plt.figure(figsize=figsize)

# Four panels per row, with as many rows as the variable list requires
n_cols = 4
n_rows = -(-len(numerical_var) // n_cols)  # ceiling division

# Explicit class -> colour mapping shared by every panel and by the legend,
# so the legend cannot drift away from what is drawn. The base colours are
# repeated if there are more classes than colours.
lca_levels = sorted(df2['LCA'].dropna().unique(), key=str)
lca_colors = sns.color_palette(['#ff5588', 'darkblue', '#00b159'], n_colors=len(lca_levels))
lca_palette = dict(zip(lca_levels, lca_colors))

for idx, col in enumerate(numerical_var):
    ax = plt.subplot(n_rows, n_cols, idx + 1)
    sns.kdeplot(
        data=df2, hue='LCA', hue_order=lca_levels, fill=True,
        x=col, palette=lca_palette, legend=False
    )

    # Minimal panels: no axis labels, no top/right frame lines
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_title(f'{col}', loc='right',
                 weight='bold', fontsize=20)

# The trailing newlines leave vertical room for the shared legend below the title
fig.suptitle('Features vs Target\n\n\n', ha='center',  fontweight='bold', fontsize=25)

# Shared legend built from proxy patches, one per latent class, instead of
# hard-coded labels matched to whatever artists happen to be created first
legend_handles = [plt.Rectangle((0, 0), 1, 1, color=lca_palette[level]) for level in lca_levels]
fig.legend(legend_handles, lca_levels, loc='upper center', bbox_to_anchor=(0.5, 0.96),
           fontsize=25, ncol=max(1, len(lca_levels)))
plt.tight_layout()
plt.savefig('Features vs Target.png')
plt.show()

In [None]:
# Labels of every numeric column in the raw (integer-coded) frame
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Pairwise correlations restricted to those columns
corr = df.loc[:, numerical_columns].corr()

# Annotated heatmap; seaborn returns the axes it drew on, which is then titled
plt.figure(figsize=(15, 8))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='summer', linewidths=0.5)
heatmap_ax.set_title('Correlation Heatmap', fontsize=15)
plt.savefig('Correlation Heatmap.png')
plt.show()


### To be continued. . . 