In [None]:
import os
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn
import kaggle
import kagglehub
from kagglehub import KaggleDatasetAdapter

In [None]:
# Download the dataset from Kaggle.
# kagglehub downloads into a local cache directory and returns that path.
dataset_path = kagglehub.dataset_download("pratyushpuri/wearable-health-devices-performance-analysis")

print(f"Dataset downloaded to: {dataset_path}")

# List files in the downloaded dataset (`os` comes from the imports cell).
dataset_files = os.listdir(dataset_path)
print(f"Files in dataset: {dataset_files}")

# Pick the CSV to load. os.listdir() returns entries in arbitrary order, so the
# candidates are sorted to make "the first CSV" the same file on every run.
# Adjust `main_file` if the dataset ships more than one CSV.
csv_files = sorted(f for f in dataset_files if f.endswith('.csv'))
if not csv_files:
    # Fail fast: merely printing here would leave `df` undefined (or stale from
    # a previous run) and push the failure into a later, unrelated cell.
    raise FileNotFoundError(f"No CSV files found. Available files: {dataset_files}")

main_file = csv_files[0]
df = pd.read_csv(os.path.join(dataset_path, main_file))
print(f"Loaded file: {main_file}")

# Emit the label and the frame separately so the column header row is not
# pushed out of alignment by sharing a line with the label.
print("First 5 records:")
display(df.head())

In [None]:
# Comprehensive Exploratory Data Analysis (EDA)
# Step 0: fetch the dataset and load it into `df` for the sections below.

# Download the dataset from Kaggle; the call returns the local directory path.
dataset_path = kagglehub.dataset_download("pratyushpuri/wearable-health-devices-performance-analysis")
print(f"Dataset downloaded to: {dataset_path}")

# List the files that came with the dataset
dataset_files = os.listdir(dataset_path)
print(f"Files in dataset: {dataset_files}")

# Choose the CSV file. os.listdir() order is arbitrary, so sort the candidates
# to load the same file on every run.
csv_files = sorted(f for f in dataset_files if f.endswith('.csv'))
if not csv_files:
    # Everything after this point reads `df`; stop here with a clear error
    # instead of failing later with a NameError or analysing a stale frame.
    raise FileNotFoundError(f"No CSV files found. Available files: {dataset_files}")

main_file = csv_files[0]
df = pd.read_csv(os.path.join(dataset_path, main_file))
print(f"\nLoaded file: {main_file}")
print(f"Dataset shape: {df.shape}")

# Basic Dataset Information: size, dtypes, and a peek at both ends of the frame
print("\n" + "=" * 50, "BASIC DATASET INFORMATION", "=" * 50, sep="\n")

# Shape, reported both as a tuple and as separate row/column counts
print(f"\nDataset Shape: {df.shape}")
print(f"Number of rows: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")

print("\nColumn Names and Data Types:")
print(df.dtypes)

# Head and tail share the same "label, then rich display" pattern
for heading, preview in (
    ("\nFirst 5 rows:", df.head()),
    ("\nLast 5 rows:", df.tail()),
):
    print(heading)
    display(preview)

# df.info() writes its report to stdout itself, so it is not wrapped in print()
print("\nDataset Info:")
df.info()

# Missing Values Analysis: per-column null counts and their share of all rows
print("\n" + "=" * 50, "MISSING VALUES ANALYSIS", "=" * 50, sep="\n")

missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# One row per column, columns with the most missing values first
missing_df = (
    pd.DataFrame({
        'Column': df.columns,
        'Missing Count': missing_values,
        'Missing Percentage': missing_percentage,
    })
    .sort_values('Missing Count', ascending=False)
)

# Only the columns that actually have gaps are shown
print("Missing values summary:")
display(missing_df[missing_df['Missing Count'].gt(0)])

if missing_df['Missing Count'].sum() == 0:
    print("✅ No missing values found in the dataset!")

# Statistical Summary: describe() for the default columns and for object columns
print("\n" + "=" * 50, "STATISTICAL SUMMARY", "=" * 50, sep="\n")

print("\nDescriptive statistics for numerical columns:")
display(df.describe())

print("\nDescriptive statistics for categorical columns:")
# Object-dtype columns are treated as the categorical ones
categorical_cols = df.select_dtypes(include=['object']).columns
if categorical_cols.empty:
    print("No categorical columns found.")
else:
    display(df[categorical_cols].describe())

# Data Distribution Analysis: split the columns by dtype and set the plot theme
print("\n" + "=" * 50, "DATA DISTRIBUTION ANALYSIS", "=" * 50, sep="\n")

# Column names as plain lists: numeric dtypes vs. object dtype
numerical_cols = df.select_dtypes(include=[np.number]).columns.to_list()
categorical_cols = df.select_dtypes(include=['object']).columns.to_list()

print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# Plot styling used by the visualizations that follow
plt.style.use('default')
sns.set_palette("husl")

# 1 & 2. Histograms and box plots of the numerical variables.
# Both figures share the same grid layout, so the layout lives in one helper
# and only the per-axes drawing call differs.
def _plot_numeric_grid(data, columns, draw, title_template, max_cols=3, rotate_x=None):
    """Draw one subplot per column on a grid at most `max_cols` wide.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    columns : list
        Column names to plot, one subplot each. Nothing is drawn when empty.
    draw : callable
        Called as ``draw(data, column, ax)``; renders one column onto ``ax``.
    title_template : str
        ``str.format`` template with a ``{col}`` placeholder for subplot titles.
    max_cols : int, optional
        Maximum number of subplots per row.
    rotate_x : float or None, optional
        Rotation in degrees for the x tick labels; ``None`` leaves them as is.
    """
    if not columns:
        return

    n_cols = min(max_cols, len(columns))
    n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes subplots() return a 2-D array of Axes for every grid
    # shape, so no special-casing of single-row / single-cell grids is needed.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        draw(data, col, ax)
        ax.set_title(title_template.format(col=col))
        if rotate_x is not None:
            ax.tick_params(axis='x', rotation=rotate_x)

    # Hide the grid cells left over in the last row
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


# 1. Distribution of numerical variables (histogram with KDE overlay)
_plot_numeric_grid(
    df,
    numerical_cols,
    draw=lambda data, col, ax: sns.histplot(data=data, x=col, kde=True, ax=ax),
    title_template='Distribution of {col}',
    rotate_x=45,
)

# 2. Box plots for numerical variables
_plot_numeric_grid(
    df,
    numerical_cols,
    draw=lambda data, col, ax: sns.boxplot(data=data, y=col, ax=ax),
    title_template='Box Plot of {col}',
)

# 3. Categorical variables analysis: value counts for every categorical column,
# plus a count plot when the number of categories is small enough to read.
MAX_PLOT_CATEGORIES = 20  # columns with more distinct values are not plotted

for col in categorical_cols:
    print(f"\nValue counts for {col}:")
    value_counts = df[col].value_counts()
    print(value_counts)

    if len(value_counts) > MAX_PLOT_CATEGORIES:
        print(f"Too many categories ({len(value_counts)}) to plot effectively.")
        continue

    # The figure is created only once we know it will be drawn. Creating it
    # before the check left an empty figure open (never shown or closed) for
    # every high-cardinality column.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

# 4. Correlation Analysis: heatmap of pairwise correlations between numerical
# columns, followed by a list of the strongly correlated pairs.
print("\n" + "=" * 50, "CORRELATION ANALYSIS", "=" * 50, sep="\n")

if len(numerical_cols) > 1:
    correlation_matrix = df[numerical_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
    )
    plt.title('Correlation Matrix of Numerical Variables')
    plt.tight_layout()
    plt.show()

    # Walk the upper triangle (each unordered pair once, diagonal excluded)
    # and keep the pairs whose absolute correlation exceeds 0.7.
    corr_labels = correlation_matrix.columns
    high_corr_pairs = [
        (corr_labels[r], corr_labels[c], correlation_matrix.iloc[r, c])
        for r in range(len(corr_labels))
        for c in range(r + 1, len(corr_labels))
        if abs(correlation_matrix.iloc[r, c]) > 0.7
    ]

    if not high_corr_pairs:
        print("\nNo highly correlated pairs found (|correlation| > 0.7)")
    else:
        print("\nHighly correlated pairs (|correlation| > 0.7):")
        for first, second, strength in high_corr_pairs:
            print(f"{first} - {second}: {strength:.3f}")

# 5. Outlier Detection: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# for every numerical column and tabulate the counts.
print("\n" + "=" * 50, "OUTLIER DETECTION", "=" * 50, sep="\n")

if numerical_cols:
    outlier_summary = []

    for col in numerical_cols:
        series = df[col]
        Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

        # Strict comparisons on both sides: values equal to a bound are not
        # outliers, and NaN compares False on both sides so it is never counted.
        is_outlier = series.lt(lower_bound) | series.gt(upper_bound)
        outlier_count = int(is_outlier.sum())

        outlier_summary.append({
            'Column': col,
            'Outlier Count': outlier_count,
            'Outlier Percentage': (outlier_count / len(df)) * 100,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound,
        })

    outlier_df = pd.DataFrame(outlier_summary)
    print("Outlier summary:")
    display(outlier_df)

# 6. Data Quality Assessment: duplicate rows, constant columns, and cardinality
print("\n" + "=" * 50, "DATA QUALITY ASSESSMENT", "=" * 50, sep="\n")

# Fully duplicated rows, as a count and as a share of all rows
duplicate_count = df.duplicated().sum()
duplicate_pct = duplicate_count / len(df) * 100
print(f"\nDuplicate rows: {duplicate_count} ({duplicate_pct:.2f}%)")

# Distinct (non-null) values per column, in original column order
nunique_by_column = df.nunique()

# Columns with at most one distinct value carry no information
constant_cols = nunique_by_column[nunique_by_column <= 1].index.tolist()
if not constant_cols:
    print("\n✅ No constant columns found.")
else:
    print(f"\nConstant columns (single unique value): {constant_cols}")

# Cardinality table, highest first
print("\nUnique values per column:")
unique_counts = nunique_by_column.sort_values(ascending=False)
display(pd.DataFrame({
    'Column': unique_counts.index,
    'Unique Values': unique_counts.values,
}))

print("\n" + "=" * 50, "EDA COMPLETE!", "=" * 50, sep="\n")