In [3]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Imports loaded")

Imports loaded


In [4]:
from load_data import (
    load_features_train, load_features_test,
    load_labels_train, load_labels_test,
    load_feature_names, load_activity_labels
)

# Load the four splits; the calls run left to right, train before test.
X_train, y_train = load_features_train(), load_labels_train()
X_test, y_test = load_features_test(), load_labels_test()

# Both feature frames receive the same list of column names.
names = load_feature_names()
for feature_frame in (X_train, X_test):
    feature_frame.columns = names

# Each label frame gets its single column renamed to 'activity_id' and a
# second column holding the value that activity_map associates with each id.
activity_map = load_activity_labels()
for label_frame in (y_train, y_test):
    label_frame.columns = ['activity_id']
for label_frame in (y_train, y_test):
    label_frame['activity_name'] = label_frame['activity_id'].map(activity_map)

print("Successful data load")


Loaded 6 activity labels
Successful data load


In [6]:
banner = "=" * 25
print(banner)
print("DATASET SHAPES")
print(banner)

# Report the (rows, columns) shape of every split, in load order.
split_shapes = {
    "X_train": X_train.shape,
    "y_train": y_train.shape,
    "X_test": X_test.shape,
    "y_test": y_test.shape,
}
for split_name, split_shape in split_shapes.items():
    print(f"{split_name}: {split_shape}")

# Totals: rows across both feature frames, columns of the training features.
total_samples = split_shapes["X_train"][0] + split_shapes["X_test"][0]
total_features = split_shapes["X_train"][1]
print(f"\nTotal samples: {total_samples}")
print(f"Total features: {total_features}")

DATASET SHAPES
X_train: (7352, 561)
y_train: (7352, 2)
X_test: (2947, 561)
y_test: (2947, 2)

Total samples: 10299
Total features: 561


In [7]:
separator = "=" * 25
print("\n" + separator)
print("MISSING VALUES")
print(separator)

# Count the null cells of each frame: isnull() flags them, the first sum()
# totals them per column and the second sum() adds the per-column totals.
frames_to_check = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
for frame_name, frame in frames_to_check.items():
    missing_cells = frame.isnull().sum().sum()
    print(f"{frame_name} missing: {missing_cells}")


MISSING VALUES
X_train missing: 0
y_train missing: 0
X_test missing: 0
y_test missing: 0


In [8]:
section_rule = "=" * 25
print("\n" + section_rule)
print("CLASS DISTRIBUTION")
print(section_rule)

# Print, for each split, how many rows carry each activity name, ordered by
# the activity name rather than by frequency.
for heading, label_frame in (("\nTrain set:", y_train), ("\nTest set:", y_test)):
    per_class = label_frame['activity_name'].value_counts()
    print(heading)
    print(per_class.sort_index())


CLASS DISTRIBUTION

Train set:
activity_name
LAYING                1407
SITTING               1286
STANDING              1374
WALKING               1226
WALKING_DOWNSTAIRS     986
WALKING_UPSTAIRS      1073
Name: count, dtype: int64

Test set:
activity_name
LAYING                537
SITTING               491
STANDING              532
WALKING               496
WALKING_DOWNSTAIRS    420
WALKING_UPSTAIRS      471
Name: count, dtype: int64


In [12]:
# ==================================
# DISTRIBUTION ANALYSIS
# ==================================
#
# Compares how the activity classes are distributed in the train and test
# label frames: per-class counts, percentage shares and the train-minus-test
# gap, followed by a short printed verdict on class balance (train only) and
# on train/test consistency.

print("\n" + "=" * 60)
print("ANÁLISE QUANTITATIVA: PROPORÇÕES E DIFERENÇAS")
print("=" * 60)

# 1. Counts and proportions (%)
train_counts = y_train['activity_name'].value_counts().sort_index()
test_counts = y_test['activity_name'].value_counts().sort_index()

# Align both series on the union of the class names, counting a class that is
# absent from one split as 0. Without this, such a class produces NaN rows in
# the table, and idxmax()/max() (which skip NaN by default) would silently
# ignore what is in fact the largest discrepancy.
all_classes = train_counts.index.union(test_counts.index)
train_counts = train_counts.reindex(all_classes, fill_value=0)
test_counts = test_counts.reindex(all_classes, fill_value=0)

train_total = len(y_train)
test_total = len(y_test)

# Exact shares feed the arithmetic below; the 2-decimal copies are what gets
# displayed (and keep their original variable names).
train_share = (train_counts / train_total) * 100
test_share = (test_counts / test_total) * 100
train_proportions = train_share.round(2)
test_proportions = test_share.round(2)

# 2. Differences, taken on the exact shares and rounded only once, so the
#    rounding error of the two displayed percentages is not compounded.
differences = (train_share - test_share).round(2)

# 3. Comparative table
comparison_df = pd.DataFrame({
    'Train Count': train_counts,
    'Train %': train_proportions,

    'Test Count': test_counts,
    'Test %': test_proportions,

    'Diff (Train - Test)': differences,
    'Abs Diff': differences.abs()
})

print(comparison_df)

# 4. Specific analyses
print("\n" + "=" * 60)
print("INSIGHTS AUTOMÁTICOS")
print("=" * 60)

# Class with the largest train/test gap (first one wins on ties).
max_diff_class = comparison_df['Abs Diff'].idxmax()
max_diff_value = comparison_df.loc[max_diff_class, 'Diff (Train - Test)']

print("\nMaior discrepância:")
print(f"   Classe: {max_diff_class}")
print(f"   Train: {comparison_df.loc[max_diff_class, 'Train %']:.2f}%")
print(f"   Test: {comparison_df.loc[max_diff_class, 'Test %']:.2f}%")
print(f"   Diferença: {max_diff_value:+.2f}%")

# Largest/smallest class ratio (balance), computed from the raw counts rather
# than from the rounded percentages. A class with no training rows at all
# makes the ratio infinite, which falls into the "severe" branch below.
max_class = train_counts.idxmax()
min_class = train_counts.idxmin()
smallest_count = train_counts.min()
if smallest_count > 0:
    balance_ratio = train_counts.max() / smallest_count
else:
    balance_ratio = float('inf')

print("\nBalanceamento (Train):")
print(f"   Classe majoritária: {max_class} ({train_proportions.max():.2f}%)")
print(f"   Classe minoritária: {min_class} ({train_proportions.min():.2f}%)")
print(f"   Razão maior/menor: {balance_ratio:.2f}×")

# Verdict thresholds: below 2x acceptable, below 5x moderate, otherwise severe.
if balance_ratio < 2:
    print("   ✓ Balanceamento aceitável (<2×)")
elif balance_ratio < 5:
    print("   !!  Desbalanceamento moderado (2-5×)")
else:
    print("   xXx Desbalanceamento severo (>5×)")

# Train/test consistency, judged by the largest absolute gap in percentage
# points: below 2 consistent, below 5 worth monitoring, otherwise investigate.
max_abs_diff = comparison_df['Abs Diff'].max()
print("\nConsistência Train/Test:")
print(f"   Maior diferença absoluta: {max_abs_diff:.2f}%")

if max_abs_diff < 2:
    print("   ✓ Distribuições muito consistentes")
elif max_abs_diff < 5:
    print("   !!  Pequenas diferenças, monitorar")
else:
    print("   xXx Diferenças significativas, investigar")


ANÁLISE QUANTITATIVA: PROPORÇÕES E DIFERENÇAS
                    Train Count  Train %  Test Count  Test %  \
activity_name                                                  
LAYING                     1407    19.14         537   18.22   
SITTING                    1286    17.49         491   16.66   
STANDING                   1374    18.69         532   18.05   
WALKING                    1226    16.68         496   16.83   
WALKING_DOWNSTAIRS          986    13.41         420   14.25   
WALKING_UPSTAIRS           1073    14.59         471   15.98   

                    Diff (Train - Test)  Abs Diff  
activity_name                                      
LAYING                             0.92      0.92  
SITTING                            0.83      0.83  
STANDING                           0.64      0.64  
WALKING                           -0.15      0.15  
WALKING_DOWNSTAIRS                -0.84      0.84  
WALKING_UPSTAIRS                  -1.39      1.39  

INSIGHTS AUTOMÁTICOS

M