In [1]:
import pandas as pd

# Load the expected feature table and the actual one.
expected_raw = pd.read_parquet("../data/features_expected.parquet")
actual_raw = pd.read_parquet("../data/features.parquet")

# Rows are matched on 'session_group', so only keys present in both files can
# be compared value by value.
expected_session_groups = set(expected_raw["session_group"])
actual_session_groups = set(actual_raw["session_group"])
common_session_groups = expected_session_groups & actual_session_groups

# Likewise, only columns present in both files can be compared.
common_columns = expected_raw.columns.intersection(actual_raw.columns).tolist()

# Say what the comparison leaves out, so a file that is missing rows or columns
# cannot pass as "No differences found." without any hint. Nothing is printed
# when both files cover the same session groups and columns.
only_in_expected = expected_session_groups - actual_session_groups
only_in_actual = actual_session_groups - expected_session_groups
if only_in_expected or only_in_actual:
    print(
        "Session groups excluded from the comparison: "
        f"{len(only_in_expected)} only in expected, "
        f"{len(only_in_actual)} only in actual"
    )
unshared_columns = expected_raw.columns.symmetric_difference(actual_raw.columns).tolist()
if unshared_columns:
    print("Columns excluded from the comparison:", unshared_columns)

# Keep the shared session groups, index by 'session_group' and sort, so both
# frames carry the same labels in the same order. Built as new frames rather
# than mutated in place, so the loaded tables stay untouched.
# NOTE(review): Series.compare needs identically labelled inputs; this assumes
# each session_group appears once per file - TODO confirm.
df_expected = (
    expected_raw[expected_raw["session_group"].isin(common_session_groups)]
    .set_index("session_group")
    .sort_index()
)
df_actual = (
    actual_raw[actual_raw["session_group"].isin(common_session_groups)]
    .set_index("session_group")
    .sort_index()
)

# A column differs when compare() returns at least one row. compare() already
# treats a pair of missing values as equal and returns only the differing rows,
# so no extra NaN clean-up is needed. 'session_group' is skipped because it is
# now the index, not a column.
differing_columns = [
    column
    for column in common_columns
    if column != "session_group"
    and not df_expected[column].compare(df_actual[column], keep_equal=False).empty
]

# Print the columns that differ
if differing_columns:
    print("Columns with differences:", differing_columns)
else:
    print("No differences found.")


Columns with differences: ['exercise_with_most_incorrect']


## Checking which columns differ between `features_expected` and `features`

---

## Now checking which `exercise_with_most_incorrect` values differ


In [2]:
import pandas as pd

# Load the expected feature table and the actual one.
df_expected = pd.read_parquet("../data/features_expected.parquet")
df_actual = pd.read_parquet("../data/features.parquet")

# Rows are matched on 'session_group'; only keys present in both files are compared.
common_session_groups = set(df_expected["session_group"]).intersection(set(df_actual["session_group"]))

# The key column plus the single feature column inspected in this cell.
diff_columns = ["session_group", "exercise_with_most_incorrect"]

# Keep the shared session groups and the selected columns, index by
# 'session_group' and sort. Built as new frames instead of using inplace=True
# on a filtered selection.
df_expected = (
    df_expected.loc[df_expected["session_group"].isin(common_session_groups), diff_columns]
    .set_index("session_group")
    .sort_index()
)
df_actual = (
    df_actual.loc[df_actual["session_group"].isin(common_session_groups), diff_columns]
    .set_index("session_group")
    .sort_index()
)

# Give both frames the same row index (outer join) before comparing.
df_expected, df_actual = df_expected.align(df_actual, join="outer", axis=0)

# keep_shape=True keeps every row and blanks equal cells to NaN, producing
# (column, 'self'/'other') columns; rows that are entirely NaN had no difference.
differences_df = df_expected.compare(df_actual, keep_shape=True, keep_equal=False)
differences_df = differences_df.dropna(how="all")

# Drop rows in which every compared cell renders as the string "None"
# (presumably a literal "None" on one side and a missing value on the other).
# This runs BEFORE the emptiness check so that a result consisting only of
# such rows is reported as "No differences found." instead of a
# "Differences found:" header followed by an empty frame.
# 'session_group' is the index here, so every column takes part in the check.
placeholder_rows = differences_df.astype(str).eq("None").all(axis=1)
differences_df = differences_df[~placeholder_rows]

# Display only differing rows
if not differences_df.empty:
    print("Differences found:")
    print(differences_df)
else:
    print("No differences found.")


Differences found:
                                         exercise_with_most_incorrect  \
                                                                 self   
session_group                                                           
+/9DagqFxJB3+7WXtioOCZHmVUY=                            hip_abduction   
+/oBodC3e9COxDB/LWXxNCEx2/I=                            forward_lunge   
+091K2IyXQRGBMBQHElnRGEkyGM=    side_lying_shoulder_external_rotation   
+1g7QpKh/q6lfmLwWgWZiddWxR8=  shoulder_external_rotation_in_abduction   
+28CdQftxgOhTmkVxc1QUrxHcA0=                       hip_hyperextension   
...                                                               ...   
zwiQ3nDFF8z2S0q3x5ExMrKD/as=                            hip_abduction   
zwrgqLwVifFhixG6KAuvzdrSpi8=  shoulder_external_rotation_in_abduction   
zxOO41nbo6BT/dyWbF3hB9x73sQ=  shoulder_external_rotation_in_abduction   
zyhke8c2FKk621AsbLPkAQDbnIs=            hip_flexion_with_knee_bending   
zzwyhOME0/jCt/TlocGDlnM7Nx4=    