In [9]:
import os
import pandas as pd

log_dir = '../data/raw'  # Adjust path if needed
log_files = [f for f in os.listdir(log_dir) if f.endswith('.csv')]
print(f'Found {len(log_files)} log files.')

all_dfs = []
for f in log_files:
    file_path = os.path.join(log_dir, f)
    try:
        df = pd.read_csv(file_path)
        all_dfs.append(df)
    except Exception as e:
        print(f'Error loading {f}: {e}')

if all_dfs:
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f'Combined DataFrame shape: {combined_df.shape}')
    display(combined_df.head())
else:
    print('No valid log files found.')

Found 368 log files.
Combined DataFrame shape: (8492, 10)


Unnamed: 0,timestamp,label_1,count_1,avg_conf_1,label_2,count_2,avg_conf_2,label_3,count_3,avg_conf_3
0,2025-04-19 23:58:00,racoon,3,0.74,,,,,,
1,2025-04-19 01:45:00,racoon,3,0.94,,,,,,
2,2025-04-19 08:09:00,dog,5,0.88,stroller,2.0,0.82,,,
3,2025-04-19 17:44:00,ups truck,4,0.67,,,,,,
4,2025-04-19 06:16:00,dog,5,0.54,car,1.0,0.97,stroller,4.0,0.53


In [10]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8492 entries, 0 to 8491
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   timestamp   8492 non-null   object 
 1   label_1     8492 non-null   object 
 2   count_1     8492 non-null   int64  
 3   avg_conf_1  8492 non-null   float64
 4   label_2     1751 non-null   object 
 5   count_2     1751 non-null   float64
 6   avg_conf_2  1751 non-null   float64
 7   label_3     672 non-null    object 
 8   count_3     672 non-null    float64
 9   avg_conf_3  672 non-null    float64
dtypes: float64(5), int64(1), object(4)
memory usage: 663.6+ KB


In [11]:
print(combined_df.label_1.unique())
print(combined_df.label_2.unique())
print(combined_df.label_3.unique())

['racoon' 'dog' 'ups truck' 'newspaper' 'bicycle' 'school bus' 'mail truck' 'chipmunk' 'garbage truck' 'skunk' 'fedex truck' 'stroller' 'person' 'bird' 'prime van' 'cat' 'car' 'recycling truck' 'oil truck' 'truck' 'streetcleaner' 'police car' 'elephant' 'bottle' 'skateboard' 'suitcase' 'bed' 'vase' 'bench' 'bus'
 'knife' 'fire truck' 'ice cream truck' 'snow plow' 'ambulance']
[nan 'stroller' 'car' 'chipmunk' 'person' 'dog' 'bicycle' 'ups truck' 'bird' 'cat' 'fedex truck' 'truck' 'bottle' 'elephant' 'vase' 'skateboard' 'suitcase' 'bear' 'bed' 'remote' 'bus' 'book' 'surfboard' 'bench' 'prime van']
[nan 'stroller' 'ups truck' 'cat' 'car' 'fedex truck' 'truck' 'chipmunk' 'dog' 'bicycle' 'person' 'bird' 'prime van' 'tv' 'vase' 'skateboard' 'suitcase' 'bear' 'horse' 'bed' 'bus' 'mouse' 'laptop' 'book' 'surfboard' 'bench']


In [12]:
# Show all rows with 'school bus' in any label column
mask_school_bus = (
      combined_df['label_1'].str.lower().str.strip() == 'school bus'
  )
print(combined_df[mask_school_bus][['timestamp', 'label_1']])

                timestamp     label_1
9     2025-04-21 07:13:00  school bus
10    2025-04-21 15:02:00  school bus
15    2025-04-22 07:19:00  school bus
16    2025-04-22 15:25:00  school bus
23    2025-04-23 07:11:00  school bus
...                   ...         ...
8469  2025-04-15 15:30:00  school bus
8478  2025-04-16 07:01:00  school bus
8479  2025-04-16 15:26:00  school bus
8484  2025-04-17 07:11:00  school bus
8485  2025-04-17 15:21:00  school bus

[434 rows x 2 columns]
