# Data Inspection: Reproducibility Check
 
This notebook checks that the dataset, which has been split by topic into a claim table plus accident, policyholder, vehicle, and driver tables, can be merged back on the topic keys to reproduce the original 'wide' table format.

This is a critical sanity check to confirm that no data was lost or incorrectly transformed during the topic-splitting process.


## Main table

In [1]:
import pandas as pd

# Read the claim table of the topic-split dataset and preview its first rows.
claim_csv_path = "../data/tri_guard_5_py_clean/Claim.csv"
df_claim = pd.read_csv(claim_csv_path)
df_claim.head()

Unnamed: 0,claim_number,subrogation,claim_est_payout,liab_prct,claim_date,claim_day_of_week,channel,zip_code,witness_present_ind,policy_report_filed_ind,in_network_bodyshop,accident_key,policyholder_key,vehicle_key,driver_key
0,6090851,1.0,3218.84,31.0,12/4/2016,Saturday,Broker,80040.0,Y,1.0,no,1.0,1.0,1.0,1.0
1,4653734,0.0,1338.52,34.0,4/25/2015,Wednesday,Phone,80030.0,N,1.0,yes,1.0,2.0,2.0,2.0
2,1014777,0.0,3540.05,39.0,6/22/2015,Thursday,Broker,50012.0,N,1.0,yes,2.0,3.0,3.0,3.0
3,8101873,1.0,1507.94,32.0,3/2/2015,Saturday,Phone,20138.0,N,1.0,yes,2.0,4.0,4.0,4.0
4,5081870,0.0,5080.63,28.0,1/12/2016,Sunday,Online,50033.0,N,0.0,yes,3.0,5.0,5.0,5.0


In [2]:
# Column dtypes and non-null counts of the claim table as loaded from CSV.
df_claim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18001 entries, 0 to 18000
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   claim_number             18001 non-null  int64  
 1   subrogation              17999 non-null  float64
 2   claim_est_payout         18000 non-null  float64
 3   liab_prct                18000 non-null  float64
 4   claim_date               18000 non-null  object 
 5   claim_day_of_week        18000 non-null  object 
 6   channel                  18000 non-null  object 
 7   zip_code                 18000 non-null  float64
 8   witness_present_ind      18000 non-null  object 
 9   policy_report_filed_ind  18000 non-null  float64
 10  in_network_bodyshop      18000 non-null  object 
 11  accident_key             18000 non-null  float64
 12  policyholder_key         18000 non-null  float64
 13  vehicle_key              18000 non-null  float64
 14  driver_key            

In [3]:
# The key columns are parsed as float64 because they contain a missing value.
# Replace the missing key with the placeholder 0 and cast once, explicitly, to
# int64. A plain `astype(int)` maps to a platform-dependent width on some
# NumPy builds, so it is not used as an intermediate step.
key_columns = ["accident_key", "policyholder_key", "vehicle_key", "driver_key"]
for col in key_columns:
    df_claim[col] = df_claim[col].fillna(0).astype("int64")

df_claim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18001 entries, 0 to 18000
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   claim_number             18001 non-null  int64  
 1   subrogation              17999 non-null  float64
 2   claim_est_payout         18000 non-null  float64
 3   liab_prct                18000 non-null  float64
 4   claim_date               18000 non-null  object 
 5   claim_day_of_week        18000 non-null  object 
 6   channel                  18000 non-null  object 
 7   zip_code                 18000 non-null  float64
 8   witness_present_ind      18000 non-null  object 
 9   policy_report_filed_ind  18000 non-null  float64
 10  in_network_bodyshop      18000 non-null  object 
 11  accident_key             18001 non-null  int64  
 12  policyholder_key         18001 non-null  int64  
 13  vehicle_key              18001 non-null  int64  
 14  driver_key            

In [4]:
# Show the claim rows whose accident key equals 0.
accident_key_is_zero = df_claim["accident_key"].eq(0)
df_claim.loc[accident_key_is_zero]

Unnamed: 0,claim_number,subrogation,claim_est_payout,liab_prct,claim_date,claim_day_of_week,channel,zip_code,witness_present_ind,policy_report_filed_ind,in_network_bodyshop,accident_key,policyholder_key,vehicle_key,driver_key
8351,0,,,,,,,,,,,0,0,0,0


In [5]:
# (rows, columns) of the claim table.
df_claim.shape

(18001, 15)

## Topic tables

In [7]:
# Read the accident topic table and preview its first rows.
accident_csv_path = "../data/tri_guard_5_py_clean/Accident.csv"
df_accident = pd.read_csv(accident_csv_path)
df_accident.head()

Unnamed: 0,accident_site,accident_type,accident_key
0,Parking Area,multi_vehicle_clear,1
1,Unknown,multi_vehicle_unclear,2
2,Highway/Intersection,multi_vehicle_clear,3
3,Unknown,single_car,4
4,Local,single_car,5


In [8]:
# Column dtypes and non-null counts of the accident table.
df_accident.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   accident_site  12 non-null     object
 1   accident_type  12 non-null     object
 2   accident_key   12 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 420.0+ bytes


In [9]:
# Read the policyholder topic table and preview its first rows.
policyholder_csv_path = "../data/tri_guard_5_py_clean/Policyholder.csv"
df_policy = pd.read_csv(policyholder_csv_path)
df_policy.head()

Unnamed: 0,annual_income,high_education_ind,email_or_tel_available,address_change_ind,living_status,past_num_of_claims,policyholder_key
0,70966,1,0,1,Rent,7,1
1,79723,1,1,1,Rent,0,2
2,41527,1,1,1,Own,2,3
3,42099,1,1,1,Rent,0,4
4,47206,1,1,1,Own,14,5


In [10]:
# Column dtypes and non-null counts of the policyholder table.
df_policy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13965 entries, 0 to 13964
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   annual_income           13965 non-null  int64 
 1   high_education_ind      13965 non-null  int64 
 2   email_or_tel_available  13965 non-null  int64 
 3   address_change_ind      13965 non-null  int64 
 4   living_status           13965 non-null  object
 5   past_num_of_claims      13965 non-null  int64 
 6   policyholder_key        13965 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 763.8+ KB


In [11]:
# Read the vehicle topic table and preview its first rows.
vehicle_csv_path = "../data/tri_guard_5_py_clean/Vehicle.csv"
df_vehicle = pd.read_csv(vehicle_csv_path)
df_vehicle.head()

Unnamed: 0,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,vehicle_mileage,vehicle_key
0,2021,Large,16272.12725,red,21620.79697,75421,1
1,2025,Medium,34102.78197,silver,10840.5852,31988,2
2,2022,Compact,15000.0,silver,24318.12282,60876,3
3,2025,Medium,16984.45295,white,36958.26656,152772,4
4,2021,Compact,46545.72863,blue,11779.17422,41151,5


In [12]:
# Column dtypes and non-null counts of the vehicle table.
df_vehicle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   vehicle_made_year  18000 non-null  int64  
 1   vehicle_category   18000 non-null  object 
 2   vehicle_price      18000 non-null  float64
 3   vehicle_color      18000 non-null  object 
 4   vehicle_weight     18000 non-null  float64
 5   vehicle_mileage    18000 non-null  int64  
 6   vehicle_key        18000 non-null  int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 984.5+ KB


In [13]:
# Read the driver topic table and preview its first rows.
driver_csv_path = "../data/tri_guard_5_py_clean/Driver.csv"
df_driver = pd.read_csv(driver_csv_path)
df_driver.head()

Unnamed: 0,year_of_born,gender,age_of_DL,safety_rating,driver_key
0,1990,F,25,75,1
1,1972,F,23,94,2
2,2003,F,23,76,3
3,1983,F,23,54,4
4,1985,F,17,54,5


In [14]:
# Column dtypes and non-null counts of the driver table.
df_driver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15190 entries, 0 to 15189
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_of_born   15190 non-null  int64 
 1   gender         15190 non-null  object
 2   age_of_DL      15190 non-null  int64 
 3   safety_rating  15190 non-null  int64 
 4   driver_key     15190 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 593.5+ KB


## Merge

In [15]:
# Each topic table must carry at most one row per key; otherwise joining it
# onto the claim table would duplicate claim rows.
assert df_accident["accident_key"].is_unique
assert df_policy["policyholder_key"].is_unique
assert df_vehicle["vehicle_key"].is_unique
assert df_driver["driver_key"].is_unique

In [16]:
# Re-attach every topic table to the claim table through its key column.
# how="left" keeps every claim row; a claim whose key has no match gets NaN in
# that topic's columns. validate="many_to_one" makes pandas raise MergeError if
# a topic table contains duplicate keys, which would otherwise silently
# multiply claim rows.
merged = (
    df_claim
    .merge(df_accident, on="accident_key", how="left", validate="many_to_one")
    .merge(df_policy, on="policyholder_key", how="left", validate="many_to_one")
    .merge(df_vehicle, on="vehicle_key", how="left", validate="many_to_one")
    .merge(df_driver, on="driver_key", how="left", validate="many_to_one")
)

# A many-to-one left join must neither add nor drop claim rows.
assert len(merged) == len(df_claim), "Merge changed the number of claim rows"

merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18001 entries, 0 to 18000
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   claim_number             18001 non-null  int64  
 1   subrogation              17999 non-null  float64
 2   claim_est_payout         18000 non-null  float64
 3   liab_prct                18000 non-null  float64
 4   claim_date               18000 non-null  object 
 5   claim_day_of_week        18000 non-null  object 
 6   channel                  18000 non-null  object 
 7   zip_code                 18000 non-null  float64
 8   witness_present_ind      18000 non-null  object 
 9   policy_report_filed_ind  18000 non-null  float64
 10  in_network_bodyshop      18000 non-null  object 
 11  accident_key             18001 non-null  int64  
 12  policyholder_key         18001 non-null  int64  
 13  vehicle_key              18001 non-null  int64  
 14  driver_key            

## Original dataset

In [17]:
# Read the original dataset that the merged tables are compared against.
original_csv_path = "../data/Training_TriGuard.csv"
df = pd.read_csv(original_csv_path)
df.head()

Unnamed: 0,subrogation,claim_number,year_of_born,gender,email_or_tel_available,safety_rating,annual_income,high_education_ind,address_change_ind,living_status,...,claim_est_payout,vehicle_made_year,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,age_of_DL,accident_type,in_network_bodyshop,vehicle_mileage
0,1.0,6090851,1990.0,F,0.0,75.0,70966.0,1.0,1.0,Rent,...,3218.84,2021.0,Large,16272.12725,red,21620.79697,25.0,multi_vehicle_clear,no,75421.0
1,0.0,4653734,1972.0,F,1.0,94.0,79723.0,1.0,1.0,Rent,...,1338.52,2025.0,Medium,34102.78197,silver,10840.5852,23.0,multi_vehicle_clear,yes,31988.0
2,0.0,1014777,2003.0,F,1.0,76.0,41527.0,1.0,1.0,Own,...,3540.05,2022.0,Compact,15000.0,silver,24318.12282,23.0,multi_vehicle_unclear,yes,60876.0
3,1.0,8101873,1983.0,F,1.0,54.0,42099.0,1.0,1.0,Rent,...,1507.94,2025.0,Medium,16984.45295,white,36958.26656,23.0,multi_vehicle_unclear,yes,152772.0
4,0.0,5081870,1985.0,F,1.0,54.0,47206.0,1.0,1.0,Own,...,5080.63,2021.0,Compact,46545.72863,blue,11779.17422,17.0,multi_vehicle_clear,yes,41151.0


In [18]:
# Remove the four key columns from the merged frame, then require that the
# remaining column names equal the original dataset's, regardless of order.
key_columns = ["accident_key", "policyholder_key", "vehicle_key", "driver_key"]
merged_dropped = merged.drop(columns=key_columns)

merged_column_names = set(merged_dropped.columns)
original_column_names = set(df.columns)
assert merged_column_names == original_column_names, "Column names mismatch!"
print("✅ Columns match (ignoring order)")

✅ Columns match (ignoring order)


In [19]:
# Restrict both frames to rows with a non-missing `subrogation` value and
# report their shapes so they can be compared.
original_has_subrogation = df["subrogation"].notna()
merged_has_subrogation = merged_dropped["subrogation"].notna()

df_filtered = df.loc[original_has_subrogation]
merged_filtered = merged_dropped.loc[merged_has_subrogation]

print("merged shape:", merged_filtered.shape)
print("df shape:", df_filtered.shape)

merged shape: (17999, 29)
df shape: (17999, 29)


In [20]:
# Final reproducibility check: after sorting columns by name (column order is
# ignored), the merged frame must equal the original frame exactly.
# On a mismatch the diff is printed and the AssertionError is re-raised, so the
# failure stops a "Restart & Run All" instead of being reduced to a printed line.
try:
    pd.testing.assert_frame_equal(
        merged_filtered.sort_index(axis=1),
        df_filtered.sort_index(axis=1),
    )
except AssertionError as e:
    print("❌ DataFrames differ:\n", e)
    raise
else:
    print("✅ DataFrames are identical")

✅ DataFrames are identical
