# Some LB probing results to share
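In this notebook, I tested some basic assumptions about the test dataset by leaderboard (LB) probing.
I demonstrated that the following assumptions are all TRUE for the hidden test set. (The outputs shown below come from the 4-row public `test.csv`, so some cells print `False` here.)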
* There are no new site IDs in test dataset.
* Patient IDs in train and test sets do not overlap
* Image IDs in train and test sets do not overlap
* There are no new laterality values in test dataset.
* There are new machine IDs in test dataset. (This is already raised by patriot in [here](https://www.kaggle.com/competitions/rsna-breast-cancer-detection/discussion/369362))
* There are no new view values in test dataset.
* The number of images per patient is always >= 4.
* Site ID is always the same for each patient.
* Age is always the same for each patient.
* There is no overlap of machine IDs between the two sites in test dataset.
* Some patients underwent mammography with multiple machines.
* No. of images in site ID 1 > No. of images in site ID 2. (by @yujiariyasu)
* All patients have CC and MLO images for both sides
* More than 40% of images are from machine ID 49 (43% for train dataset) (by @kaggleqrdl)
* Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2. (by @kaggleqrdl)
* 1-2% of patients use implants (1.4% for train set).(by @kaggleqrdl)

I would be happy if anyone corrects me where I am wrong.
I would also be very happy if anyone shares other assumptions/hypotheses about the test dataset.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three competition tables in one pass: train metadata, the test
# metadata visible to this kernel, and the sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/rsna-breast-cancer-detection/train.csv",
        "/kaggle/input/rsna-breast-cancer-detection/test.csv",
        "/kaggle/input/rsna-breast-cancer-detection/sample_submission.csv",
    )
)

# Report the table sizes first, then preview the first rows of each table.
print("train shape:", train_df.shape)
print("test shape:", test_df.shape)
print("sub_df shape:", sub_df.shape)
display(train_df.head(), test_df.head(), sub_df.head())

train shape: (54706, 14)
test shape: (4, 9)
sub_df shape: (2, 2)


Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,implant,machine_id,prediction_id
0,2,10008,736471439,L,MLO,81,0,21,10008_L
1,2,10008,1591370361,L,CC,81,0,21,10008_L
2,2,10008,68070693,R,MLO,81,0,21,10008_R
3,2,10008,361203119,R,CC,81,0,21,10008_R


Unnamed: 0,prediction_id,cancer
0,10008_L,0.021168
1,10008_R,0.021168


In [3]:
def get_num_unique(train_df, test_df, col):
    """Count distinct values of ``col`` in train, in test, and in both pooled.

    Args:
        train_df: DataFrame containing column ``col``.
        test_df: DataFrame containing column ``col``.
        col: Name of the column to inspect.

    Returns:
        Tuple ``(n_train, n_test, n_all)`` where ``n_all`` is the number of
        distinct values after concatenating the two frames row-wise.
    """
    def _n_distinct(frame):
        # len(unique()) counts every distinct value that unique() reports.
        return len(frame[col].unique())

    pooled_df = pd.concat([train_df, test_df])
    return _n_distinct(train_df), _n_distinct(test_df), _n_distinct(pooled_df)

def add_count(df, col):
    """Return ``df`` with an extra column holding the row count of each group.

    Args:
        df: Input DataFrame.
        col: A single column name (``str``) or a list/tuple of column names
            that together define the groups.

    Returns:
        A new DataFrame: ``df`` inner-merged with the per-group row counts.
        The added column is named ``"<col>_count"`` for a single column, or
        the column names joined with ``"_"`` plus ``"_count"`` for several
        (e.g. ``["a", "b"]`` -> ``"a_b_count"``).
    """
    # Normalise to a list so one code path serves both the single-column and
    # the multi-column case.
    keys = [col] if isinstance(col, str) else list(col)
    count_name = "_".join(keys) + "_count"

    # size() counts rows per group and yields a Series, so rename() sets the
    # name of the count column; reset_index() then turns the group keys back
    # into ordinary columns that merge() can join on.
    counts = df.groupby(keys).size().rename(count_name).reset_index()
    return df.merge(counts, on=keys, how="inner")

In [4]:
# One boolean is appended per tested hypothesis, in notebook order; the last
# cell branches on all(hypotheses).
hypotheses = list()

# There are no new site IDs in test dataset.

In [5]:
# get_num_unique returns the distinct-value counts for train, test and the
# pooled train+test frame, in that order.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(train_df, test_df, 'site_id')
print(f'num_unique_train: {num_unique_train}')
print(f'num_unique_test: {num_unique_test}')
print(f'num_unique_all: {num_unique_all}')
# "No new site ID" holds exactly when pooling test with train adds no distinct
# value, so compare against the train count instead of a hard-coded 2.
hypothesis = (num_unique_all == num_unique_train)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 2
num_unique_test: 1
num_unique_all: 2
hyposthesis: True


# Patient IDs in train and test sets do not overlap

In [6]:
# Train and test patient IDs are disjoint exactly when the distinct counts of
# the two sets add up to the distinct count of the pooled set.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(
    train_df, test_df, 'patient_id'
)
print(
    f'num_unique_train: {num_unique_train}',
    f'num_unique_test: {num_unique_test}',
    f'num_unique_all: {num_unique_all}',
    sep='\n',
)
hypothesis = (num_unique_all == num_unique_train + num_unique_test)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 11913
num_unique_test: 1
num_unique_all: 11914
hyposthesis: True


# Image IDs in train and test sets do not overlap

In [7]:
# Same disjointness test as for patient IDs, applied to image IDs: no shared
# ID means the per-set distinct counts sum to the pooled distinct count.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(
    train_df, test_df, 'image_id'
)
print(
    f'num_unique_train: {num_unique_train}',
    f'num_unique_test: {num_unique_test}',
    f'num_unique_all: {num_unique_all}',
    sep='\n',
)
hypothesis = (num_unique_all == num_unique_train + num_unique_test)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 54706
num_unique_test: 4
num_unique_all: 54710
hyposthesis: True


# There are no new laterality values in test dataset.

In [8]:
# get_num_unique returns the distinct-value counts for train, test and the
# pooled train+test frame, in that order.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(train_df, test_df, 'laterality')
print(f'num_unique_train: {num_unique_train}')
print(f'num_unique_test: {num_unique_test}')
print(f'num_unique_all: {num_unique_all}')
# Counting test values alone cannot detect an unseen value (e.g. one known
# value plus one new value still gives 2). "No new laterality values" holds
# exactly when pooling test with train adds no distinct value.
hypothesis = (num_unique_all == num_unique_train)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 2
num_unique_test: 2
num_unique_all: 2
hyposthesis: True


# There are new machine IDs in test dataset.

In [9]:
# Test contains a machine ID unseen in train exactly when the pooled distinct
# count differs from the train-only distinct count.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(
    train_df, test_df, 'machine_id'
)
print(
    f'num_unique_train: {num_unique_train}',
    f'num_unique_test: {num_unique_test}',
    f'num_unique_all: {num_unique_all}',
    sep='\n',
)
hypothesis = (num_unique_all != num_unique_train)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 10
num_unique_test: 1
num_unique_all: 10
hyposthesis: False


# There are no new view values in test dataset.

In [10]:
# get_num_unique returns the distinct-value counts for train, test and the
# pooled train+test frame, in that order.
num_unique_train, num_unique_test, num_unique_all = get_num_unique(train_df, test_df, 'view')
print(f'num_unique_train: {num_unique_train}')
print(f'num_unique_test: {num_unique_test}')
print(f'num_unique_all: {num_unique_all}')
# "No new view values" holds exactly when pooling test with train adds no
# distinct value, so compare against the train count instead of a hard-coded 6.
hypothesis = (num_unique_all == num_unique_train)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

num_unique_train: 6
num_unique_test: 2
num_unique_all: 6
hyposthesis: True


# The number of images per patient is always >= 4.

In [11]:
# Attach the per-patient row count to every test row, then keep a single row
# per patient so each patient contributes one count.
counted_df = add_count(test_df, 'patient_id')
temp_df = counted_df.drop_duplicates('patient_id')
display(temp_df.head())
# The smallest per-patient count decides whether every patient has >= 4 images.
hypothesis = (temp_df['patient_id_count'].min() >= 4)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,implant,machine_id,prediction_id,patient_id_count
0,2,10008,736471439,L,MLO,81,0,21,10008_L,4


hyposthesis: True


# Site ID is always the same for each patient.

In [12]:
# If every patient has a single site, the number of distinct
# (patient, site) pairs equals the number of distinct patients.
len1 = len(test_df[['patient_id']].drop_duplicates())
len2 = len(test_df[['patient_id', 'site_id']].drop_duplicates())
print(f'len1: {len1}', f'len2: {len2}', sep='\n')
hypothesis = (len2 == len1)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

len1: 1
len2: 1
hyposthesis: True


# Age is always the same for each patient.

In [13]:
# If every patient has a single age, the number of distinct
# (patient, age) pairs equals the number of distinct patients.
len1 = len(test_df[['patient_id']].drop_duplicates())
len2 = len(test_df[['patient_id', 'age']].drop_duplicates())
print(f'len1: {len1}', f'len2: {len2}', sep='\n')
hypothesis = (len2 == len1)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

len1: 1
len2: 1
hyposthesis: True


# There is no overlap of machine IDs between the two sites in test dataset.

In [14]:
# A machine shared by both sites would appear in two (site, machine) pairs,
# making the pair count exceed the distinct machine count.
len1 = len(test_df[['machine_id']].drop_duplicates())
len2 = len(test_df[['site_id', 'machine_id']].drop_duplicates())
print(f'len1: {len1}', f'len2: {len2}', sep='\n')
hypothesis = (len2 == len1)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

len1: 1
len2: 1
hyposthesis: True


# Some patients underwent mammography with multiple machines.

In [15]:
# A patient imaged on more than one machine contributes several
# (patient, machine) pairs, so the pair count then differs from the patient count.
len1 = len(test_df[['patient_id']].drop_duplicates())
len2 = len(test_df[['patient_id', 'machine_id']].drop_duplicates())
print(f'len1: {len1}', f'len2: {len2}', sep='\n')
hypothesis = (len2 != len1)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

len1: 1
len2: 1
hyposthesis: False


This hypothesis is true for the train set, as the next cell shows.

In [16]:
# Same pair-count comparison on the train set. The result is only printed
# here; it is not appended to `hypotheses`.
len1 = len(train_df[['patient_id']].drop_duplicates())
len2 = len(train_df[['patient_id', 'machine_id']].drop_duplicates())
print(f'len1: {len1}', f'len2: {len2}', sep='\n')
hypothesis = (len2 != len1)
print(f'hyposthesis: {hypothesis}')
# Show every train row of patient 22637 as a concrete example.
display(train_df.loc[train_df['patient_id'] == 22637])

len1: 11913
len2: 11914
hyposthesis: True


Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
11792,1,22637,347017785,L,MLO,65.0,0,0,0,,0,A,93,False
11793,1,22637,1172178848,L,MLO,65.0,0,0,0,,0,A,93,False
11794,1,22637,1993684471,L,CC,65.0,0,0,0,,0,A,93,False
11795,1,22637,286197565,R,MLO,65.0,0,0,0,0.0,0,A,93,True
11796,1,22637,425671587,R,MLO,65.0,0,0,0,0.0,0,A,93,True
11797,1,22637,1344937704,R,CC,65.0,0,0,0,0.0,0,A,190,True
11798,1,22637,1574740592,R,CC,65.0,0,0,0,0.0,0,A,93,True


# No. of images in site ID 1 >  No. of images in site ID 2
Suggested by @yujiariyasu

In [17]:
# Assuming site_id only takes the values 1 and 2, a per-image mean below 1.5
# means site 1 contributes more images than site 2.
mean_site_id_train = train_df['site_id'].mean()
mean_site_id_test = test_df['site_id'].mean()
print(
    f'mean site ID train: {mean_site_id_train}',
    f'mean site ID test: {mean_site_id_test}',
    sep='\n',
)
hypothesis = (1.5 > mean_site_id_test)
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

mean site ID train: 1.460406536760136
mean site ID test: 2.0
hyposthesis: False


# All patients have CC and MLO images for both sides

In [18]:
# Row masks for each side and each of the two views of interest.
is_left = test_df['laterality'] == 'L'
is_right = test_df['laterality'] == 'R'
is_cc = test_df['view'] == 'CC'
is_mlo = test_df['view'] == 'MLO'

# len1: all patients; len2..len5: patients having at least one image for the
# given (side, view) combination.
len1 = len(test_df['patient_id'].unique())
len2 = len(test_df.loc[is_left & is_cc, 'patient_id'].unique())
len3 = len(test_df.loc[is_left & is_mlo, 'patient_id'].unique())
len4 = len(test_df.loc[is_right & is_cc, 'patient_id'].unique())
len5 = len(test_df.loc[is_right & is_mlo, 'patient_id'].unique())
print(
    f'len1: {len1}',
    f'len2: {len2}',
    f'len3: {len3}',
    f'len4: {len4}',
    f'len5: {len5}',
    sep='\n',
)
# All five counts agree exactly when they collapse to a single distinct value.
hypothesis = len({len1, len2, len3, len4, len5}) == 1
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

len1: 1
len2: 1
len3: 1
len4: 1
len5: 1
hyposthesis: True


# More than 40% of images are from machine ID 49 (43% for train dataset)
Suggested by @kaggleqrdl

In [19]:
# Share of test images taken on machine 49.
test_machine_49_count = test_df.query("machine_id == 49").shape[0]
test_len = test_df.shape[0]
test_machine_49_ratio = test_machine_49_count / test_len
print(
    f'test_machine_49_count: {test_machine_49_count}',
    f'test_len: {test_len}',
    f'test_machine_49_ratio: {test_machine_49_ratio}',
    sep='\n',
)
hypothesis = 0.40 < test_machine_49_ratio
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

test_machine_49_count: 0
test_len: 4
test_machine_49_ratio: 0.0
hyposthesis: False


In [20]:
# Reference value: share of train images taken on machine 49.
train_machine_49_count = train_df.query("machine_id == 49").shape[0]
train_len = train_df.shape[0]
train_machine_49_ratio = train_machine_49_count / train_len
print(
    f'train_machine_49_count: {train_machine_49_count}',
    f'train_len: {train_len}',
    f'train_machine_49_ratio: {train_machine_49_ratio}',
    sep='\n',
)

train_machine_49_count: 23529
train_len: 54706
train_machine_49_ratio: 0.43009907505575257


# Mean age of patients is between 56-61 (58.6 for train set), and patients in site 1 are younger than those in site 2.
Suggested by @kaggleqrdl

In [21]:
def _mean_patient_age(frame):
    """Mean of `age` over one row per `patient_id` of `frame`."""
    return frame.drop_duplicates('patient_id').age.mean()


# Overall and per-site mean ages; site filtering happens before the
# per-patient de-duplication.
mean_age_train = _mean_patient_age(train_df)
mean_age_site1_train = _mean_patient_age(train_df[train_df.site_id == 1])
mean_age_site2_train = _mean_patient_age(train_df[train_df.site_id == 2])
mean_age_test = _mean_patient_age(test_df)
mean_age_site1_test = _mean_patient_age(test_df[test_df.site_id == 1])
mean_age_site2_test = _mean_patient_age(test_df[test_df.site_id == 2])
print(
    f'mean_age_train: {mean_age_train}',
    f'mean_age_site1_train: {mean_age_site1_train}',
    f'mean_age_site2_train: {mean_age_site2_train}',
    f'mean_age_test: {mean_age_test}',
    f'mean_age_site1_test: {mean_age_site1_test}',
    f'mean_age_site2_test: {mean_age_site2_test}',
    sep='\n',
)
# Both parts must hold: the test mean lies strictly inside (56, 61) and site 1
# is younger than site 2. A NaN site mean makes the comparison False.
age_in_range = (mean_age_test > 56) & (61 > mean_age_test)
site1_is_younger = (mean_age_site1_test < mean_age_site2_test)
hypothesis = age_in_range & site1_is_younger
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

mean_age_train: 58.63821923561529
mean_age_site1_train: 57.34509466437177
mean_age_site2_train: 59.87087776866284
mean_age_test: 81.0
mean_age_site1_test: nan
mean_age_site2_test: 81.0
hyposthesis: False


# 1-2% of patients use implants (1.4% for train set).
Suggested by @kaggleqrdl

In [22]:
# Implant rate computed over one row per (patient, laterality) pair.
breast_keys = ['patient_id', 'laterality']
mean_implant_train = train_df.drop_duplicates(breast_keys).implant.mean()
mean_implant_test = test_df.drop_duplicates(breast_keys).implant.mean()
print(
    f'mean_implant_train: {mean_implant_train}',
    f'mean_implant_test: {mean_implant_test}',
    sep='\n',
)
# True only when the test rate lies strictly between 1% and 2%.
above_lower_bound = (mean_implant_test > 0.01)
below_upper_bound = (0.02 > mean_implant_test)
hypothesis = above_lower_bound & below_upper_bound
print(f'hyposthesis: {hypothesis}')
hypotheses.append(hypothesis)

mean_implant_train: 0.014354066985645933
mean_implant_test: 0.0
hyposthesis: False


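# If the submission of this notebook is successful, all hypotheses are TRUE
The last cell writes one row per unique `prediction_id` only when every hypothesis holds; otherwise it writes one row per image (repeated `prediction_id`s), which is intended to make the submission fail.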

In [23]:
# Show each individual result, then whether all of them hold.
print(hypotheses, all(hypotheses), sep='\n')

[True, True, True, True, False, True, True, True, True, True, False, False, True, False, False, False]
False


In [24]:
# Choose the prediction_id column: de-duplicated IDs when every hypothesis
# holds, otherwise the raw per-image column (IDs may repeat).
if all(hypotheses):
    prediction_ids = test_df['prediction_id'].unique()
else:
    prediction_ids = test_df['prediction_id']

# Scores are random placeholders, one per emitted row.
submission = pd.DataFrame(
    data={
        'prediction_id': prediction_ids,
        'cancer': np.random.random(len(prediction_ids)),
    }
)
submission.to_csv('submission.csv', index=False)
display(submission.head())

Unnamed: 0,prediction_id,cancer
0,10008_L,0.876897
1,10008_L,0.599107
2,10008_R,0.74573
3,10008_R,0.793492
