In [1]:
import pandas as pd

# Data currently in use: read it and reduce its date column to plain
# datetime.date objects (default pandas date parsing).
inuse = pd.read_csv("data/end/2016_2023_MRD_VNX_SVT.csv")
inuse["date"] = pd.to_datetime(inuse["date"]).dt.date

# Revised ("double check 2020") data: same treatment, except that the dates are
# parsed element by element (format="mixed") with a day-first preference.
revised = pd.read_csv("data/full_data_csv/2016_2023_MRD_VNX_SVT_DoubleCheck2020.csv")
revised["date"] = pd.to_datetime(revised["date"], dayfirst=True, format="mixed").dt.date

# Comparing data sources

In [2]:
# Label the output, then display the first five rows of the in-use data.
print("inuse\n")
inuse.head(n=5)

inuse



Unnamed: 0,date,location,sample,date_sample,event,before event,after event,medium,label,count,coef,week,doy,year,color,image
0,2016-07-05,MRD,MRD1,"('2016-07-05', 'MRD1')",True,False,False,EasyGel,Bioindicator,0.0,250,27,187,2016,big_blue,none
1,2016-07-12,MRD,MRD1,"('2016-07-12', 'MRD1')",True,False,False,EasyGel,Bioindicator,22.0,25,28,194,2016,big_blue,none
2,2016-07-19,MRD,MRD1,"('2016-07-19', 'MRD1')",False,False,True,EasyGel,Bioindicator,8.0,25,29,201,2016,big_blue,none
3,2016-06-21,MRD,MRD1,"('2016-06-21', 'MRD1')",False,True,False,EasyGel,Bioindicator,2.0,100,25,173,2016,big_blue,none
4,2016-06-28,MRD,MRD1,"('2016-06-28', 'MRD1')",False,True,False,EasyGel,Bioindicator,0.0,25,26,180,2016,big_blue,none


In [3]:
# Label the output, then display the first five rows of the revised data.
print("revised\n")
revised.head(n=5)

revised



Unnamed: 0,date,location,sample,date_sample,event,before event,after event,medium,label,count,coef,week,doy,year,color,image
0,2016-07-05,MRD,MRD1,"('2016-07-05', 'MRD1')",True,False,False,EasyGel,Bioindicator,0.0,250,27,187,2016,big_blue,none
1,2016-07-12,MRD,MRD1,"('2016-07-12', 'MRD1')",True,False,False,EasyGel,Bioindicator,22.0,25,28,194,2016,big_blue,none
2,2016-07-19,MRD,MRD1,"('2016-07-19', 'MRD1')",False,False,True,EasyGel,Bioindicator,8.0,25,29,201,2016,big_blue,none
3,2016-06-21,MRD,MRD1,"('2016-06-21', 'MRD1')",False,True,False,EasyGel,Bioindicator,2.0,100,25,173,2016,big_blue,none
4,2016-06-28,MRD,MRD1,"('2016-06-28', 'MRD1')",False,True,False,EasyGel,Bioindicator,0.0,25,26,180,2016,big_blue,none


## number of records

In [4]:
# Report the number of rows in each frame, one line per data source.
print(f'records inuse = {inuse.shape[0]}')
print(f'records revised = {revised.shape[0]}')

records inuse = 3036
records revised = 3036


## mediums

In [5]:
# Distinct values of the `medium` column in the revised data.
revised["medium"].unique()

array(['EasyGel', 'ECC-A', 'ECC', 'E coli', 'LB', 'Levine'], dtype=object)

In [6]:
# Distinct values of the `medium` column in the in-use data.
inuse["medium"].unique()

array(['EasyGel', 'ECC-A', 'ECC', 'E coli', 'LB', 'Levine'], dtype=object)

## samples

In [7]:
# Number of distinct `date_sample` values in the in-use data.
print(f'inuse unique date_samples = {inuse["date_sample"].nunique()}')

inuse unique date_samples = 573


In [8]:
# Number of distinct `date_sample` values in the revised data.
print(f'revised unique date_samples = {revised["date_sample"].nunique()}')

revised unique date_samples = 573


## locations

In [9]:
# Distinct values of the `location` column in the in-use data.
print(f'inuse unique locations = {inuse["location"].unique()}')

inuse unique locations = ['MRD' 'SVT' 'VNX']


In [10]:
# Distinct values of the `location` column in the revised data.
print(f'revised unique locations = {revised["location"].unique()}')

revised unique locations = ['MRD' 'SVT' 'VNX']


### samples by location

In [11]:
# Label the output, then count the distinct `date_sample` values per location
# in the in-use data.
print("inuse\n")
inuse.groupby("location")["date_sample"].nunique()

inuse



location
MRD    188
SVT    194
VNX    191
Name: date_sample, dtype: int64

In [12]:
# Label the output, then count the distinct `date_sample` values per location
# in the revised data.
print("revised\n")
revised.groupby("location")["date_sample"].nunique()

revised



location
MRD    188
SVT    194
VNX    191
Name: date_sample, dtype: int64

## Differences

### Count differences

In [13]:
# Cell-by-cell comparison of the two frames. The result has one
# ("inuse", "revised") column pair for every column that differs somewhere, and
# cells that are equal in both frames are blanked out with NaN.
difs = inuse.compare(revised, align_axis=1, result_names=("inuse", "revised"), keep_equal=False)

# Keep the rows where `count` / `coef` actually differ. An unchanged row is NaN
# on BOTH sides, so drop only those (how="all"). The default how="any" would
# also discard real changes where one side is missing (NaN -> value or
# value -> NaN) and under-report the differences.
count_difs = difs["count"].dropna(how="all")
coef_difs = difs["coef"].dropna(how="all")

In [14]:
# Date and sample of every in-use record whose `count` differs in the revised
# data. Rows and columns are selected in a single .loc call and copied
# explicitly, so the date assignment below writes to an independent frame
# rather than to a chained slice of `inuse` (avoids SettingWithCopyWarning).
dssamps_changed = inuse.loc[count_difs.index, ["date", "sample"]].copy()
dssamps_changed["date"] = pd.to_datetime(dssamps_changed["date"]).dt.date
changed = dssamps_changed["sample"].unique()
nchanged = len(dssamps_changed)

# Summary: how many records, which samples, and on which dates.
print(f"The number of inuse records where the count will change = {nchanged}\n")
print(f"The samples that will be changed\n {changed}\n")
print(f"The samples are on these dates\n {dssamps_changed['date'].unique()}")

The number of inuse records where the count will changed = 69

The samples that will be changed
 ['SVT1' 'MRD1' 'SVT2' 'SVT3' 'MRD2' 'MRD3' 'VNX1' 'VNX2' 'VNX3']

The samples are on these dates
 [datetime.date(2020, 6, 11) datetime.date(2020, 7, 9)
 datetime.date(2020, 8, 6) datetime.date(2020, 6, 18)]


In [15]:
# First five in-use records among those whose `count` differs in the revised data.
inuse.loc[count_difs.index, :].head(n=5)

Unnamed: 0,date,location,sample,date_sample,event,before event,after event,medium,label,count,coef,week,doy,year,color,image
1649,2020-06-11,SVT,SVT1,"(datetime.date(2020, 6, 11), 'SVT1')",False,True,False,Levine,Bioindicator,10.0,100,24,163,2020,green_met,IMG_9315.JPG
1653,2020-06-11,MRD,MRD1,"(datetime.date(2020, 6, 11), 'MRD1')",False,True,False,Levine,Bioindicator,10.0,100,24,163,2020,green_met,IMG_9319.JPG
1794,2020-07-09,SVT,SVT2,"(datetime.date(2020, 7, 9), 'SVT2')",False,False,False,Levine,Coliform,0.0,100,28,191,2020,purple,IMG_0097.JPG
1795,2020-07-09,SVT,SVT3,"(datetime.date(2020, 7, 9), 'SVT3')",False,False,False,Levine,Coliform,0.0,100,28,191,2020,purple,IMG_0098.JPG
1797,2020-07-09,MRD,MRD1,"(datetime.date(2020, 7, 9), 'MRD1')",False,False,False,Levine,Coliform,0.0,100,28,191,2020,purple,IMG_0100.JPG


### Coefficient differences

In [16]:
# Date and sample of every in-use record whose `coef` differs in the revised
# data. Rows and columns are selected in a single .loc call and copied
# explicitly, so the date assignment below writes to an independent frame
# rather than to a chained slice of `inuse` (avoids SettingWithCopyWarning).
coef_changed = inuse.loc[coef_difs.index, ["date", "sample"]].copy()
coef_changed["date"] = pd.to_datetime(coef_changed["date"]).dt.date
changed = coef_changed["sample"].unique()
nchanged = len(coef_changed)

# Summary: how many records, which samples, and on which dates.
print(f"The number of inuse records where the coefficient will change = {nchanged}\n")
print(f"The samples that will be changed\n {changed}\n")
print(f"The samples are on these dates\n {coef_changed['date'].unique()}")

The number of inuse records where the coefficient will changed = 242

The samples that will be changed
 ['VNX1' 'SVT1' 'MRD1' 'VNX2' 'SVT2' 'MRD2' 'VNX3' 'SVT3' 'MRD3']

The samples are on these dates
 [datetime.date(2017, 7, 17) datetime.date(2017, 7, 24)
 datetime.date(2017, 7, 3)]


In [17]:
# First five in-use records among those whose `coef` differs in the revised data.
inuse.loc[coef_difs.index, :].head(n=5)

Unnamed: 0,date,location,sample,date_sample,event,before event,after event,medium,label,count,coef,week,doy,year,color,image
2142,2017-07-17,VNX,VNX1,"('2017-07-17', 'VNX1')",False,False,True,EasyGel,Bioindicator,0.0,100,29,198,2017,fluo_halo,VNX1-19_51_41.jpg
2143,2017-07-17,SVT,SVT1,"('2017-07-17', 'SVT1')",False,False,True,EasyGel,Bioindicator,0.0,100,29,198,2017,fluo_halo,SVT1-19_26_35.jpg
2144,2017-07-17,MRD,MRD1,"('2017-07-17', 'MRD1')",False,False,True,EasyGel,Bioindicator,0.0,100,29,198,2017,fluo_halo,MRD1-18_38_11.jpg
2148,2017-07-24,SVT,SVT1,"('2017-07-24', 'SVT1')",False,False,True,Levine,Bioindicator,0.0,100,30,205,2017,fluo_halo,SVT1-22_53_48.JPG
2149,2017-07-24,SVT,SVT1,"('2017-07-24', 'SVT1')",False,False,True,EasyGel,Bioindicator,0.0,100,30,205,2017,fluo_halo,SVT1-13_10_25.JPG
