# 1 Combine and Dedupe Raw Datasets

**Raw Datasets:**
1. `NAnderson2020MendeleyMangoNIRData.csv`
2. `2022_ABS_AllMangoNIR.csv`
3. `MarceloSeveralVarieties_2018-2020_absorbancedata.csv`

File (1) is the originally published dataset. Files (2) and (3) were sourced from the `researchdata.cqu.edu/sensors-in-ag` shared drive.

(2) was compiled by Nicholas Anderson.
(3) was collected and compiled by Marcelo.

The file `data/raw/0_all_combined.xlsx` was manually prepared by combining all three datasets into one file.

The following notebook is intended to dedupe the data, as there may be overlap between the datasets or within a single dataset.

This will be done by identifying unique spectra, as it is deemed nearly impossible that the absorbance at every wavelength could be exactly the same.


## Setup

In [1]:
import pandas as pd

In [2]:
# Load the manually combined workbook that holds all three raw datasets.
df_all = pd.read_excel("data/raw/0_all_combined.xlsx")
# Report the row count before any deduplication so later counts can be compared with it.
print(f"Number of rows: {df_all.shape[0]}")
df_all.head()

Number of rows: 105495


Unnamed: 0,origin,population,date,season,region,cultivar,type,temp,heat_units,dm,...,1173,1176,1179,1182,1185,1188,1191,1194,1197,1200
0,NAnderson2020MendeleyMangoNIRData,2,2015-10-02,1,NT,Caly,Hard Green,Mid,,16.792506,...,0,0,0,0,0,0,0,0,0,0
1,NAnderson2020MendeleyMangoNIRData,2,2015-10-02,1,NT,Caly,Hard Green,Mid,,16.792506,...,0,0,0,0,0,0,0,0,0,0
2,NAnderson2020MendeleyMangoNIRData,2,2015-10-02,1,NT,Caly,Hard Green,Mid,,16.070979,...,0,0,0,0,0,0,0,0,0,0
3,NAnderson2020MendeleyMangoNIRData,2,2015-10-02,1,NT,Caly,Hard Green,Mid,,16.070979,...,0,0,0,0,0,0,0,0,0,0
4,NAnderson2020MendeleyMangoNIRData,2,2015-10-02,1,NT,Caly,Hard Green,Mid,,16.394013,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Collect the spectral columns: every column whose label starts with a digit
# is treated as a wavelength.
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a plain string literal it is an invalid escape sequence, which newer
# Python versions warn about.
wavelengths = df_all.filter(regex=r"^\d+", axis=1).columns.tolist()
pd.DataFrame(wavelengths)

Unnamed: 0,0
0,285
1,288
2,291
3,294
4,297
...,...
301,1188
302,1191
303,1194
304,1197


## Dedupe duplicate Spectra

In [4]:
# A spectrum is unique when no other row carries the exact same absorbance
# values across all wavelength columns (keep=False flags every member of a
# duplicated group, not just the repeats).
has_twin = df_all.duplicated(subset=wavelengths, keep=False)
df_unique = df_all.loc[~has_twin]
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 82683


In [5]:
# Gather every row whose spectrum occurs more than once, ordered so that
# identical spectra sit next to each other and, within a spectrum, rows are
# ordered by origin file.
shared_spectrum = df_all.duplicated(subset=wavelengths, keep=False)
df_duplicates = df_all.loc[shared_spectrum].sort_values(by=[*wavelengths, "origin"])

# Label each group of identical spectra with a 1-based id in the first column.
group_ids = df_duplicates.groupby(wavelengths).ngroup() + 1
df_duplicates.insert(loc=0, column="dup_num", value=group_ids)

print(f"Number of duplicate spectra: {len(df_duplicates)}")
df_duplicates

Number of duplicate spectra: 22812


Unnamed: 0,dup_num,origin,population,date,season,region,cultivar,type,temp,heat_units,...,1173,1176,1179,1182,1185,1188,1191,1194,1197,1200
19100,1,2022_ABS_AllMangoNIR,5,2015-10-23,1,NT,KP,Hard Green,Low,,...,0,0,0,0,0,0,0,0,0,0
23980,1,2022_ABS_AllMangoNIR,10,2015-10-25,1,NT,HG,Hard Green,Low,,...,0,0,0,0,0,0,0,0,0,0
19115,2,2022_ABS_AllMangoNIR,5,2015-10-23,1,NT,KP,Hard Green,Low,,...,0,0,0,0,0,0,0,0,0,0
23995,2,2022_ABS_AllMangoNIR,10,2015-10-25,1,NT,HG,Hard Green,Low,,...,0,0,0,0,0,0,0,0,0,0
19148,3,2022_ABS_AllMangoNIR,5,2015-10-23,1,NT,KP,Hard Green,Low,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,11404,NAnderson2020MendeleyMangoNIRData,10,2015-10-25,1,NT,HG,Hard Green,Mid,,...,0,0,0,0,0,0,0,0,0,0
37274,11405,2022_ABS_AllMangoNIR,48,2017-09-27,3,NT,1201,Hard Green,No,,...,0,0,0,0,0,0,0,0,0,0
8550,11405,NAnderson2020MendeleyMangoNIRData,48,2017-09-27,3,NT,1201,Hard Green,No,,...,0,0,0,0,0,0,0,0,0,0
12823,11406,MarceloSeveralVarieties_2018-2020_absorbancedata,HG1,NaT,2020,QLD,HoneyGold,,,1573.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Resolve spectra that are duplicated only because the same record appears in
# more than one origin file.
# Dry matter is compared at 2 decimal places via a helper column placed
# directly after "dm". The position is looked up by name instead of a
# hard-coded index, and the guard lets this cell be re-run without `insert`
# raising because the column already exists.
if "dm_rounded" not in df_duplicates.columns:
    df_duplicates.insert(
        loc=df_duplicates.columns.get_loc("dm") + 1,
        column="dm_rounded",
        value=df_duplicates["dm"].round(2),
    )

# Rows count as the same record when they agree on everything except the
# origin file, the sample column and the unrounded dm. df_duplicates is
# sorted by origin within each spectrum, so keep="last" retains the row whose
# origin name sorts last.
record_columns = df_duplicates.drop(["origin", "sample", "dm"], axis=1).columns.tolist()
df_duplicates = df_duplicates.drop_duplicates(subset=record_columns, keep="last")

# Spectra left with a single row are resolved and join the unique set;
# the rest stay in df_duplicates for the manual decisions in the next cells.
still_duplicated = df_duplicates.duplicated(subset=wavelengths, keep=False)
df_unique = pd.concat([
    df_unique,
    df_duplicates[~still_duplicated].drop(["dup_num", "dm_rounded"], axis=1),
])
df_duplicates = (
    df_duplicates[still_duplicated]
    .sort_values(by=(wavelengths + ["origin"]))
)
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 93824


In [7]:
# Instrument 15016 recorded identical spectra under populations 5 and 10.
# A fitted PLS model indicated they belong to population 10, so only those
# rows are kept and both populations are cleared from the duplicates.
on_15016 = df_duplicates["instrument"] == 15016
is_pop_10 = df_duplicates["population"] == 10
is_pop_5_or_10 = df_duplicates["population"].isin([5, 10])

kept_pop_10 = df_duplicates[is_pop_10 & on_15016].drop(columns=["dup_num", "dm_rounded"])
df_unique = pd.concat([df_unique, kept_pop_10])
df_duplicates = df_duplicates[~(is_pop_5_or_10 & on_15016)]
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 93924


In [8]:
# Population 20 on instrument 15006 appears in more than one origin file with
# the same dm but a different cultivar; the published dataset's rows win.
pop_20_on_15006 = (
    (df_duplicates["population"] == 20)
    & (df_duplicates["instrument"] == 15006)
)
from_published = df_duplicates["origin"] == "NAnderson2020MendeleyMangoNIRData"

published_pop_20 = df_duplicates[pop_20_on_15006 & from_published]
df_unique = pd.concat([
    df_unique,
    published_pop_20.drop(columns=["dup_num", "dm_rounded"]),
])
# Every population-20 / 15006 row is now settled, whichever file it came from.
df_duplicates = df_duplicates[~pop_20_on_15006]
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 94004


In [9]:
# Population 52 on instrument 15006 appears in more than one origin file with
# the same dm but a different cultivar; the published dataset's rows win.
on_15006 = df_duplicates["instrument"] == 15006
is_pop_52 = df_duplicates["population"] == 52
is_pop_52b = df_duplicates["population"] == "52b"
from_published = df_duplicates["origin"] == "NAnderson2020MendeleyMangoNIRData"

published_pop_52 = df_duplicates[is_pop_52 & on_15006 & from_published]
df_unique = pd.concat([
    df_unique,
    published_pop_52.drop(columns=["dup_num", "dm_rounded"]),
])
# Clear both the 52 and the "52b" labelled rows for this instrument from the
# outstanding duplicates.
df_duplicates = df_duplicates[~((is_pop_52 | is_pop_52b) & on_15006)]
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 94082


In [10]:
# Whatever is still duplicated and has a row from the published dataset is
# resolved in favour of that published row.
published_rows = df_duplicates[
    df_duplicates["origin"] == "NAnderson2020MendeleyMangoNIRData"
]
df_unique = pd.concat([
    df_unique,
    published_rows.drop(["dup_num", "dm_rounded"], axis=1),
])
# Drop every duplicate group that the published rows just resolved. The group
# ids are taken from the kept rows themselves rather than hard-coded
# (previously 4806, 5592, 5780, 10531): dup_num values come from
# groupby(...).ngroup() and shift whenever the input data changes.
df_duplicates = df_duplicates[
    ~df_duplicates["dup_num"].isin(published_rows["dup_num"])
]
print(f"Number of unique spectra: {len(df_unique)}")

Number of unique spectra: 94086


In [11]:
# Remaining duplicates: display the rows that none of the preceding steps
# resolved. They are left out of df_unique.
df_duplicates

Unnamed: 0,dup_num,origin,population,date,season,region,cultivar,type,temp,heat_units,...,1173,1176,1179,1182,1185,1188,1191,1194,1197,1200
23198,3577,2022_ABS_AllMangoNIR,9,2015-10-23,1,NT,KP,Hard Green,No,,...,0,0,0,0,0,0,0,0,0,0
23199,3577,2022_ABS_AllMangoNIR,9,2015-10-23,1,NT,KP,Hard Green,No,,...,0,0,0,0,0,0,0,0,0,0
26091,7991,2022_ABS_AllMangoNIR,12,2015-11-02,1,NT,Caly,Hard Green,High,,...,0,0,0,0,0,0,0,0,0,0
67615,7991,2022_ABS_AllMangoNIR,111,2019-01-25,4,QLD,Caly,Ripen,No,,...,0,0,0,0,0,0,0,0,0,0
12823,11406,MarceloSeveralVarieties_2018-2020_absorbancedata,HG1,NaT,2020,QLD,HoneyGold,,,1573.0,...,0,0,0,0,0,0,0,0,0,0
12824,11406,MarceloSeveralVarieties_2018-2020_absorbancedata,HG1,NaT,2020,QLD,HoneyGold,,,1573.0,...,0,0,0,0,0,0,0,0,0,0


## Output deduped data

In [12]:
# Persist the deduplicated spectra, leaving the DataFrame index out of the file.
df_unique.to_csv(
    path_or_buf="data/interim/1_combined_deduped.csv",
    index=False,
)