## In this cell, the original data is loaded, rows with a missing or "noise" species label are discarded, and the Start and End columns are removed because they are not used in the classification procedure.

In [None]:
import pandas as pd
from pathlib import Path

Subset = "Subset_C"  # which subset folder to process: Subset_A, Subset_B, Subset_C, etc.

# Input workbooks, both located inside the selected subset folder.
Original_file_path = Path(Subset) / "1_Original_Data" / "Labeled_Data_With_Features.xlsx"
Review_file_path = Path(Subset) / "2_Clean_Data" / "Labeled_Data_to_Review.xlsx"

df = pd.read_excel(Original_file_path)

# Keep only rows whose species label is present and is not the "noise" class,
# then drop the Start/End columns. The original row index is preserved on
# purpose: the next cell joins the review flags on it.
has_label = df["Specie ID"].notna()
not_noise = df["Specie ID"].ne("noise")
filtered_df = df.loc[has_label & not_noise].drop(columns=["Start", "End"])
filtered_df.head()

Unnamed: 0,File,Specie ID,FminVoc,FmaxVoc,Fdom,FCC1,FCC2,FCC3,FCC4,FCC5,...,FCC19,FCC20,FCC21,FCC22,FCC23,SpectralCentroid,Bandwidth,SpectralFlatness,Length,DeltaFreq
0,SMA03126_20210611_200000.wav,Boana_platanera,1.2,2500,2529.980488,-2.929353,2.864602,-1.81967,-1.013172,-2.809621,...,-0.088081,0.805879,0.583809,-0.054626,0.516533,142.705564,208.074249,0.052909,1.999,2498.8
1,SMA03126_20210611_200000.wav,Boana_platanera,2.0,2500,2530.780488,-3.852549,2.218462,-2.664421,-2.01189,-3.159486,...,0.139753,0.489719,-0.282179,-0.593289,-0.227796,148.697025,215.702444,0.056735,1.999,2498.0
2,SMA03126_20210611_200000.wav,Boana_platanera,8.8,2500,2537.580488,-2.007076,3.238212,-1.505718,-0.753454,-2.435039,...,-0.019906,0.923981,0.524091,-0.043802,0.336594,170.685233,294.070649,0.064193,1.999,2491.2
3,SMA03126_20210611_200000.wav,Boana_platanera,9.6,2500,2538.380488,-1.842339,3.285941,-1.423196,0.407423,-1.323533,...,-0.357885,0.675144,-0.244911,-0.29599,0.055367,160.675951,259.236291,0.05737,1.999,2490.4
4,SMA03126_20210611_200000.wav,Boana_platanera,12.8,2500,2518.165854,-2.599878,1.658349,-2.119815,-1.37177,-1.665656,...,0.038951,-0.581507,-1.090474,-1.311246,-0.66718,155.110997,263.300619,0.060152,1.999,2487.2


## In the following cell, the review flags file is loaded. Since it does not contain features, its `review` column is joined to the previous DataFrame using the original row index, and only the rows with `review == 0` are kept.

In [23]:
# Load the review flags and key them by the row index of the original data.
# Only the "review" column is joined onto the feature table below.
review_df = (
    pd.read_excel(Review_file_path)[["Specie ID", "Original Index", "review"]]
    .set_index("Original Index")
)

# A repeated "Original Index" would make the left join below emit one output
# row per duplicate, silently inflating the cleaned dataset. The index-set
# checks further down cannot detect that, so fail early and loudly instead.
if not review_df.index.is_unique:
    duplicated_keys = review_df.index[review_df.index.duplicated()].unique()
    raise ValueError(
        "Duplicate 'Original Index' values in the review file, e.g. "
        f"{duplicated_keys[:10].tolist()}"
    )

# join() aligns on the index and returns a new frame; filtered_df is not modified.
filtered_with_review = filtered_df.join(review_df["review"])

# --- Verification of index alignment ---
common_index = filtered_df.index.intersection(review_df.index)

print("=== Index Matching Summary ===")
print(f"Total rows in filtered_df: {len(filtered_df)}")
print(f"Total rows in review_df: {len(review_df)}")
print(f"Total matching indices: {len(common_index)}")

# Rows of filtered_df without a review flag end up with NaN in 'review'.
# They are reported here because the `review == 0` filter below drops them.
nan_rows = filtered_with_review["review"].isna().sum()
print(f"Rows in filtered_with_review with NaN in 'review': {nan_rows}")

# Optional: show sample indices without matches, in each direction.
missing_in_filtered = review_df.index.difference(filtered_df.index)
missing_in_review = filtered_df.index.difference(review_df.index)

if len(missing_in_filtered) > 0:
    print("\nExamples of indices in review_df not found in filtered_df:")
    print(missing_in_filtered[:10].tolist())

if len(missing_in_review) > 0:
    print("\nExamples of indices in filtered_df not found in review_df:")
    print(missing_in_review[:10].tolist())

# Keep only the rows whose review flag equals 0; the flag column itself is
# not a feature, so it is removed from the result.
keep_mask = filtered_with_review["review"] == 0
filtered_with_review = filtered_with_review.loc[keep_mask].drop(columns=["review"])
filtered_with_review.head(10)


=== Index Matching Summary ===
Total rows in filtered_df: 1382
Total rows in review_df: 1382
Total matching indices: 1382
Rows in filtered_with_review with NaN in 'review': 0


Unnamed: 0,File,Specie ID,FminVoc,FmaxVoc,Fdom,FCC1,FCC2,FCC3,FCC4,FCC5,...,FCC19,FCC20,FCC21,FCC22,FCC23,SpectralCentroid,Bandwidth,SpectralFlatness,Length,DeltaFreq
0,SMA03126_20210611_200000.wav,Boana_platanera,1.2,2500,2529.980488,-2.929353,2.864602,-1.81967,-1.013172,-2.809621,...,-0.088081,0.805879,0.583809,-0.054626,0.516533,142.705564,208.074249,0.052909,1.999,2498.8
1,SMA03126_20210611_200000.wav,Boana_platanera,2.0,2500,2530.780488,-3.852549,2.218462,-2.664421,-2.01189,-3.159486,...,0.139753,0.489719,-0.282179,-0.593289,-0.227796,148.697025,215.702444,0.056735,1.999,2498.0
2,SMA03126_20210611_200000.wav,Boana_platanera,8.8,2500,2537.580488,-2.007076,3.238212,-1.505718,-0.753454,-2.435039,...,-0.019906,0.923981,0.524091,-0.043802,0.336594,170.685233,294.070649,0.064193,1.999,2491.2
3,SMA03126_20210611_200000.wav,Boana_platanera,9.6,2500,2538.380488,-1.842339,3.285941,-1.423196,0.407423,-1.323533,...,-0.357885,0.675144,-0.244911,-0.29599,0.055367,160.675951,259.236291,0.05737,1.999,2490.4
4,SMA03126_20210611_200000.wav,Boana_platanera,12.8,2500,2518.165854,-2.599878,1.658349,-2.119815,-1.37177,-1.665656,...,0.038951,-0.581507,-1.090474,-1.311246,-0.66718,155.110997,263.300619,0.060152,1.999,2487.2
5,SMA03126_20210611_200000.wav,Boana_platanera,17.2,2500,2522.565854,-2.231358,3.275191,-1.493123,2.892798,-2.114041,...,-0.326249,-0.642316,-0.361097,-0.156929,-0.281566,161.134801,265.225309,0.058895,2.799,2482.8
6,SMA03126_20210611_200000.wav,Boana_platanera,25.2,2500,2530.565854,-1.903604,3.403911,-1.553224,1.873809,-1.544293,...,0.193577,-0.548168,-0.291512,0.010809,0.167503,162.60107,273.974257,0.052909,2.399,2474.8
7,SMA03126_20210611_200000.wav,Boana_platanera,29.2,2500,2534.565854,-1.467641,3.513595,-0.824378,-2.062945,-1.601723,...,-0.071716,1.93924,0.600671,-0.085757,0.30496,142.140039,236.901585,0.040205,1.999,2470.8
8,SMA03126_20210611_200000.wav,Boana_platanera,35.2,2500,2517.15122,-0.785767,3.239143,-1.170075,1.087914,-0.383085,...,-0.033328,-0.003907,-0.474978,-0.033348,0.122121,196.594358,339.436896,0.060914,2.399,2464.8
9,SMA03126_20210611_200000.wav,Boana_platanera,46.0,2500,2527.95122,-0.636378,3.048971,-1.369568,0.232421,-0.16225,...,0.24354,-0.413989,-0.32306,0.18223,-0.039081,191.605863,342.639751,0.058356,1.999,2454.0


## Now, both the filtered original data and the cleaned (reviewed) data are saved for use in the classification task

In [24]:
# Write both datasets to CSV, keeping the original row index as the first column.
csv_outputs = {
    f"{Subset}/1_Original_Data/original_data_for_classification.csv": filtered_df,
    f"{Subset}/2_Clean_Data/cleaned_data_for_classification.csv": filtered_with_review,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=True)