In [2]:
import os
from collections import defaultdict
import pandas as pd

In [20]:
def get_rows(input_file):
    assert os.path.isfile(input_file) == True, f"File `{input_file}` not found."
    with open(input_file, mode='r') as fs:
        rows = fs.read().splitlines()
    return rows


def get_stats(rows):
    if isinstance(rows, str):
        rows = get_rows(rows) #the input rows is a path instead of a list
    class_stats = defaultdict(int)
    classes = list(map(lambda x: x.split("/")[1], rows))
    for cls in classes:
        class_stats[cls] += 1
    print(class_stats)
    return class_stats

def dict_arithmetic(dictA, dictB, operation='-'):
    new_dict = defaultdict(int)
    keysA = set(dictA.keys())
    keysB = set(dictB.keys())
    keys = keysA.union(keysB)
    if operation == '-':
        for key in keys:
            new_dict[key] = max(0, dictA[key] - dictB[key])
    if operation == '+':
        for key in keys:
            new_dict[key] = max(0, dictA[key] + dictB[key])

    return new_dict 
        


In [21]:
orig_stats = get_stats("/Users/test/Documents/Projects/Master/nips4bplus/txts/cleaned_train.txt")
inter_stats = get_stats("/Users/test/Documents/Projects/Master/nips4bplus/txts/cleaned_train-interpolation.txt")
noise_stats = get_stats("/Users/test/Documents/Projects/Master/nips4bplus/txts/cleaned_train-noise.txt")
noise_inter_stats = get_stats("/Users/test/Documents/Projects/Master/nips4bplus/txts/cleaned_train-noise-inter.txt")
orig_noise_inter_stats = get_stats("/Users/test/Documents/Projects/Master/nips4bplus/txts/cleaned_train-orig-noise-inter.txt")


defaultdict(<class 'int'>, {'Alaarv_song': 16, 'Cetcet_song': 16, 'Erirub_song': 21, 'Galcri_call': 18, 'Parate_song': 18, 'Serser_song': 15, 'Sylcan_call': 15, 'Sylcan_song': 18, 'Turmer_call': 19, 'Turphi_song': 21})
defaultdict(<class 'int'>, {'Parate_song': 90, 'Serser_song': 75, 'Erirub_song': 105, 'Sylcan_song': 90, 'Galcri_call': 90, 'Alaarv_song': 80, 'Turphi_song': 105, 'Turmer_call': 95, 'Cetcet_song': 80, 'Sylcan_call': 75})
defaultdict(<class 'int'>, {'Parate_song': 90, 'Serser_song': 75, 'Erirub_song': 105, 'Sylcan_song': 90, 'Galcri_call': 90, 'Alaarv_song': 80, 'Turphi_song': 105, 'Turmer_call': 95, 'Cetcet_song': 80, 'Sylcan_call': 75})
defaultdict(<class 'int'>, {'Parate_song': 180, 'Serser_song': 150, 'Erirub_song': 210, 'Sylcan_song': 180, 'Galcri_call': 180, 'Alaarv_song': 160, 'Turphi_song': 210, 'Turmer_call': 190, 'Cetcet_song': 160, 'Sylcan_call': 150})
defaultdict(<class 'int'>, {'Parate_song': 198, 'Serser_song': 165, 'Erirub_song': 231, 'Sylcan_song': 198, 'G

In [22]:
dict_arithmetic(inter_stats, orig_stats, '-')

defaultdict(int,
            {'Sylcan_song': 72,
             'Sylcan_call': 60,
             'Cetcet_song': 64,
             'Erirub_song': 84,
             'Alaarv_song': 64,
             'Turmer_call': 76,
             'Galcri_call': 72,
             'Turphi_song': 84,
             'Parate_song': 72,
             'Serser_song': 60})

In [24]:
import plotly.express as px

long_df = px.data.medals_long()

fig = px.bar(long_df, x="nation", y="count", color="medal", title="Long-Form Input")
fig.show()
long_df.head()

Unnamed: 0,nation,medal,count
0,South Korea,gold,24
1,China,gold,10
2,Canada,gold,9
3,South Korea,silver,13
4,China,silver,15


In [37]:
df = []

diffs = {
    "interpolation": dict_arithmetic(inter_stats, orig_stats, '-'),
    "noise": dict_arithmetic(noise_stats, orig_stats, '-'),
    "original": orig_stats
}

for aug_type, stats in diffs.items():
    # print(stats)
    for classname, count in stats.items():
        df.append([classname, aug_type, count])
    # print(len(df))

df = pd.DataFrame.from_records(df, columns=['Class Name', 'Augmentation Type', 'Samples Count'])
# fig = px.bar(df, x="Class Name", y="Samples Count", color="Augmentation Type", title="Class Distribution by Augmentation methods")
fig = px.bar(df, y="Class Name", x="Samples Count", color="Augmentation Type", title="Class Distribution by Augmentation methods", orientation='h', height=800)
fig.show()
df.head()

Unnamed: 0,Class Name,Augmentation Type,Samples Count
0,Sylcan_song,interpolation,72
1,Sylcan_call,interpolation,60
2,Cetcet_song,interpolation,64
3,Erirub_song,interpolation,84
4,Alaarv_song,interpolation,64
