# Food format for denoising

In [13]:
def format(input_path, out_path):
    """Write an ``image_name class_name`` listing for every row of a result TSV.

    Args:
        input_path: tab-separated file with a header row that contains an
            ``image_name`` column.
        out_path: destination file; receives one space-separated
            ``image_name class_name`` row per input row, without a header
            and without the index.

    The class name is the part of ``image_name`` before its first ``/``.
    Note that this function shadows the builtin ``format``.
    """
    import pandas as pd

    table = pd.read_csv(input_path, sep='\t')
    names = table['image_name']
    # partition('/')[0] yields the same prefix as split('/')[0].
    prefixes = names.apply(lambda name: name.partition('/')[0])
    listing = pd.concat([names, prefixes.rename('class_name')], axis=1)
    listing.to_csv(out_path, sep=' ', header=None, index=False)
    
# Convert the food KNN result file for every k that appears in the file names.
# The paths differ only in k, so they are built in a loop instead of being
# copy-pasted; each iteration issues the same call as the original line for that k.
for k in (250, 150, 50, 10):
    format('../../trained_models/knn-food-result/food101n_k%d_50Kremoved.tsv' % k,
           '../../trained_models/food101n_20e/knn%d_denoised_kv.txt' % k)

# Clothing format for denoising

In [15]:
def format(input_path, out_path):
    """Write an ``image_name class_name`` listing for every row of a result TSV.

    Args:
        input_path: tab-separated file with a header row that contains an
            ``image_name`` column.
        out_path: destination file; receives one space-separated
            ``image_name class_name`` row per input row, without a header
            and without the index.

    The class name is the part of ``image_name`` before its first ``/``.

    NOTE(review): an earlier draft started by calling
    ``pd.read_csv('../datasets/clothing1m/')``. That path ends in ``/`` and the
    result was never used, so the call could only fail; it has been removed.
    It suggests extra clothing metadata was meant to be loaded here. Confirm
    that the path prefix really is the class name for the clothing image
    paths before using this output.

    Defining this function replaces any earlier ``format`` in the kernel and
    shadows the builtin of the same name.
    """
    import pandas as pd

    df = pd.read_csv(input_path, sep='\t')
    # Build the two output columns without mutating the loaded frame.
    listing = pd.DataFrame({
        'image_name': df['image_name'],
        'class_name': df['image_name'].apply(lambda x: x.split('/')[0]),
    })
    listing.to_csv(out_path, index=False, header=False, sep=' ')
    
# format('../../trained_models/knn-food-result/food101n_k250_50Kremoved.tsv', '../../trained_models/food101n_20e/knn250_denoised_kv.txt')
# format('../../trained_models/knn-food-result/food101n_k150_50Kremoved.tsv', '../../trained_models/food101n_20e/knn150_denoised_kv.txt')
# format('../../trained_models/knn-food-result/food101n_k50_50Kremoved.tsv', '../../trained_models/food101n_20e/knn50_denoised_kv.txt')
# format('../../trained_models/knn-food-result/food101n_k10_50Kremoved.tsv', '../../trained_models/food101n_20e/knn10_denoised_kv.txt')

# Food and Clothing KNN baseline: precision/recall evaluation

In [69]:
# evaluate on validation set
import pandas as pd

def evaluate(images, vlabels, clean_images, classes, report_additional=False):
    """Score a kept-as-clean image list against verification labels and print metrics.

    An image is predicted clean (1) when its name is in ``clean_images`` and
    noisy (0) otherwise. These predictions are compared with ``vlabels``.

    Args:
        images: sequence of image names, one per validation sample.
        vlabels: sequence of binary labels aligned with ``images``. The
            metrics use ``pos_label=0`` for "noise" and ``pos_label=1`` for
            "clean".
        clean_images: iterable of image names that were kept as clean.
        classes: sequence of integer class ids aligned with ``images``. Ids
            ``0 .. max(classes)`` are each scored separately.
        report_additional: when True, also print micro and weighted F1 and
            the P/R/F1 of the clean class. Defaults to False.

    Returns:
        None. All results are printed.
    """
    import numpy as np
    from sklearn.metrics import f1_score, precision_recall_fscore_support

    set_clean = set(clean_images)

    # Iterate over the values themselves rather than ``images[i]``, so a
    # pandas Series with a non-default index is still read positionally.
    preds = np.array([1 if img in set_clean else 0 for img in images])

    print('done preds')
    targets = np.asarray(vlabels)
    # Element-wise ``classes == c`` below needs an array; a plain list would
    # compare as a whole and match nothing.
    classes = np.asarray(classes)
    print(np.sum(targets), len(targets), np.sum(preds), len(preds))

    C = np.max(classes) + 1
    per_class_accuracies = np.zeros((C,))
    for c in range(C):
        ind = np.where(classes == c)[0]
        # Micro-averaged F1 over the labels of one class; the result is
        # reported below as that class's accuracy.
        per_class_accuracies[c] = f1_score(targets[ind], preds[ind], average='micro')

    print('P/R/F1 (noise)', precision_recall_fscore_support(targets, preds, pos_label=0, average='binary'))
    print('f1_metrics (macro/unweighted mean)', f1_score(targets, preds, average='macro'))
    acc_classes = np.mean(per_class_accuracies)
    print('avg accuracy over classes', acc_classes, 'AvgErrorRate', 1 - acc_classes)

    if report_additional:
        print()
        acc = f1_score(targets, preds, average='micro')
        print('f1_metrics (accuracy/micro)', acc, 'ErrorRate', 1 - acc)
        print('P/R/F1 (clean)', precision_recall_fscore_support(targets, preds, pos_label=1, average='binary'))
        print('f1_metrics (weighted mean of f1)', f1_score(targets, preds, average='weighted'))


def calculate_label_noise_accuracy(clean_file, val_file):
    """Evaluate a kept-as-clean image list against a validation TSV.

    Args:
        clean_file: tab-separated file with a header row and an
            ``image_name`` column listing the images kept as clean.
        val_file: header-less tab-separated file. Column 1 is used as the
            image name, column 2 as the class identifier and column 3 as the
            verification label. Column 0 is read but not used.

    Returns:
        Tuple ``(images, vlabels, clean_images, classes)`` of pandas Series,
        the same values that are passed to ``evaluate``. ``classes`` holds
        the class identifiers remapped to ``0 .. C-1`` in sorted order.
    """
    # Imported locally, like the other helpers in this notebook, so the
    # function does not rely on a module-level ``np`` left over in the kernel.
    import numpy as np

    clean_df = pd.read_csv(clean_file, sep='\t', header=0)
    print('num_clean', len(clean_df))
    val_df = pd.read_csv(val_file, sep='\t', header=None)[[0, 1, 2, 3]]

    # np.unique already returns its values sorted, so no extra sort is needed.
    sclasses = np.unique(val_df[2])
    cmap = dict(zip(sclasses, np.arange(len(sclasses))))
    classes = val_df[2].map(cmap)

    vlabels = val_df[3]
    images = val_df[1]
    clean_images = clean_df['image_name']
    evaluate(images, vlabels, clean_images, classes)
    print('-------------')
    return images, vlabels, clean_images, classes

# Validation files used for the two datasets (paths kept exactly as in the original run).
food_val_file = '/home/krsharma/ClassificationImageText/trained_models/food101n_20e/cleannet_val.tsv'
clothing_val_file = '/home/krsharma/ClassificationImageText/trained_models/clothing1m_10e/cleannet_val.tsv'

# Food, k=250: keep the returned series in the namespace for later inspection.
images, vlabels, clean_images, classes = calculate_label_noise_accuracy(
    '../../trained_models/knn-food-result/food101n_k250_50Kremoved.tsv', food_val_file)

# Other food result files, left disabled:
# calculate_label_noise_accuracy('../../trained_models/knn-food-result/food101n_k150_50Kremoved.tsv', food_val_file)
# calculate_label_noise_accuracy('../../trained_models/knn-food-result/food101n_k50_50Kremoved.tsv', food_val_file)
# calculate_label_noise_accuracy('../../trained_models/knn-food-result/food101n_k10_50Kremoved.tsv', food_val_file)

print('done')


# Clothing, k=250: the return value is not needed here.
calculate_label_noise_accuracy(
    '../../trained_models/knn-clothing-result/clothing1m_k250_100Kremoved.tsv', clothing_val_file)

# Other clothing result files, left disabled:
# calculate_label_noise_accuracy('../../trained_models/knn-clothing-result/clothing1m_k150_100Kremoved.tsv', clothing_val_file)
# calculate_label_noise_accuracy('../../trained_models/knn-clothing-result/clothing1m_k50_100Kremoved.tsv', clothing_val_file)
# calculate_label_noise_accuracy('../../trained_models/knn-clothing-result/clothing1m_k10_100Kremoved.tsv', clothing_val_file)

print('done')

# Scratch record of the food run: these values approximately match the printed
# AvgErrorRate, noise recall, noise F1 and macro F1 of the first evaluation.
0.2663, 0.220, 0.2423, 0.5403
# 260009 is the num_clean printed for the food run; 310000 is presumably the
# size of the full list before removal. TODO confirm.
310000 - 260009

# Same four values for the clothing run (second evaluation).
0.433, 0.109, 0.166, 0.442
# 907465 is the num_clean printed for the clothing run; 1000000 is presumably
# the full list size. TODO confirm. As the final expression, this difference
# appears to be the value displayed as the cell result.
1000000 - 907465


num_clean 260009
done preds
3824 4741 3991 4741
P/R/F1 (noise) (0.2693333333333333, 0.2202835332606325, 0.24235152969406118, None)
f1_metrics (macro/unweighted mean) 0.5403696228124817
avg accuracy over classes 0.7336881730935796 AvgErrorRate 0.2663118269064204
-------------
done
num_clean 907465
done preds
4591 7465 6580 7465
P/R/F1 (noise) (0.3525423728813559, 0.10855949895615867, 0.1660015961691939, None)
f1_metrics (macro/unweighted mean) 0.44268211578220684
avg accuracy over classes 0.5668072318642982 AvgErrorRate 0.43319276813570184
-------------
done


92535