In [49]:
from os.path import dirname, abspath
import sys
import torch
import clip
from PIL import Image
import os
from config import CFG
from fast_bert.prediction import BertClassificationPredictor
import pandas as pd
import numpy as np
from Entrainement.CamemBERT.camemBERT import text_prepare
from tqdm import tqdm
import warnings
warnings. simplefilter(action='ignore', category=Warning)
import argparse
import glob
from datetime import datetime

def simple_CLIP(image_path, labels, model, preprocess):
    # inputs : image_path, labels (liste)
    text = clip.tokenize(labels).to(CFG.device)
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(CFG.device)
    with torch.no_grad():
        logits_per_image, _ = model(image, text)
        prediction = logits_per_image.softmax(dim=-1).cpu().numpy()
    max_index = np.argmax(prediction)
    return (labels[max_index], prediction[0][max_index])

def get_dist(description, version_BERT):
    DATA_PATH = os.path.join(CFG.path_bert,'Data/')
    MODEL_PATH = os.path.join(CFG.path_models,'CamemBERT',  'CamemBERT_v{}'.format(version_BERT))
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=DATA_PATH,  # location for labels.csv file
        multi_label=True,
        model_type='camembert-base',
        do_lower_case=False,
        device=None)
    prediction = predictor.predict(text_prepare(description))
    return prediction[0][0], prediction[0][1]

def get_dist_batch(texts,  predictor):    
    texts = [text_prepare(text) for text in texts]
    prediction = []
    for text in texts:
        prediction.append(predictor.predict(text))
    preds = [p[0][0] for p in prediction]
    scores = [p[0][1] for p in prediction]
    return preds, scores

def get_clip(image, df_label, model, preprocess):
    label_clip, score_clip = simple_CLIP(os.path.join(CFG.path_data,'Predictions_classification', image), df_label.en, model, preprocess)
    return label_clip, score_clip

def write_csv(df, df_label, threshold_clip, threshold_dist, predictor, model, preprocess):
    list_label_dist, list_score_dist = get_dist_batch(df.description.tolist(), predictor)
    result = []
    for i in range(len(df)):
        label_clip, score_clip = get_clip(df.image.iloc[i], df_label, model, preprocess)
        if list_label_dist[i].lower() == df_label[df_label['en']==label_clip].fr.values[0].lower():
            result.append(df_label[df_label['en']==label_clip].fr.values[0].lower())
        else:
            if score_clip > threshold_clip and list_score_dist[i] < threshold_dist :
                result.append(df_label[df_label['en']==label_clip].fr.values[0].lower())
            elif score_clip < threshold_clip and list_score_dist[i] > threshold_dist :
                result.append(list_label_dist[i])
            else:
                # Vérification humaine (API)
                result.append('Need Human Verif')
    df['resultats'] = result
    return df

#------------------------------------------------------------------------------------------------------#
def main_performance():
    model, preprocess = clip.load("ViT-B/32", device=CFG.device)
    version = len(os.listdir(os.path.join(CFG.path_models,'CamemBERT')))
    DATA_PATH = os.path.join(CFG.path_bert,'Data/')
    MODEL_PATH = os.path.join(CFG.path_models,'CamemBERT',  'CamemBERT_v{}'.format(version))
    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=DATA_PATH,  # location for labels.csv file
        multi_label=True,
        model_type='camembert-base',
        do_lower_case=False,
        device=None)
    csv = glob.glob(os.path.join(CFG.path_data,'Predictions_classification','*.csv'))
    if len(csv)>1:
        print('Trop de csv')
        exit()
    elif len(csv)==0:
        print('Pas de csv')
        exit()
    else:
        csv = csv[0]
    df = pd.read_csv(csv, index_col=False)
    df.dropna(subset=['description'], inplace=True)
    # df = dict({'image' : 'image.jpg',
    #            'description : 'courte description'})
    df_label = pd.read_csv(CFG.path_labels)
    # df_label = dict({'niv1' : 'Label1, Label2, ...'
    #                  'niv2' : 'Label3, Label4, ...' })
    threshold_clip_list = np.linspace(0,1,11)
    threshold_dist_list = np.linspace(0,1,11)
        
    score=[]
    t_c=[]
    t_d=[]
    for threshold_clip in threshold_clip_list:
        for threshold_dist in tqdm(threshold_dist_list):
            df = write_csv(df, df_label, threshold_clip, threshold_dist, predictor, model, preprocess)
            vrai = 0
            tot = len(df)
            for i in range(tot):
                if df.label.iloc[i]== df.resultats.iloc[i]:
                    vrai +=1
            score.append(vrai/tot)
            t_c.append(threshold_clip)
            t_d.append(threshold_dist)
    df_perf = pd.DataFrame(list(zip(t_c,t_d,score)), columns=['treshold_CLIP', 'treshold_camemBERT', 'score'])
    df_perf.to_csv(os.path.join(CFG.path, 'Resultats', 'Classification','performance_BERT_v{}.csv'.format(version)))

In [50]:
main_performance()

  0%|          | 0/11 [00:38<?, ?it/s]
  0%|          | 0/11 [00:38<?, ?it/s]


KeyboardInterrupt: 

In [43]:
version = len(os.listdir(os.path.join(CFG.path_models,'CamemBERT')))
csv = glob.glob(os.path.join(CFG.path_data,'Predictions_classification','*.csv'))
images = glob.glob(os.path.join(CFG.path_data,'Predictions_classification','*.jpg'))
if len(csv)>1:
    print('Trop de csv')
    exit()
elif len(csv)==0:
    print('Pas de csv')
    exit()
else:
    csv = csv[0]
df = pd.read_csv(csv, index_col=False)
df_label = pd.read_csv(CFG.path_labels)
labels = df_label.en.tolist()

In [18]:
labels.index(df_label[df_label.fr=='Saucisses'].en.values[0])

IndexError: index 0 is out of bounds for axis 0 with size 0

In [20]:
for i in range(len(df_label)):
    if df_label.fr.iloc[i].endswith(' '):
        df_label.fr.iloc[i] = df_label.fr.iloc[i][:-1]
    if df_label.en.iloc[i].endswith(' '):
        df_label.en.iloc[i] = df_label.en.iloc[i][:-1]

In [21]:
df_label.to_csv(CFG.path_labels,index=False)

In [24]:
df.image.iloc[0]

'image_157.jpg'

In [29]:
l = [label +'_clip' for label in labels]+[label +'_bert' for label in labels]

In [38]:
res = np.zeros((len(df), len(l)))

In [39]:
len(res[0])

158

In [44]:
df[[label +'_clip' for label in labels]+[label +'_bert' for label in labels]] = res

In [48]:
df.shape

(894, 162)

In [45]:
[1.3489e-02 1.3077e-02 2.1136e-04 1.2040e-04 1.6201e-04 9.4700e-04
 2.4948e-03 7.7724e-05 1.5223e-04 1.5795e-05 1.2283e-02 3.1888e-05
 7.0810e-05 3.9697e-05 5.1498e-04 1.4359e-02 1.6687e-01 1.1384e-05
 1.8433e-02 2.0046e-03 6.7532e-05 2.0170e-04 2.0695e-03 3.5004e-02
 4.4220e-02 1.1563e-05 6.5565e-07 7.5698e-06 4.1604e-05 6.5446e-05
 8.2302e-04 2.0266e-06 4.9114e-04 2.2292e-05 6.7949e-06 3.8505e-05
 1.2550e-03 2.1470e-04 1.6365e-03 4.2458e-03 1.7977e-03 7.0810e-05
 2.0266e-05 8.9874e-03 7.2122e-06 1.0794e-04 3.5584e-05 5.0962e-05
 2.2292e-05 2.3007e-05 1.5795e-05 5.6863e-05 8.3590e-04 4.2458e-03
 1.3864e-04 2.8610e-05 1.8477e-05 1.5497e-06 1.0567e-03 3.2902e-05
 3.3569e-03 6.1512e-05 2.1136e-04 5.7757e-05 1.4219e-03 2.3580e-04
 2.4185e-03 4.1122e-03 4.0283e-02 1.6272e-05 4.5717e-05 1.9519e-01
 1.4603e-05 3.9429e-01 1.3361e-03 3.0279e-04 3.1412e-05 9.0897e-05
 3.2539e-03]

Unnamed: 0.1,Unnamed: 0,image,description,label,Food accessories_clip,House cleaning accessories_clip,Aerosols for cleaning_clip,Pastry aids_clip,Alcohols_clip,Baby food_clip,...,Soda_bert,Soups_bert,Meat Succedanes_bert,Kits and brushes_bert,Chopped meats_bert,Prepared poultry_bert,Yoghurt and white cheese_bert,Wine_bert,Unprepared poultry_bert,Flour Starch_bert
0,0,image_157.jpg,2 steaks haches 12 mg bio biol charal st hache...,Viandes hachees,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,image_3631.jpg,veloute pyrenees pomme nature cazaubon reg caz...,Desserts compotes,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,image_2212.jpg,sauce soja sucree suziwan sauce soja sucree137...,Sauces tartinables,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,image_1220.jpg,biscuits chocolat delacre cookies choco 150g,biscuiterie sucree et salee cereales ptit dej,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,image_4823.jpg,oralb gencives purify extra douce brosse dents...,trousses brosses,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889,889,image_3689.jpg,biscuits nappes chocolat noir bio bjorg biscui...,biscuiterie sucree et salee cereales ptit dej,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
890,890,image_4270.jpg,lentilles vertes bio 500g bio tartines potager...,Pates Riz Ble Lentilles Semoules non prep,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891,891,image_4640.jpg,kit liquide vaisselle ecocert 1l atelier diy p...,Produits lave vaisselle,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
892,892,image_819.jpg,beurre cacahuetes bio sans sucres ajoutes bio ...,Sauces tartinables,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
