# Script Modèle par CNN CORVEN CHIFFOLEAU 

In [1]:
#inportation des packages et librairies nécessaire à la realisation du projet. 
! pip install timm
! pip install --upgrade wandb
! pip install transformers
! pip install --user albumentations



In [5]:
# Import des packages et librairies nécessaires à la réalisation du projet
import os
import gc
import cv2
import copy
import time
import random
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [6]:
import base64
import pickle

In [7]:
# Librairie qui va servir à charger les images depuis un flux d'octets en mémoire
from io import BytesIO


In [8]:
# Libraries qui vont permettre de manipuler les images
import numpy as np
import pandas as pd

In [9]:
# Libraries Pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

In [10]:
import joblib
from tqdm import tqdm
from collections import defaultdict

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

In [12]:
import timm

In [13]:
# Librairie transformers : modèles de langage pré-entraînés et leurs tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel

In [14]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [15]:
# Package de mise en forme textuelle (couleurs dans la console)
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# Rend les appels CUDA synchrones afin que les messages d'erreur pointent vers la bonne ligne (ralentit l'exécution).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [16]:
import wandb

In [17]:
# Try to log in to Weights & Biases with the API key stored in Kaggle secrets.
# Outside Kaggle (or when the secret is missing) fall back to anonymous mode.
# `anony` records which mode was selected: None = logged in, "must" = anonymous.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    # The key is read from the secret store, never hard-coded in the notebook.
    api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=api_key)
    anony = None
except Exception as login_error:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit still propagate; the reason for the fallback is made visible.
    print(f"W&B login skipped ({type(login_error).__name__}); using anonymous mode.")
    anony = "must"


In [19]:
#Training Configuration

In [18]:
# Configuration of the model considered the most relevant.
# All hyper-parameters read by the rest of the notebook live in this dict.
# NOTE(review): "min_lr" (0.50) is larger than "learning_rate" (0.10). fetch_scheduler
#   passes "min_lr" as `eta_min` to CosineAnnealingLR, so the schedule's floor sits
#   above the starting learning rate — confirm this is intended.
# NOTE(review): fetch_scheduler reads CONFIG['T_0'] for 'CosineAnnealingWarmRestarts',
#   but no "T_0" key is defined here.
CONFIG = {"seed": 2021,
          "epochs": 3,
          "img_size": 256,
          "image_model_name": "tf_efficientnet_b0",
          "text_model_name": "xlm-roberta-base",
          "embedding_size": 256,
          "train_batch_size": 10,
          "valid_batch_size": 10,
          "learning_rate": 0.10,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 0.50,
          "T_max": 50,
          "weight_decay": 0.10,
          "max_length": 32,
          "n_accumulate": 1,
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
          }

# The tokenizer matching the text model is stored alongside the hyper-parameters;
# note that the whole dict (tokenizer included) is later passed to wandb.init(config=...).
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['text_model_name'])

In [21]:
CONFIG

{'seed': 2021,
 'epochs': 3,
 'img_size': 256,
 'image_model_name': 'tf_efficientnet_b0',
 'text_model_name': 'xlm-roberta-base',
 'embedding_size': 256,
 'train_batch_size': 10,
 'valid_batch_size': 10,
 'learning_rate': 0.1,
 'scheduler': 'CosineAnnealingLR',
 'min_lr': 0.5,
 'T_max': 50,
 'weight_decay': 0.1,
 'max_length': 32,
 'n_accumulate': 1,
 'device': device(type='cpu'),
 'tokenizer': PreTrainedTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})}

In [20]:
                                         # définition de la reproductibilité 

In [19]:
# Fix the seed of every random number generator the notebook relies on so that
# results are the same each time the script is run (better reproducibility, at
# the cost of exploring a single random configuration).
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed applied to every generator.

    Side effects:
        Seeds `random`, `numpy.random`, `torch` (CPU and all CUDA devices),
        switches cuDNN to deterministic mode, disables cuDNN benchmarking and
        sets the PYTHONHASHSEED environment variable.
    """
    # Python's own RNG drives `random.shuffle` (train/valid split) and
    # `random.choice` (caption sampling) elsewhere in this notebook, so it has
    # to be seeded as well.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN may benchmark several convolution algorithms the first time it sees
    # a new set of size parameters and keep the fastest one. Because of
    # benchmarking noise and hardware differences, different runs can pick
    # different algorithms, which is a source of non-determinism. Disabling the
    # benchmark makes cuDNN choose an algorithm deterministically, possibly at
    # the cost of reduced performance.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Set a fixed value for the hash seed.
    # NOTE(review): changing PYTHONHASHSEED at runtime is only seen by
    # processes started afterwards, not by the current interpreter — confirm
    # whether that is sufficient here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed everything once, up front, with the value stored in CONFIG.
set_seed(CONFIG['seed'])

In [22]:
                                           # Importation des données 

In [20]:
import json

In [21]:
# wandb.init() opens a run so that the dataset artifact can be fetched.
# The run is synchronised with wandb.ai, which provides visualisation of the
# data and of the results. It is closed again as soon as the download is done.
run = wandb.init(project="Wikipedia", 
                 anonymous="must")
artifact = run.use_artifact('dchanda/Wikipedia/Wiki-data:latest', type='dataset')
artifact_dir = artifact.download()
run.finish()

# Sorted so that the file that ends up in `data` does not depend on the
# arbitrary order returned by os.listdir.
artifact_files = sorted(os.listdir(artifact_dir))
if not artifact_files:
    # Fail here with a clear message instead of a NameError on `data` later on.
    raise FileNotFoundError(f"No file found in downloaded artifact directory: {artifact_dir}")

# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only keep this if the artifact source is trusted.
for file in artifact_files:
    filepath = os.path.join(artifact_dir, file)
    with open(filepath, "rb") as fp:
        data = pickle.load(fp)
    # Keep only the first 500 records to make the experiment small;
    # if several files are present, the last one (in sorted order) wins.
    data = data[:500]

wandb: Currently logged in as: anony-mouse-158971 (use `wandb login --relogin` to force relogin)


wandb: Downloading large artifact Wiki-data:latest, 2168.54MB. 1 files... Done. 0:0:0


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [22]:
# Shuffle the records in place, then take the first 50 as the training split
# and everything from index 450 onwards as the validation split.
random.shuffle(data)

train_data, valid_data = data[:50], data[450:]

print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(valid_data)}")

Number of training samples: 50
Number of validation samples: 50


In [28]:
# initialisation des classes data 

In [42]:
class WikipediaDataset(Dataset):
    """Image/caption pairs served as tensors for the two-tower model.

    Each record of `data` is indexed with the keys "b64_bytes" (base64-encoded
    image), "caption_title_and_reference_description" (a sequence of candidate
    captions) and "target".
    """

    def __init__(self, data, tokenizer, max_length, transforms=None):
        """Store the records, the tokenizer, the padded length and the optional image pipeline."""
        self.data, self.tokenizer = data, tokenizer
        self.max_len = max_length
        self.transforms = transforms

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return a dict with the keys 'ids', 'mask', 'image' and 'target' for one record."""
        # Decode the base64 payload and load it as an RGB pixel array.
        raw_stream = BytesIO(base64.b64decode(self.data[index]["b64_bytes"]))
        pixels = np.asarray(Image.open(raw_stream).convert("RGB"))

        # One caption is drawn at random among the candidates of this record.
        candidates = self.data[index]["caption_title_and_reference_description"]
        text = random.choice(candidates).replace("[SEP]", "</s>")  # sep token for xlm-roberta

        # Truncate / pad the caption to exactly `max_len` tokens.
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        label = self.data[index]['target']
        token_ids, attention = encoded['input_ids'], encoded['attention_mask']

        # The image pipeline is optional; without it the raw array is returned.
        if self.transforms:
            pixels = self.transforms(image=pixels)["image"]

        sample = {
            'ids': torch.tensor(token_ids, dtype=torch.long),
            'mask': torch.tensor(attention, dtype=torch.long),
            'image': pixels,
            'target': torch.tensor(label, dtype=torch.long),
        }
        return sample

In [43]:
def _build_pipeline(augmentations=()):
    """Resize -> optional augmentations -> channel normalisation -> tensor conversion."""
    side = CONFIG['img_size']
    steps = [A.Resize(side, side)]
    steps.extend(augmentations)
    steps.append(
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        )
    )
    steps.append(ToTensorV2())
    return A.Compose(steps, p=1.)


# Training adds a random horizontal flip; validation applies no augmentation.
data_transforms = {
    "train": _build_pipeline([A.HorizontalFlip(p=0.5)]),
    "valid": _build_pipeline(),
}

In [44]:
# Training split: augmented pipeline, shuffled, incomplete last batch dropped.
train_dataset = WikipediaDataset(
    data=train_data,
    tokenizer=CONFIG["tokenizer"],
    max_length=CONFIG["max_length"],
    transforms=data_transforms["train"],
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    drop_last=True,
)

# Validation split: deterministic pipeline, original order, every sample kept.
valid_dataset = WikipediaDataset(
    data=valid_data,
    tokenizer=CONFIG["tokenizer"],
    max_length=CONFIG["max_length"],
    transforms=data_transforms["valid"],
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=CONFIG['valid_batch_size'],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [57]:
valid_dataset[1]

{'ids': tensor([     0,  13177,   2647,     23,     56,   2179,      2, 151839,   4117,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'image': tensor([[[1.4269, 1.4612, 1.5125,  ..., 1.6153, 1.5297, 1.4783],
          [1.4612, 1.5125, 1.5810,  ..., 1.5982, 1.5639, 1.5125],
          [1.5125, 1.5125, 1.5297,  ..., 1.6495, 1.5982, 1.5468],
          ...,
          [1.4269, 1.2728, 1.2214,  ..., 1.2557, 1.2557, 1.2214],
          [1.4098, 1.2899, 1.2214,  ..., 1.2385, 1.2214, 1.1872],
          [1.0844, 0.9474, 0.8789,  ..., 1.2043, 1.1872, 1.1358]],
 
         [[1.3256, 1.3431, 1.3957,  ..., 1.5707, 1.5357, 1.4832],
          [1.4482, 1.5007, 1.5707,  ..., 1.6057, 1.5707, 1.5532],
          [1.5882, 1

In [49]:
train_dataset[2]

{'ids': tensor([    0, 12982, 10821,  1596,     2, 95659, 12190,    13,     9,  7251,
            86,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'image': tensor([[[ 0.5193,  0.5536,  0.5707,  ...,  0.7591,  0.7419,  0.7248],
          [ 0.5364,  0.5707,  0.5878,  ...,  0.7933,  0.7762,  0.7591],
          [ 0.5536,  0.5878,  0.6049,  ...,  0.8104,  0.7933,  0.7762],
          ...,
          [-1.3130, -1.2788, -1.2788,  ..., -1.0390, -1.0390, -1.0390],
          [-1.3130, -1.2788, -1.2788,  ..., -1.0562, -1.0562, -1.0562],
          [-1.1932, -1.1932, -1.1760,  ..., -1.0733, -1.0390, -1.1247]],
 
         [[ 1.2556,  1.2906,  1.3081,  ...,  1.5007,  1.4832,  1.4657],
          [ 1.2731,  1.3081,  1.3256,  ...,  1.5357,  1.5182,  1.5007],
    

In [32]:
                                           #Creation du modèle 

In [29]:
class WikipediaModel(nn.Module):
    """Two-tower model projecting images and captions into one embedding space.

    An image backbone (created with timm) and a text backbone (loaded with
    AutoModel) are both frozen; only the two linear projection heads
    (`image_fc`, `text_fc`) are trainable.

    Args:
        image_model: timm model name passed to `timm.create_model`.
        text_model: checkpoint name passed to `AutoModel.from_pretrained`.
        embedding_size: output width of both projection heads.
    """

    # Width used for the text projection when the text backbone exposes no
    # `config.hidden_size` (value previously hard-coded for xlm-roberta-base).
    _DEFAULT_TEXT_HIDDEN = 768

    def __init__(self, image_model, text_model, embedding_size):
        super().__init__()
        self.image_model = timm.create_model(image_model, pretrained=True)
        # Must be read before the classifier is removed below.
        self.n_features = self._image_feature_dim(self.image_model)
        self.image_model.reset_classifier(0)
        self.image_drop = nn.Dropout(p=0.2)
        self.image_fc = nn.Linear(self.n_features, embedding_size)

        self.text_model = AutoModel.from_pretrained(text_model)
        self.text_drop = nn.Dropout(p=0.2)
        # Follow the encoder's own hidden size instead of assuming 768, so
        # other text checkpoints work without editing this class.
        self.text_fc = nn.Linear(self._text_hidden_dim(self.text_model), embedding_size)

        self.freeze_backbone()

    @staticmethod
    def _image_feature_dim(backbone):
        """Width of the features fed to the backbone's classification head."""
        head = getattr(backbone, "classifier", None)
        if hasattr(head, "in_features"):
            return head.in_features
        # Backbones whose head is not called `classifier`: use the width the
        # model advertises through `num_features`.
        return backbone.num_features

    @classmethod
    def _text_hidden_dim(cls, encoder):
        """Hidden size advertised by the text encoder's config, else the 768 default."""
        config = getattr(encoder, "config", None)
        return getattr(config, "hidden_size", cls._DEFAULT_TEXT_HIDDEN)

    def forward(self, images, ids, mask):
        """Return `(image_embeddings, text_embeddings)` for one batch.

        Args:
            images: batch of images fed to the image backbone.
            ids: token ids fed to the text backbone as `input_ids`.
            mask: attention mask fed to the text backbone.
        """
        image_features = self.image_model(images)
        image_embeddings = self.image_fc(self.image_drop(image_features))

        out = self.text_model(input_ids=ids, attention_mask=mask,
                              output_hidden_states=False)
        # Element 1 of the encoder output is used as the sentence representation
        # (presumably the pooled output of the text model — TODO confirm).
        out = self.text_drop(out[1])
        text_embeddings = self.text_fc(out)

        return image_embeddings, text_embeddings

    def freeze_backbone(self):
        """Freeze both backbones and leave only the projection heads trainable."""
        # NOTE(review): train_one_epoch calls model.train(), which also puts the
        # frozen backbones in training mode — confirm that is intended.
        for backbone in (self.image_model, self.text_model):
            for params in backbone.parameters():
                params.requires_grad = False

        # Only finetune the final (projection) layers.
        for head in (self.image_fc, self.text_fc):
            head.weight.requires_grad = True
            head.bias.requires_grad = True
    

# Build the two-tower model from CONFIG and move it to the configured device.
model = WikipediaModel(
    image_model=CONFIG['image_model_name'],
    text_model=CONFIG['text_model_name'],
    embedding_size=CONFIG['embedding_size'],
)
model = model.to(CONFIG['device'])

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
                                         #Fonction de pertes 

In [30]:
def criterion(outputs1, outputs2, targets):
    """Cosine embedding loss between two batches of embeddings.

    Uses the library defaults for margin and reduction, exactly as a freshly
    constructed `nn.CosineEmbeddingLoss()` does.
    """
    return nn.functional.cosine_embedding_loss(outputs1, outputs2, targets)

In [36]:
                                       #Fonction d'entraînement 

In [31]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` and return the mean loss.

    Gradients are accumulated over CONFIG['n_accumulate'] batches before each
    optimizer step; the scheduler (if any) is stepped once per optimizer step.

    Args:
        model: module called as `model(images, ids, mask)`.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler, or None.
        dataloader: iterable of batches with keys 'ids', 'mask', 'image', 'target'.
        device: device the batch tensors are moved to.
        epoch: epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean of the per-batch loss, or NaN if the dataloader
        yields no batch.
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns NaN instead of raising.
    epoch_loss = float("nan")
    pending_grads = False

    def _apply_update():
        """One optimizer step, gradient reset and scheduler step."""
        optimizer.step()
        # zero the parameter gradients
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        images = data['image'].to(device, dtype=torch.float)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        image_outputs, text_outputs = model(images, ids, mask)
        loss = criterion(image_outputs, text_outputs, targets)
        # Only the gradient is scaled for accumulation; `loss` itself stays
        # unscaled so the reported value does not depend on n_accumulate.
        (loss / n_accumulate).backward()
        pending_grads = True

        if (step + 1) % n_accumulate == 0:
            _apply_update()
            pending_grads = False

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])

    # Apply gradients left over when the number of batches is not a multiple
    # of n_accumulate, so they neither get lost nor leak into the next epoch.
    if pending_grads:
        _apply_update()

    gc.collect()

    return epoch_loss

In [38]:
                                         #Fonction de validation 

In [32]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate `model` on `dataloader` without gradients and return the mean loss.

    Args:
        model: module called as `model(images, ids, mask)`.
        dataloader: iterable of batches with keys 'ids', 'mask', 'image', 'target'.
        device: device the batch tensors are moved to.
        epoch: epoch number, only shown in the progress bar.
        optimizer: optional optimizer whose current learning rate is shown in
            the progress bar. When omitted, a module-level `optimizer` is used
            if one exists (the behaviour existing callers rely on); otherwise
            the learning rate is simply not displayed.

    Returns:
        Sample-weighted mean of the per-batch loss, or NaN if the dataloader
        yields no batch.
    """
    model.eval()

    # The function used to read a global `optimizer` unconditionally, which
    # raised NameError whenever that global did not exist.
    if optimizer is None:
        optimizer = globals().get("optimizer")

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader returns NaN instead of raising.
    epoch_loss = float("nan")

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        images = data['image'].to(device, dtype=torch.float)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        image_outputs, text_outputs = model(images, ids, mask)
        loss = criterion(image_outputs, text_outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix["LR"] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    return epoch_loss

In [40]:
                                            #Entrainement 

In [33]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for `num_epochs`, keep the best weights and return them with the history.

    Relies on the module-level `train_loader` and `valid_loader`, and on an
    active W&B run for logging.

    Args:
        model: model to train; it is reloaded with its best weights at the end.
        optimizer: optimizer passed to `train_one_epoch`.
        scheduler: learning-rate scheduler (or None) passed to `train_one_epoch`.
        device: device used for both the training and the validation pass.
        num_epochs: number of epochs to run.

    Returns:
        `(model, history)` where `history` maps 'Train Loss' and 'Valid Loss'
        to the list of per-epoch values.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # The `device` argument is honoured here (it used to be ignored in
        # favour of CONFIG['device']).
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log both metrics in a single call so they share the same W&B step.
        wandb.log({"Train Loss": train_epoch_loss,
                   "Valid Loss": val_epoch_loss})

        # deep copy the model whenever the validation loss does not get worse
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            # Use the active run rather than a notebook-global `run` variable.
            if wandb.run is not None:
                wandb.run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_loss, epoch)
            # Save a model file in the current directory
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [34]:
def fetch_scheduler(optimizer):
    """Build the learning-rate scheduler selected by CONFIG['scheduler'].

    Args:
        optimizer: optimizer the scheduler is attached to.

    Returns:
        A `CosineAnnealingLR` or `CosineAnnealingWarmRestarts` instance, or
        None when CONFIG['scheduler'] is None.

    Raises:
        ValueError: if CONFIG['scheduler'] names an unsupported scheduler
            (this used to surface as an UnboundLocalError).
        KeyError: if a hyper-parameter required by the chosen scheduler
            ('T_max', 'T_0' or 'min_lr') is missing from CONFIG.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    raise ValueError(
        f"Unsupported scheduler {name!r}; expected 'CosineAnnealingLR', "
        "'CosineAnnealingWarmRestarts' or None."
    )

In [35]:
# Adam over all model parameters, with the learning rate and weight decay from
# CONFIG; the scheduler is then chosen by fetch_scheduler.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)
scheduler = fetch_scheduler(optimizer)

In [36]:
# Open the W&B run that records this training job; the whole CONFIG dict is
# attached to it as the run configuration.
run = wandb.init(
    project='Wikipedia',
    job_type='Train',
    config=CONFIG,
    anonymous='must',
)

In [37]:
# Re-bind the global name `tqdm` to the `tqdm.auto` variant. The training and
# validation functions defined earlier look `tqdm` up at call time, so they use
# this variant from now on — this cell must therefore run before training.
from tqdm.auto import tqdm
# NOTE(review): no pandas progress call is visible in this notebook — confirm
# whether this registration is still needed.
tqdm.pandas()

In [None]:

# Launch training; `model` is re-bound to the model reloaded with its best
# weights and `history` receives the per-epoch losses.
model, history = run_training(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)



In [None]:
# Close the W&B training run opened before training.
run.finish()