In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import *
from pathlib import Path
import numpy as np

#### Choose device

In [8]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to this process (0 on a CPU-only machine).
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when no CUDA device exists, so only query it on a GPU run;
# otherwise report the CPU fallback so the cell still runs from a fresh kernel.
print(torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")

NVIDIA GeForce RTX 2080 Ti


#### CSV to DF

In [9]:
# Field separator of the input files (tab-separated values).
delimiter = '\t'
# Directory containing train.csv, dev.csv and test.csv; replace the placeholder
# with the real location before running the notebook.
csv_path = Path("path_to_CSVs")

In [10]:
# Read the training split, then show its dimensions and a preview of the first rows.
train_csv = csv_path / 'train.csv'
df_train = pd.read_csv(train_csv, delimiter=delimiter)
print(df_train.shape)
df_train.head()

(11035, 203)


Unnamed: 0,HAL_ID,title,abstract,chim.anal,chim.cata,chim.chem,chim.cris,chim.geni,chim.inor,chim.mate,...,spi.other,spi.plasma,spi.signal,spi.tron,stat.ap,stat.co,stat.me,stat.ml,stat.ot,stat.th
0,hal-01397119,Fluidification du trafic Transilien : approche...,Avec plus d'un million de voyageurs quotidiens...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hal-03203665,Choix et attentes des enseignants de mathémati...,L'objectif de ce séminaire est de présenter le...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,inria-00386758,Apport de l'ACP probabiliste pour la gestion d...,"Dans cette présentation, nous nous intéressons...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,inria-00386607,Estimation Monte Carlo dans les processus ponc...,Nous proposons une approche de modélisation pa...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,inria-00494789,Convergence de la constante de Cheeger de grap...,Nous nous intéressons dans ce travail aux ense...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Read the validation split with the same configured `delimiter` as the training
# split, so a change in the configuration cell applies to every file.
df_valid = pd.read_csv(csv_path/'dev.csv', delimiter=delimiter)
print(df_valid.shape)
df_valid.head()

(2366, 203)


Unnamed: 0,HAL_ID,title,abstract,chim.anal,chim.cata,chim.chem,chim.cris,chim.geni,chim.inor,chim.mate,...,spi.other,spi.plasma,spi.signal,spi.tron,stat.ap,stat.co,stat.me,stat.ml,stat.ot,stat.th
0,halshs-00183267,La place des principes dans la physique mathém...,Nous nous proposons de tenter de comprendre co...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hal-00770557,SONDY : une plateforme open-source d'analyse e...,Ce papier décrit la plateforme SONDY qui perme...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,halshs-00856441,Démarches d'investigation : exemples avec le b...,Le groupe TREMA-1 s'intéresse aux usages de re...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,inria-00386689,Théorie du risque en santé publique: Applicati...,Nous introduisons une nouvelle approche pour t...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,hal-00659007,Connaissances mathématiques et manuels d'ensei...,Ce papier présente un travail d'analyse des co...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Read the test split with the same configured `delimiter` as the training
# split, so a change in the configuration cell applies to every file.
df_test = pd.read_csv(csv_path/'test.csv', delimiter=delimiter)
print(df_test.shape)
df_test.head()

(2370, 203)


Unnamed: 0,HAL_ID,title,abstract,chim.anal,chim.cata,chim.chem,chim.cris,chim.geni,chim.inor,chim.mate,...,spi.other,spi.plasma,spi.signal,spi.tron,stat.ap,stat.co,stat.me,stat.ml,stat.ot,stat.th
0,inria-00494835,Modélisation des dépassements de seuils pour u...,La modélisation des événements extrêmes est de...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,inria-00386743,Estimation de la densité spectrale d'un champ ...,"Dans ce papier, nous étudions l'estimation de ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,inria-00494779,Analyse de données avec R - Complémentarité de...,L'objectif de cet exposé est de présenter les ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,hal-02017267,Changement de regard sur les figures : une étu...,Cette recherche entre en résonnance avec les t...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hal-00530279,Analyse de sensibilité pour l'étude des paramè...,Le but du papier est d'étudier les paramètres ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Get list of labels from DF columns

In [7]:
# Every column from position 3 onward is treated as a label column.
# NOTE(review): assumes the three leading columns are metadata (HAL_ID, title,
# abstract in the preview above) — re-check the offset if the CSV layout changes.
cols = df_train.columns
label_cols = cols[3:].tolist()
num_labels = len(label_cols)
print('Label columns: ', label_cols)
print(num_labels)

Label columns:  ['chim.anal', 'chim.cata', 'chim.chem', 'chim.cris', 'chim.geni', 'chim.inor', 'chim.mate', 'chim.theo', 'info.comp', 'info.eiah', 'info.info-ai', 'info.info-ao', 'info.info-ar', 'info.info-au', 'info.info-bi', 'info.info-bt', 'info.info-cc', 'info.info-ce', 'info.info-cg', 'info.info-cl', 'info.info-cr', 'info.info-cv', 'info.info-cy', 'info.info-db', 'info.info-dc', 'info.info-dl', 'info.info-dm', 'info.info-ds', 'info.info-es', 'info.info-et', 'info.info-fl', 'info.info-gl', 'info.info-gr', 'info.info-gt', 'info.info-hc', 'info.info-ia', 'info.info-im', 'info.info-ir', 'info.info-it', 'info.info-iu', 'info.info-lg', 'info.info-lo', 'info.info-ma', 'info.info-mc', 'info.info-mm', 'info.info-mo', 'info.info-ms', 'info.info-na', 'info.info-ne', 'info.info-ni', 'info.info-oh', 'info.info-os', 'info.info-pf', 'info.info-pl', 'info.info-rb', 'info.info-ro', 'info.info-sc', 'info.info-sd', 'info.info-se', 'info.info-si', 'info.info-sy', 'info.info-ti', 'info.info-ts', 'info

In [8]:
#shuffle rows
df_train = df_train.sample(frac=1).reset_index(drop=True) 
df_valid = df_valid.sample(frac=1).reset_index(drop=True) 
df_test = df_test.sample(frac=1).reset_index(drop=True) 

In [9]:
# Give each split a 'one_hot_labels' column whose cell is that row's slice of the
# label columns (one array per row, in `label_cols` order).
for split_df in (df_train, df_valid, df_test):
    split_df['one_hot_labels'] = list(split_df[label_cols].values)
df_train.head()

Unnamed: 0,HAL_ID,title,abstract,chim.anal,chim.cata,chim.chem,chim.cris,chim.geni,chim.inor,chim.mate,...,spi.plasma,spi.signal,spi.tron,stat.ap,stat.co,stat.me,stat.ml,stat.ot,stat.th,one_hot_labels
0,hal-00554461,Utilisation des Techniques Ultrasonores pour l...,L'huile d'Argan et l'huile d'olive sont parmi ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,hal-01464426,Classification d'objets 3D par extraction de s...,"Dans cet article, nous proposons une nouvelle ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2,hal-03362118,Identification rapide des caractéristiques de ...,"Dans cette étude, nous nous intéressons à la v...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,hal-01300013,Potentiel du contrôle ultrasonore d’une plaque...,L'inspection en service de structures internes...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,cea-02562688,Comparaison des codes de calcul de dose Monte-...,Les systèmes de planification de traitement (T...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


#### Convert DF values to lists

In [10]:
# For each split, pull out two parallel Python lists: the per-row label vectors
# and the abstract texts that will be tokenized.
train_labels, train_text = (
    list(df_train['one_hot_labels'].values),
    list(df_train['abstract'].values),
)

valid_labels, valid_text = (
    list(df_valid['one_hot_labels'].values),
    list(df_valid['abstract'].values),
)

test_labels, test_text = (
    list(df_test['one_hot_labels'].values),
    list(df_test['abstract'].values),
)

#### Tokenize texts and get input_ids + attention masks

In [11]:
# Hugging Face checkpoint whose tokenizer is loaded below.
model_name = "flaubert/flaubert_base_cased"
# Maximum sequence length; the tokenization cells pad and truncate to this size.
max_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/flaubert/flaubert_base_cased/resolve/main/config.json from cache at /home/haytame/.cache/huggingface/transformers/0b9ef58865bb61b2a44569c51b24b441c7b6b49ba63c659fc4ad5d61ffa011d6.c03a6cc0529664af7ebd7b4b385954d9cd0071c3d965d9377ab407e2eaa06918
Model config FlaubertConfig {
  "_name_or_path": "flaubert/flaubert_base_cased",
  "amp": 1,
  "architectures": [
    "FlaubertWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "bptt": 512,
  "causal": false,
  "clip_grad_norm": 5,
  "dropout": 0.1,
  "emb_dim": 768,
  "embed_init_std": 0.02209708691207961,
  "encoder_only": true,
  "end_n_top": 5,
  "eos_index": 1,
  "fp16": true,
  "gelu_activation": true,
  "group_by_size": true,
  "id2lang": {
    "0": "fr"
  },
  "init_std": 0.02,
  "is_encoder": true,
  "lang2id": {
    "fr": 0
  },
  "lang_id": 0,
  "langs": [
    "fr"
  ],
  "layer_norm_eps": 1e-12,
  "layerdrop": 0.0,
  "lg_sampling

In [12]:
# Tokenize the training abstracts, padding/truncating every sequence to `max_length`.
# The tokenizer is called directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments and returns the same encoding.
encodings_train = tokenizer(
    train_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings_train.keys())

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [13]:
# Tokenize the validation abstracts, padding/truncating every sequence to `max_length`.
# The tokenizer is called directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments and returns the same encoding.
encodings_valid = tokenizer(
    valid_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings_valid.keys())

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [14]:
# Tokenize the test abstracts, padding/truncating every sequence to `max_length`.
# The tokenizer is called directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments and returns the same encoding.
encodings_test = tokenizer(
    test_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
print('tokenizer outputs: ', encodings_test.keys())

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [15]:
# From the training encodings keep the token ids and the attention masks.
train_input_ids, train_attention_masks = (
    encodings_train[field] for field in ('input_ids', 'attention_mask')
)

In [16]:
# Same extraction for the validation and test encodings: token ids, then attention masks.
valid_input_ids, valid_attention_masks = (
    encodings_valid[field] for field in ('input_ids', 'attention_mask')
)
test_input_ids, test_attention_masks = (
    encodings_test[field] for field in ('input_ids', 'attention_mask')
)

### Convert to tensors and Make Dataloaders

In [17]:
# Each list of rows is first stacked into a single numpy array and then copied
# into a torch tensor. The tensor dtype is whatever numpy infers from the rows.
# NOTE(review): labels are not cast here — confirm the training loop converts
# them to the dtype its loss function expects.
train_inputs_tensor, train_masks_tensor, train_labels_tensor = (
    torch.tensor(np.array(rows))
    for rows in (train_input_ids, train_attention_masks, train_labels)
)

validation_inputs_tensor, validation_masks_tensor, validation_labels_tensor = (
    torch.tensor(np.array(rows))
    for rows in (valid_input_ids, valid_attention_masks, valid_labels)
)

test_inputs_tensor, test_masks_tensor, test_labels_tensor = (
    torch.tensor(np.array(rows))
    for rows in (test_input_ids, test_attention_masks, test_labels)
)

In [18]:
# Batch size shared by all three loaders (a power of 2 is recommended).
batch_size = 8

# Training data: (input ids, attention mask, labels) triples, drawn in a random
# order that changes on every pass over the loader.
train_data = TensorDataset(train_inputs_tensor, train_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation data is iterated sequentially: a random sampler would yield a
# different order on every pass, so predictions could not be matched back to
# the rows of df_valid / df_test. Metrics computed over the full split do not
# depend on the order.
validation_data = TensorDataset(validation_inputs_tensor, validation_masks_tensor, validation_labels_tensor)
validation_sampler = torch.utils.data.SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs_tensor, test_masks_tensor, test_labels_tensor)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [19]:
# Directory that receives the serialized dataloaders. It is created up front
# (including parents) because torch.save does not create missing directories.
save_path = Path('FlauBERT/dataloaders/')
save_path.mkdir(parents=True, exist_ok=True)

In [20]:
# Serialize each DataLoader object under `save_path`; the file name records the
# batch size and maximum sequence length used to build it.
loaders_to_save = (
    (train_dataloader, f'train_data_loader-{batch_size}-{max_length}'),
    (validation_dataloader, f'validation_data_loader-{batch_size}-{max_length}'),
    (test_dataloader, f'test_data_loader-{batch_size}-{max_length}'),
)
for loader, file_name in loaders_to_save:
    torch.save(loader, save_path/file_name)