In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import torch

2024-04-12 13:57:36.611002: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-12 13:57:36.677428: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Charger les données

In [2]:
# Absolute path of the cleaned, comma-separated dataset.
csv_path = "/home/onyxia/work/NLP_3A_ENSAE/data_bases/data_cleaned.csv"
data_frame = pd.read_csv(csv_path, sep=",")

In [3]:
# Preview the first five rows to check the columns that were loaded.
data_frame.head(n=5)

Unnamed: 0.1,Unnamed: 0,sex,first_name,surname
0,0,femme,Marie,Chardon
1,1,homme,Louis,Lhopital
2,2,femme,Marie,Pyrin
3,3,femme,Marie,Lavocat
4,4,femme,Marguerite,Benne


In [4]:
# Model inputs are the first names; targets are the values of the 'sex' column.
first_name_column = data_frame['first_name']
sex_column = data_frame['sex']
texts = first_name_column.to_list()
labels = sex_column.to_list()


# Télécharger le modèle depuis Hugging Face et préparer les données pour le fine-tuning

In [5]:
# Hub identifier of the pretrained checkpoint that will be fine-tuned.
model_name = "padmajabfrl/Gender-Classification"

# The classifier and its tokenizer are both loaded from that same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Séparation des données en jeux d'entraînement et de test

In [6]:

# Hold out 20% of the examples for testing. A fixed random_state makes the
# split reproducible, so the held-out set is the same after a kernel restart.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=.2, random_state=42
)



In [7]:
# Encode the string labels as integers: women ('femme') -> 1, men ('homme') -> 0.
# NOTE(review): confirm this encoding agrees with the label ids the pretrained
# checkpoint was trained with (model.config.id2label).
label_map = {'homme': 0, 'femme': 1}
encoded_values = set(label_map.values())

# Labels that are already encoded pass through unchanged, so re-running this
# cell does not raise KeyError; any unknown raw label still raises KeyError.
train_labels = [label if label in encoded_values else label_map[label] for label in train_labels]
test_labels = [label if label in encoded_values else label_map[label] for label in test_labels]

Convert the data into a PyTorch Dataset

In [8]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label.

    Nothing in the class is specific to IMDb data: it works with any
    index-aligned sequences of texts and labels.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the raw inputs; tokenization is deferred to ``__getitem__``.

        Args:
            texts: Indexable collection of input strings.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
                padding=..., max_length=..., return_tensors='pt')`` that returns
                a mapping with 'input_ids' and 'attention_mask' tensors.
            max_length: Length passed to the tokenizer for padding/truncation.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        The dict has the keys 'input_ids', 'attention_mask' and 'labels'.
        """
        raw_text, raw_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) drops the leading size-1 axis (presumably the batch axis
        # added by return_tensors='pt') so each field describes a single example.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(raw_label)
        return sample


# Both splits share the tokenizer and the same fixed padded sequence length.
# NOTE(review): 128 tokens looks generous for single first names; a smaller
# value would likely speed up training -- confirm before changing.
MAX_LENGTH = 128
train_dataset, test_dataset = (
    IMDbDataset(split_texts, split_labels, tokenizer, max_length=MAX_LENGTH)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)


In [9]:


# Fine-tune the pretrained classifier on the first-name dataset.
# NOTE(review): the logged loss plateaus around 0.03-0.04 after roughly ten
# epochs; consider fewer epochs or early stopping on a validation set.
num_epochs=100
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed PyTorch so batch shuffling and dropout are repeatable across runs
# (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(42)

# Define your model
# Reloading the checkpoint here means a re-run of this cell starts from the
# pretrained weights rather than from a previously fine-tuned model.
model = DistilBertForSequenceClassification.from_pretrained('padmajabfrl/Gender-Classification')
model.to(device)
model.train()

# Define your optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the values the transformers version defaulted to.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Define your DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the notebook-level `labels` list built from the
        # dataframe is not overwritten by the last batch's tensor.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}')

# Evaluation
# Switch to evaluation mode (disables dropout) for the cells that follow.
model.eval()
# Define your evaluation loop if needed




Epoch 1/100, Loss: 0.4181
Epoch 2/100, Loss: 0.1376
Epoch 3/100, Loss: 0.0715
Epoch 4/100, Loss: 0.0991
Epoch 5/100, Loss: 0.0752
Epoch 6/100, Loss: 0.0631
Epoch 7/100, Loss: 0.0503
Epoch 8/100, Loss: 0.0498
Epoch 9/100, Loss: 0.0385
Epoch 10/100, Loss: 0.0415
Epoch 11/100, Loss: 0.0361
Epoch 12/100, Loss: 0.0361
Epoch 13/100, Loss: 0.0395
Epoch 14/100, Loss: 0.0349
Epoch 15/100, Loss: 0.0385
Epoch 16/100, Loss: 0.0403
Epoch 17/100, Loss: 0.0329
Epoch 18/100, Loss: 0.0415
Epoch 19/100, Loss: 0.0356
Epoch 20/100, Loss: 0.0361
Epoch 21/100, Loss: 0.0353
Epoch 22/100, Loss: 0.0328
Epoch 23/100, Loss: 0.0349
Epoch 24/100, Loss: 0.0360
Epoch 25/100, Loss: 0.0415
Epoch 26/100, Loss: 0.0353
Epoch 27/100, Loss: 0.0345
Epoch 28/100, Loss: 0.0500
Epoch 29/100, Loss: 0.0366
Epoch 30/100, Loss: 0.0352
Epoch 31/100, Loss: 0.0283
Epoch 32/100, Loss: 0.0359
Epoch 33/100, Loss: 0.0334
Epoch 34/100, Loss: 0.0361
Epoch 35/100, Loss: 0.0346
Epoch 36/100, Loss: 0.0368
Epoch 37/100, Loss: 0.0430
Epoch 38/1

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [10]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch

# Reuse the model fine-tuned in the training cell and the tokenizer that built
# its training inputs. Reloading the hub checkpoint here would discard the
# fine-tuned weights, while the 0 -> 'homme' / 1 -> 'femme' mapping below only
# describes the encoding used for fine-tuning.
model.eval()  # make sure dropout is disabled for inference

# Example input text
input_text = "Camille"

# Step 1: Tokenize the input text and move the tensors to the model's device
inputs = tokenizer(input_text, return_tensors='pt').to(model.device)

# Step 2: Perform inference (no gradients are needed)
with torch.no_grad():
    outputs = model(**inputs)

# Step 3: Get predictions (index of the largest logit)
predictions = torch.argmax(outputs.logits, dim=1)

# Convert predictions to human-readable labels
# Inverse of the encoding applied to the training labels.
label_map = {0: 'homme', 1: 'femme'}
predicted_label = label_map[predictions.item()]

print("Predicted label:", predicted_label)


Predicted label: femme
