In [None]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
#import os
#import re
import nltk
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer , AutoTokenizer
from transformers import TFDistilBertForSequenceClassification  , AutoModelForTokenClassification ,AutoModelForSequenceClassification, BertConfig,BertForSequenceClassification
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from collections import Counter

In [None]:
def remove_punct(text):
    r"""Strip punctuation from ``text`` and normalise its whitespace.

    Every maximal run of word characters (regex ``\w+``: letters, digits and
    underscore) is kept; the runs are joined with single spaces, so the result
    has no leading, trailing or repeated whitespace.

    Args:
        text: The string to clean.

    Returns:
        The word tokens of ``text`` separated by single spaces, or an empty
        string when ``text`` contains no word character.
    """
    # Function-scope import: the notebook's top-level `import re` is commented out.
    import re

    # Yields the same tokens as nltk.RegexpTokenizer(r"\w+").tokenize(text),
    # without building a tokenizer object for every row passed through
    # DataFrame.apply.
    return " ".join(re.findall(r"\w+", text))

In [None]:
# 1. Importer les bibliothèques nécessaires
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
# 2. Prepare the data
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and attaches its label.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of labels aligned with ``texts``; each
            entry may be a plain number/sequence or an existing tensor.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` and returning a mapping of
            tensors that carry a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors plus a ``labels`` entry."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a sequence axis of length 1.
        item = {key: val.squeeze(0) for key, val in encoded.items()}

        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # torch.tensor() on an existing tensor emits a copy-construct
            # warning; clone().detach() is the recommended equivalent.
            item['labels'] = label.clone().detach()
        else:
            item['labels'] = torch.tensor(label)
        return item

In [None]:
# Load the training set
df = pd.read_csv('/kaggle/input/celp/train_data.csv')  # Update with the correct path
# Clean the raw text in place before it is handed to the tokenizer.
df["text"] = df["text"].apply(remove_punct)
texts, labels = df['text'].tolist(), df['level'].tolist()

## Modélisation

In [None]:
def one_hot_encoding(labels, num_classes):
    """Return one one-hot row per entry of ``labels``.

    A ``num_classes`` x ``num_classes`` identity matrix is built and its rows
    are selected by ``labels``, so row ``i`` of the result has a single 1 at
    column ``labels[i]``.
    """
    identity = torch.eye(num_classes)
    encoded = identity[labels]
    return encoded



In [None]:
# Split the data (85% train / 15% validation)
# A fixed seed keeps the partition identical across kernel restarts, so the
# validation metrics of different runs are comparable.
# NOTE(review): consider stratify=labels if every level has at least 2 samples.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.15, random_state=42
)
# Labels are stored as one-hot rows of width 6 from here on.
train_labels = one_hot_encoding(train_labels, 6)
val_labels = one_hot_encoding(val_labels, 6)

In [None]:
# Initialise the tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `device` is deliberately not passed here: it is not a tokenizer option.
# Only the model and its input tensors are moved to `device`.
tokenizer = AutoTokenizer.from_pretrained('AbdulSami/bert-base-cased-cefr')

In [None]:
# Build the training and validation datasets from their (texts, labels) pairs
train_dataset, val_dataset = (
    TextDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

In [None]:
# 3. Load the pre-trained BERT sequence classifier
import os  # cell-local: the notebook's top-level `import os` is commented out

model_name = 'AbdulSami/bert-base-cased-cefr'
# Resume from the local fine-tuning checkpoint only when it is actually on
# disk. It presumably exists only after an earlier training run wrote to
# ./results, so a fresh session falls back to the hub weights instead of failing.
checkpoint_dir = '/kaggle/working/results/checkpoint-2176'
if os.path.isdir(checkpoint_dir):
    model_name = checkpoint_dir

config = BertConfig.from_pretrained(model_name, num_labels=6)

model = BertForSequenceClassification.from_pretrained(model_name, config=config)

model = model.to(device)

In [None]:
# 4. Define the DataLoaders (only the training loader shuffles)
train_loader, val_loader = (
    DataLoader(train_dataset, shuffle=True, batch_size=16),
    DataLoader(val_dataset, batch_size=16),
)

In [None]:
! pip install evaluate
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` evaluation pair.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of class scores per example. ``labels`` holds either one-hot
            rows or 1-D class indices.

    Returns:
        The result of ``accuracy.compute`` on the predicted and reference
        class indices.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    labels = np.asarray(labels)
    # One-hot references are reduced to class indices; labels that are already
    # 1-D indices are used as they are (argmax over axis 1 would fail on them).
    if labels.ndim > 1:
        labels = np.argmax(labels, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
# 5. Training configuration: evaluate and checkpoint once per epoch
training_args = TrainingArguments(**{
    "output_dir": './results',
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.02,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": './logs',
})

from sklearn.utils import compute_class_weight

# "balanced" per-class weights computed over the whole `level` column,
# one weight per distinct level in sorted order (np.unique sorts).
class_weights = compute_class_weight(
    y=df['level'],
    classes=np.unique(df['level']),
    class_weight="balanced",
)
def one_hot_encoding(labels, num_classes):
    """Select rows of a ``num_classes``-sized identity matrix by ``labels``.

    Row ``i`` of the result is the one-hot vector for ``labels[i]``.
    """
    eye = torch.eye(num_classes)
    return eye[labels]

class CustomTrainer(Trainer):
    """``Trainer`` whose loss is a class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    array each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy for one batch.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch mapping; must contain ``"labels"``, given either as
                one-hot rows or as 1-D class indices.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with transformers
                versions that pass it; not used.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Forward pass without the labels, so the model does not also compute
        # its own built-in loss; `inputs` itself is left untouched.
        outputs = model(**{name: value for name, value in inputs.items() if name != "labels"})
        logits = outputs.get("logits")

        num_labels = self.model.config.num_labels
        flat_logits = logits.view(-1, num_labels)

        # The datasets in this notebook store one-hot rows; reduce them to
        # class indices. Labels that are already indices are used directly.
        if labels.dim() > 1:
            targets = labels.view(-1, num_labels).argmax(dim=-1)
        else:
            targets = labels.view(-1).long()
        targets = targets.to(flat_logits.device)

        # class_weights is a float64 NumPy array; the weight tensor must match
        # the dtype and device of the logits.
        weight = torch.as_tensor(class_weights, dtype=flat_logits.dtype, device=flat_logits.device)
        # torch.nn is reached through `torch`, because `import torch.nn as nn`
        # only appears in a later cell.
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(flat_logits, targets)

        return (loss, outputs) if return_outputs else loss


In [None]:
!wandb login --relogin 5f8ae5d90931deb3c0da631b259c98acbb7b9bcb

In [None]:
import wandb
import torch.nn as nn
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Training configuration for the 10-epoch run: evaluate and checkpoint every epoch
training_args = TrainingArguments(**{
    "output_dir": './results',
    "num_train_epochs": 10,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.02,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": './logs',
})

# The stock Trainer is used here, so the loss is the model's built-in one.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)
trainer.train()

In [None]:
# 6. Evaluate the model on the trainer's evaluation dataset
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Run inference on the validation dataset
predictions = trainer.predict(val_dataset)

# `predictions.predictions` holds the logits; the arg-max over the last axis
# gives the predicted class index of each example.
logits = predictions.predictions
y_pred = logits.argmax(axis=-1)

In [None]:
# val_dataset.labels holds one-hot rows while y_pred holds class indices;
# confusion_matrix needs both as 1-D class indices.
y_true = np.asarray(val_dataset.labels)
if y_true.ndim > 1:
    y_true = y_true.argmax(axis=1)

# Passing `labels` pins the matrix to 6x6 so it always lines up with the tick
# labels, even when a class is absent from this split.
class_ids = [0, 1, 2, 3, 4, 5]
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# val_dataset.labels holds one-hot rows while y_pred holds class indices;
# confusion_matrix needs both as 1-D class indices.
y_true = np.asarray(val_dataset.labels)
if y_true.ndim > 1:
    y_true = y_true.argmax(axis=1)

# Passing `labels` pins the matrix to 6x6 so it always lines up with the tick
# labels, even when a class is absent from this split.
class_ids = [0, 1, 2, 3, 4, 5]
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix with custom loss')
plt.show()

In [None]:
# Run inference on the training dataset
predictions = trainer.predict(train_dataset)

# `predictions.predictions` holds the logits
logits = predictions.predictions

# Convert the logits to predicted class indices
y_pred_train = logits.argmax(axis=-1)

# train_dataset.labels holds one-hot rows while y_pred_train holds class
# indices; confusion_matrix needs both as 1-D class indices.
y_true_train = np.asarray(train_dataset.labels)
if y_true_train.ndim > 1:
    y_true_train = y_true_train.argmax(axis=1)

# Passing `labels` pins the matrix to 6x6 so it always lines up with the tick labels.
class_ids = [0, 1, 2, 3, 4, 5]
cm = confusion_matrix(y_true_train, y_pred_train, labels=class_ids)

fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_ids, yticklabels=class_ids, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
import pandas as pd
from torch.utils.data import DataLoader

# Load the test set and clean its text the same way as the training text
test_df = pd.read_csv("/kaggle/input/celp/test_data.csv")  # Update with the correct path
test_df["text"] = test_df["text"].apply(remove_punct)
test_texts, test_ids = test_df['text'].tolist(), test_df['Id'].tolist()

# Prepare the test dataset
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one unlabeled text per access.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` and returning a mapping of
            tensors that carry a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors without the batch axis."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a sequence axis of length 1.
        return {key: val.squeeze(0) for key, val in encoded.items()}

test_dataset = TestDataset(test_texts, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

# Inference: switch the model to evaluation mode and collect the arg-max
# class of every test example, batch by batch, without tracking gradients.
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # Move every tensor of the batch to the device the model lives on.
        on_device = {name: tensor.to(model.device) for name, tensor in batch.items()}
        outputs = model(**on_device)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())

# Write the submission file: one predicted level per test Id
submission_df = pd.DataFrame({'Id': test_ids, 'level': predictions})
submission_df.to_csv('submission.csv', index=False)

print("Fichier de soumission créé avec succès !")

### BERT fine-tuning with LoRA

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Rebuild the datasets and loaders so they use the base BERT tokenizer
train_dataset, val_dataset = (
    TextDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)
train_loader, val_loader = (
    DataLoader(train_dataset, shuffle=True, batch_size=16),
    DataLoader(val_dataset, batch_size=16),
)

In [None]:
! pip install peft
from peft import LoraConfig, TaskType
from peft import get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=1, lora_alpha=1, lora_dropout=0.1
)

In [None]:
from transformers import BertForSequenceClassification

# Load base BERT with a 6-way classification head and wrap it with the LoRA adapters
model = get_peft_model(
    BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6),
    lora_config,
)

# Count the trainable parameters
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report how many parameters are actually trained once the LoRA wrapper is applied.
print(f"Number of trainable parameters: {count_parameters(model):,}")

In [None]:
!pip install evaluate
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one row of
            class scores per example. ``labels`` holds either one-hot rows or
            1-D class indices.

    Returns:
        The result of ``metric.compute`` on the predicted and reference class
        indices.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    # The datasets in this notebook carry one-hot labels; reduce them to class
    # indices so the references have the same 1-D form as the predictions.
    if labels.ndim > 1:
        labels = np.argmax(labels, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration for the LoRA run: 5 epochs, evaluate and checkpoint every epoch
training_args = TrainingArguments(**{
    "output_dir": './results',
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.02,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": './logs',
})

# Train the LoRA-wrapped model with the stock Trainer.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)
trainer.train()