In [1]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import pandas as pd
import difflib

import sys
sys.path.append('../AJA')
import AJA as aja

In [2]:
# récupération des données 
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

In [3]:
# feature extraction

from sklearn.preprocessing import StandardScaler

# node

scaler = StandardScaler()
# sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

df_train_nodes['nb_occurences'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_train_nodes['nb_occurences'] = scaler.fit_transform(df_train_nodes['nb_occurences'].values.reshape(-1, 1))
df_test_nodes['nb_occurences'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test_nodes['nb_occurences'] = scaler.transform(df_test_nodes['nb_occurences'].values.reshape(-1, 1))


df_train_nodes['nb_words_more_5'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_5'] = scaler.fit_transform(df_train_nodes['nb_words_more_5'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_5'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_5'] = scaler.transform(df_test_nodes['nb_words_more_5'].values.reshape(-1, 1))


# speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)

In [4]:

new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)




In [5]:
# Charger l'extension autoreload
%load_ext autoreload

# Configurer autoreload pour recharger tous les modules avant l'exécution de chaque cellule
%autoreload 2

In [6]:
# creation des graphs
train_graphs, test_graphs = aja.make_graphs(df_train_nodes, df_train_edges, df_test_nodes, df_test_edges)
N_features = train_graphs['ES2002a'].x.shape[1]
train_graphs, validation_graphs = aja.train_validation_split(train_graphs, 0.2)

In [7]:
N_features

391

In [8]:
class MultiChannelsGCN(torch.nn.Module):
    
    def __init__(self, channels, input_dim, post_conv_dim, output_dim, identity=False):
        super(MultiChannelsGCN, self).__init__()
        self.identity = identity
        self.channels = channels
        self.input_dim = input_dim
        self.post_conv_dim = post_conv_dim
        self.output_dim = output_dim
        self.GCN = nn.ModuleList([GCNConv(input_dim, post_conv_dim) for _ in range(channels)])
        if identity:
            self.dense = nn.Linear(post_conv_dim * (channels + 1), output_dim)
            self.denseID = nn.Linear(input_dim, post_conv_dim)
        else:
            self.dense = nn.Linear(post_conv_dim * channels, output_dim)

    def forward(self, nodes, edges):
        X = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.post_conv_dim)
            else:
                x = F.relu(self.GCN[k](nodes, edges[k]))
            X.append(x)
        if self.identity:
            X.append(F.relu(self.denseID(nodes)))
        concat = torch.cat(X, dim=1)
        return F.relu(self.dense(concat))

In [9]:
# on définie son plus beau modèle

class NodeClassifier(torch.nn.Module):
    def __init__(self, channels, input_dim):
        super(NodeClassifier, self).__init__()
        self.GCN1 = MultiChannelsGCN(channels, input_dim, 50, 20, identity=True)
        self.dense1 = nn.Linear(20,1)

    def forward(self, data):
        nodes, edges = data.x, data.edge_index
        x = self.GCN1(nodes, edges)
        x = self.dense1(x)
        x = torch.sigmoid(x)
        return x

    def predict(self, graph):
        self.eval()
        with torch.no_grad():
            logits = self.forward(graph)
        return np.array((logits > 0.5).int()).flatten()

In [10]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score

class AjaPyTorchWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def fit(self, train_graph_dict, validation_graph_dict, verbose=1, max_epochs=10):
        # Training logic using your PyTorch model
        # ...
                
        # Move the model and data to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(device)

        # Use DataLoader to create batches of data
        train_loader = DataLoader(list(train_graph_dict.values()), batch_size=1, shuffle=True)
        N_train = len(train_loader)
        validation_loader = DataLoader(list(validation_graph_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)

        if verbose > 0:
            print('Training on', N_train, 'graphs, validating on', N_validation, 'graphs')

        # Train the model
        model_name = "model_py_torch"
        best_f1_score = 0
        for epoch in range(max_epochs):
            if verbose > 0:
                print('- Epoch', f'{epoch + 1:03d}', '-')
            # training
            self.model.train()
            total_loss = 0
            for data in train_loader:
                data = data.to(device)
                self.optimizer.zero_grad()
                output = self.model(data).squeeze()
                loss = self.criterion(output, data.y.float())
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            average_loss = total_loss / N_train
            if verbose > 1:
                print('Loss:', f'{average_loss:.4f}')
            

            # Evaluate the model on the training set
            self.model.eval()
            f1_moyen_train = 0
            for data in train_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_train += f1
            f1_moyen_train /= N_train
            if verbose > 1:
                print('F1 train:', f1_moyen_train)

            # Evaluate the model on the validation set
            self.model.eval()
            f1_moyen_valid = 0
            for data in validation_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_valid += f1
            f1_moyen_valid /= N_validation
            if verbose > 1:
                print('F1 valid:', f1_moyen_valid)

            # callbacks ou autre
            if f1_moyen_valid > best_f1_score:
                if verbose > 1:
                    print('It\'s better !' )
                torch.save(self.model.state_dict(), "training_states/" + model_name + "-best.pth")
            else:
                self.optimizer.param_groups[0]['lr'] /= 2
                if verbose > 1:
                    print('Learning rate reduced to:', self.optimizer.param_groups[0]['lr'])
            if verbose > 1:
                print('')
        
        if verbose > 0:
            print('Training finished !')

        self.model.load_state_dict(torch.load("training_states/" + model_name + "-best.pth"))

    def predict(self, graphs_dict):
        # Prediction logic using your PyTorch model
        # ...
        self.model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        result = {}
        for key, graph in graphs_dict.items():
            data = graph.to(device)
            y_pred = self.model.predict(data)
            result[key] = y_pred
        return result


    def score(self, graphs_dict):
        # Scoring logic using your PyTorch model
        # ...
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        validation_loader = DataLoader(list(graphs_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)
        self.model.eval()
        f1_moyen_valid = 0
        for data in validation_loader:
            data = data.to(device)
            y_pred = self.model.predict(data)
            y_true = data.y.cpu().numpy()
            f1 = f1_score(y_true, y_pred)
            f1_moyen_valid += f1
        f1_moyen_valid /= N_validation
        return f1_moyen_valid


In [11]:
from sklearn.metrics import f1_score

def f1_moyen(pred_dict, true_graphs_dict):
    f1_moyen = 0
    for key, pred in pred_dict.items():
        y_true = true_graphs_dict[key].y.numpy()
        f1_moyen += f1_score(y_true, pred)
    f1_moyen /= len(pred_dict)
    return f1_moyen

In [143]:
df = pd.DataFrame(columns=['A', 'B'])
df.loc[len(df)] = {'A': 1, 'B': 2}
df

Unnamed: 0,A,B
0,1,2


In [18]:
alpha, lr, nb_epochs = 4, 0.01, 3

pytorch_model = NodeClassifier(32, N_features)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([alpha]), reduction='mean')
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
train_graphs, validation_graphs = aja.train_validation_split({**train_graphs, **validation_graphs}, 0.2)
model.fit(train_graphs, validation_graphs, max_epochs=nb_epochs, verbose=2)
print(model.score(validation_graphs))

Training on 78 graphs, validating on 19 graphs
- Epoch 001 -
Loss: 1.0113
F1 train: 0.5796432758090442
F1 valid: 0.5701720316700023
It's better !

- Epoch 002 -
Loss: 0.9904
F1 train: 0.5865508169597621
F1 valid: 0.5662656080982903
It's better !

- Epoch 003 -
Loss: 0.9781
F1 train: 0.5954466131521925
F1 valid: 0.565267445569058
It's better !

Training finished !
0.565267445569058


In [30]:
df = pd.DataFrame(columns=['alpha', 'nb_epochs', 'lr', 'f1_score_moyen', 'std'])

In [34]:
kappa_list = [3,4,5]
nb_epochs_list = [3]
lr_list = [0.01,0.001]

for kappa, lr, nb_epochs in tqdm([(kappa, lr, nb_epochs) for kappa in kappa_list for lr in lr_list for nb_epochs in nb_epochs_list]):
    # Fit, predict, and score using scikit-learn-like API
    f1s = []
    for k in range(3):
        pytorch_model = NodeClassifier(32, N_features)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]))
        optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
        model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
        train_graphs, validation_graphs = aja.train_validation_split({**train_graphs, **validation_graphs}, 0.2)
        model.fit(train_graphs, validation_graphs, max_epochs=nb_epochs, verbose=0)
        y_pred = model.predict(test_graphs)
        f1s.append(model.score(validation_graphs))
    df.loc[len(df)] = {'kappa': kappa, 'lr': lr, 'nb_epochs': nb_epochs, 'f1_score_moyen': np.mean(f1s), 'std': np.std(f1s)}

  0%|          | 0/6 [00:00<?, ?it/s]

 17%|█▋        | 1/6 [01:54<09:33, 114.73s/it]


KeyboardInterrupt: 

In [33]:
df

Unnamed: 0,alpha,nb_epochs,lr,f1_score_moyen,std
0,3,3,0.01,0.528567,0.026694
1,3,3,0.001,0.558419,0.007501
2,3,3,0.01,0.564446,0.010139
3,3,3,0.001,0.56562,0.005277
4,3,3,0.01,0.551379,0.013195
5,3,3,0.001,0.576493,0.012533


In [36]:
pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting future (from hyperopt)
  Downloading future-0.18.3.tar.gz (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.9/840.9 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting cloudpickle (from hyperopt)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Building wheels for collected packa

In [39]:
from hyperopt import fmin, tpe, hp

# Définir les hyperparamètres à optimiser
space = {
    'lr': hp.loguniform('lr', np.log(0.00001), np.log(0.1)),
    'kappa': hp.uniform('kappa', 1, 10),
    'epochs': hp.choice('epochs', [1, 2, 3, 4, 5, 6])
}

def objective(params):
    global train_graphs, validation_graphs, test_graphs, N_features

    # Convertir 'epochs' en int car hp.choice retourne un index
    epochs = int(params['epochs'])
    kappa = params['kappa']
    lr = params['lr']

    f1s = []
    for k in range(3):
        pytorch_model = NodeClassifier(32, N_features)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]))
        optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
        model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
        train_graphs, validation_graphs = aja.train_validation_split({**train_graphs, **validation_graphs}, 0.2)
        model.fit(train_graphs, validation_graphs, max_epochs=epochs, verbose=0)
        y_pred = model.predict(test_graphs)
        f1s.append(model.score(validation_graphs))

    return -np.mean(f1s)  # Hyperopt minimise la fonction, donc nous utilisons -F1

# Utiliser l'algorithme TPE pour l'optimisation bayésienne
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20)

# Afficher les meilleurs hyperparamètres trouvés
print("Meilleurs hyperparamètres:", best)


  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 20/20 [22:01<00:00, 66.09s/trial, best loss: -0.5863224603588252]
Meilleurs hyperparamètres: {'epochs': 1, 'kappa': 6.059207177707741, 'lr': 0.005724757872918405}


In [27]:
import random

def get_bagging_models(n_bagging, train_graphs):
    models = []
    scores = []
    for i in range(n_bagging):
        print('Bagging', i+1)
        pytorch_model = NodeClassifier(32, N_features)
        kapa = 4
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kapa]), reduction='mean')
        optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.01)
        model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
        keys = list(train_graphs.keys())
        bagging_train_graphs = {}
        bagging_validation_graphs = {}
        samples = random.choices(keys, k=len(keys))
        c_train = 0
        c_validation = 0
        for key in keys:
            if key in samples:
                bagging_train_graphs[c_train] = train_graphs[key]
                c_train += 1
            else:
                bagging_validation_graphs[c_validation] = train_graphs[key]
                c_validation += 1
        model.fit(bagging_train_graphs, bagging_validation_graphs, max_epochs=3, verbose=0)
        models.append(model)
        score = model.score(bagging_validation_graphs)
        scores.append(score)
        print('F1 score:', score)
    return models, scores

def predict_bagging(models, scores, graphs_dict):
    result = {}
    for key, graph in graphs_dict.items():
        y_pred = 0
        for i in range(len(models)):
            y_pred += models[i].predict({key: graph})[key]
        y_pred =  y_pred / len(models)
        y_pred = np.array((y_pred > 0.5).astype(int)).flatten()
        result[key] = y_pred
    return result



In [28]:
models, scores = get_bagging_models(5, train_graphs)
valid_predicitons = predict_bagging(models, scores, validation_graphs)
print(f1_moyen(valid_predicitons, validation_graphs))

Bagging 1
F1 score: 0.5790468997882872
Bagging 2
F1 score: 0.5639653174552394
Bagging 3
F1 score: 0.5748271393944064
Bagging 4
F1 score: 0.554062916995801
Bagging 5
F1 score: 0.5800797051670757
0.5567616820082427


In [89]:
models = get_bagging_models(10, {**train_graphs, **validation_graphs})
test_prediction = predict_bagging(models, test_graphs)
aja.make_test_csv_submission_from_dict(test_prediction, 'bagging')

Bagging 1
F1 score: 0.5722531018403533
Bagging 2
F1 score: 0.5548872439522106
Bagging 3
F1 score: 0.5438859282730878
Bagging 4
F1 score: 0.5529450826470101
Bagging 5
F1 score: 0.5876842166891876
Bagging 6
F1 score: 0.5715153451886076
Bagging 7
F1 score: 0.5719939080620162
Bagging 8
F1 score: 0.5855953315690103
Bagging 9
F1 score: 0.5609144545158112
Bagging 10
F1 score: 0.5563821152963317
