In [1]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.append('../AJA')
import AJA as aja

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
# récupération des données 
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

# feature extraction

from sklearn.preprocessing import StandardScaler

# node

scaler = StandardScaler()
# sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

df_train_nodes['nb_occurences'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_train_nodes['nb_occurences'] = scaler.fit_transform(df_train_nodes['nb_occurences'].values.reshape(-1, 1))
df_test_nodes['nb_occurences'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test_nodes['nb_occurences'] = scaler.transform(df_test_nodes['nb_occurences'].values.reshape(-1, 1))

df_train_nodes['nb_words_more_7'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_7'] = scaler.fit_transform(df_train_nodes['nb_words_more_7'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_7'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_7'] = scaler.transform(df_test_nodes['nb_words_more_7'].values.reshape(-1, 1))

# speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)

# TFIDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
df_train_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_train_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

tfidf_matrix_test = tfidf_vectorizer.fit_transform(df_test_nodes['text'])
df_test_nodes['tfidf_sum'] = tfidf_matrix_test.sum(axis=1)
df_test_nodes['tfidf_max'] = tfidf_matrix_test.max(axis=1).toarray().flatten()

df_train_nodes['tfidf_sum'] = scaler.fit_transform(df_train_nodes['tfidf_sum'].values.reshape(-1,1))
df_test_nodes['tfidf_sum'] = scaler.transform(df_test_nodes['tfidf_sum'].values.reshape(-1,1))

df_train_nodes['tfidf_max'] = scaler.fit_transform(df_train_nodes['tfidf_max'].values.reshape(-1,1))
df_test_nodes['tfidf_max'] = scaler.transform(df_test_nodes['tfidf_max'].values.reshape(-1,1))

df_train_nodes['yeah'] = df_train_nodes['text'].apply(lambda x: 1 if 'yeah' in x.lower() else 0)
df_test_nodes['yeah'] = df_test_nodes['text'].apply(lambda x: 1 if 'yeah' in x.lower() else 0)


# edge
new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)


In [59]:
# creation des graphs
train_graphs, test_graphs = aja.make_graphs(df_train_nodes, df_train_edges, df_test_nodes, df_test_edges)
N_features = train_graphs['ES2002a'].x.shape[1]
train_graphs, validation_graphs = aja.train_validation_split(train_graphs, 0.2)

In [60]:
N_features

394

In [61]:
df_train_nodes.head(20)

Unnamed: 0,transcription,line,text,label,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,...,sentence_length,nb_occurences,nb_words_more_7,speaker_0,speaker_1,speaker_2,speaker_3,tfidf_sum,tfidf_max,yeah
0,ES2002a,0,Okay,0,-0.057809,-0.085828,-0.03572,-0.011185,0.062363,-0.023545,...,-1.008131,-0.368253,-0.647917,1,0,0,0,-1.168929,1.439285,0
1,ES2002a,1,Right,0,-0.054862,0.047607,-0.032626,-0.010949,-0.035741,-0.051808,...,-1.008131,-0.368253,-0.647917,1,0,0,0,-1.168929,1.439285,0
2,ES2002a,2,<vocalsound> Um well this is the kick-off meet...,1,-0.054665,-0.073837,-0.017161,-0.064276,0.004937,0.062475,...,0.789302,-0.368253,0.456915,1,0,0,0,1.228067,-0.927421,0
3,ES2002a,3,Um <vocalsound> and um,0,-0.010416,-0.072719,-0.017206,-0.088992,-0.048035,0.051155,...,-0.558773,1.365643,-0.647917,1,0,0,0,-0.514793,0.709401,0
4,ES2002a,4,this is just what we're gonna be doing over th...,0,-0.028654,-0.015151,0.09591,-0.059113,0.042067,0.033088,...,1.088874,-0.368253,-0.647917,1,0,0,0,1.838146,-1.725435,0
5,ES2002a,5,"Um so first of all , just to kind of make sure...",0,-0.028386,-0.046021,0.023957,-0.064278,-0.0064,-0.002545,...,1.688019,-0.368253,-0.647917,1,0,0,0,1.729041,-1.117057,0
6,ES2002a,6,I'm Laura and I'm the project manager .,0,0.01553,-0.059705,0.051351,0.020685,0.07222,-0.019084,...,0.040372,-0.368253,-0.647917,1,0,0,0,-0.087169,0.102399,0
7,ES2002a,7,<vocalsound> Do you want to introduce yourself...,0,-0.079113,-0.097972,0.070705,-0.016207,-0.01667,-0.007734,...,0.190158,-0.368253,1.561747,1,0,0,0,0.585792,-0.726315,0
8,ES2002a,8,Great .,0,-0.106075,-0.030617,-0.090078,0.004343,0.043766,0.066486,...,-0.858345,-0.368253,-0.647917,0,1,0,0,-1.168929,1.439285,0
9,ES2002a,9,"Hi , I'm David and I'm supposed to be an indus...",0,-0.014584,0.074371,0.048543,-0.006655,-0.041892,-0.029181,...,0.789302,-0.368253,2.66658,0,0,1,0,0.832774,-1.193867,0


In [62]:
class MultiChannelsGCN(torch.nn.Module):
    
    def __init__(self, channels, input_dim, post_conv_dim, output_dim, identity=False):
        super(MultiChannelsGCN, self).__init__()
        self.identity = identity
        self.channels = channels
        self.input_dim = input_dim
        self.post_conv_dim = post_conv_dim
        self.output_dim = output_dim
        self.GCN = nn.ModuleList([GCNConv(input_dim, post_conv_dim) for _ in range(channels)])
        if identity:
            self.dense = nn.Linear(post_conv_dim * (channels + 1), output_dim)
            self.denseID = nn.Linear(input_dim, post_conv_dim)
        else:
            self.dense = nn.Linear(post_conv_dim * channels, output_dim)

    def forward(self, nodes, edges):
        X = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.post_conv_dim)
            else:
                x = F.relu(self.GCN[k](nodes, edges[k]))
            X.append(x)
        if self.identity:
            X.append(F.relu(self.denseID(nodes)))
        concat = torch.cat(X, dim=1)
        return F.relu(self.dense(concat))

In [67]:
# on définie son plus beau modèle

class NodeClassifier(torch.nn.Module):
    def __init__(self, channels, input_dim):
        super(NodeClassifier, self).__init__()
        self.GCN1 = MultiChannelsGCN(channels, input_dim, 50, 20, identity=True)
        #self.dropout = nn.Dropout(0.3)  
        self.dense1 = nn.Linear(20,1)


    def forward(self, data):
        nodes, edges = data.x, data.edge_index
        x = self.GCN1(nodes, edges)
        #x = self.dropout(x)
        x = self.dense1(x)
        x = torch.sigmoid(x)
        return x

    def predict(self, graph):
        self.eval()
        with torch.no_grad():
            logits = self.forward(graph)
        return np.array((logits > 0.5).int()).flatten()

In [68]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score

class AjaPyTorchWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def fit(self, train_graph_dict, validation_graph_dict, verbose=1, max_epochs=10):
        # Training logic using your PyTorch model
        # ...
                
        # Move the model and data to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(device)

        # Use DataLoader to create batches of data
        train_loader = DataLoader(list(train_graph_dict.values()), batch_size=1, shuffle=True)
        N_train = len(train_loader)
        validation_loader = DataLoader(list(validation_graph_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)

        if verbose > 0:
            print('Training on', N_train, 'graphs, validating on', N_validation, 'graphs')

        # Train the model
        model_name = "model_py_torch"
        best_f1_score = 0
        for epoch in range(max_epochs):
            if verbose > 0:
                print('- Epoch', f'{epoch + 1:03d}', '-')
            # training
            self.model.train()
            total_loss = 0
            for data in train_loader:
                data = data.to(device)
                self.optimizer.zero_grad()
                output = self.model(data).squeeze()
                loss = self.criterion(output, data.y.float())
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            average_loss = total_loss / N_train
            if verbose > 1:
                print('Loss:', f'{average_loss:.4f}')
            

            # Evaluate the model on the training set
            self.model.eval()
            f1_moyen_train = 0
            for data in train_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_train += f1
            f1_moyen_train /= N_train
            if verbose > 1:
                print('F1 train:', f1_moyen_train)

            # Evaluate the model on the validation set
            self.model.eval()
            f1_moyen_valid = 0
            for data in validation_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_valid += f1
            f1_moyen_valid /= N_validation
            if verbose > 1:
                print('F1 valid:', f1_moyen_valid)

            # callbacks ou autre
            if f1_moyen_valid > best_f1_score:
                if verbose > 1:
                    print('It\'s better !' )
                torch.save(self.model.state_dict(), "training_states/" + model_name + "-best.pth")
            else:
                self.optimizer.param_groups[0]['lr'] /= 2
                if verbose > 1:
                    print('Learning rate reduced to:', self.optimizer.param_groups[0]['lr'])
            if verbose > 1:
                print('')
        
        if verbose > 0:
            print('Training finished !')

        self.model.load_state_dict(torch.load("training_states/" + model_name + "-best.pth"))

    def predict(self, graphs_dict):
        # Prediction logic using your PyTorch model
        # ...
        self.model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        result = {}
        for key, graph in graphs_dict.items():
            data = graph.to(device)
            y_pred = self.model.predict(data)
            result[key] = y_pred
        return result


    def score(self, graphs_dict):
        # Scoring logic using your PyTorch model
        # ...
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        validation_loader = DataLoader(list(graphs_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)
        self.model.eval()
        f1_moyen_valid = 0
        for data in validation_loader:
            data = data.to(device)
            y_pred = self.model.predict(data)
            y_true = data.y.cpu().numpy()
            f1 = f1_score(y_true, y_pred)
            f1_moyen_valid += f1
        f1_moyen_valid /= N_validation
        return f1_moyen_valid
    
    def predict_proba(self, graphs_dict):
        # Prediction logic using your PyTorch model
        # ...
        self.model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        result = {}
        for key, graph in graphs_dict.items():
            data = graph.to(device)
            output = self.model(data).squeeze()
            # Ajouter une fonction d'activation (softmax, sigmoïde, etc.) selon votre modèle
            probabilities = torch.softmax(output, dim=-1)
            result[key] = probabilities.cpu().detach().numpy()
        return result


In [69]:
from sklearn.metrics import f1_score

def f1_moyen(pred_dict, true_graphs_dict):
    f1_moyen = 0
    for key, pred in pred_dict.items():
        y_true = true_graphs_dict[key].y.numpy()
        f1_moyen += f1_score(y_true, pred)
    f1_moyen /= len(pred_dict)
    return f1_moyen

In [72]:
# Create an instance of your PyTorch model
pytorch_model = NodeClassifier(32, N_features)

# Define the loss function and optimizer
alpha = 0.15730642604852357
lr = 0.002272131994333311
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor((1 - alpha) / alpha), reduction='mean')
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)

# Create an instance of the custom wrapper
model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)

# Fit, predict, and score using scikit-learn-like API
model.fit(train_graphs, validation_graphs, max_epochs=20,verbose=2)
y_pred = model.predict(test_graphs)
print(model.score(validation_graphs))

Training on 78 graphs, validating on 19 graphs
- Epoch 001 -
Loss: 1.0914
F1 train: 0.556367328343316
F1 valid: 0.5731097011052926
It's better !

- Epoch 002 -
Loss: 1.0677
F1 train: 0.5732777806549613
F1 valid: 0.5816558826246276
It's better !

- Epoch 003 -
Loss: 1.0606
F1 train: 0.5719429191063462
F1 valid: 0.5810750006328472
It's better !

- Epoch 004 -
Loss: 1.0548
F1 train: 0.597146585157545
F1 valid: 0.5947773616474321
It's better !

- Epoch 005 -
Loss: 1.0494
F1 train: 0.6050428200579929
F1 valid: 0.5954199027780251
It's better !

- Epoch 006 -
Loss: 1.0441
F1 train: 0.6130855106725387
F1 valid: 0.5970201378144628
It's better !

- Epoch 007 -
Loss: 1.0379
F1 train: 0.6273025746205838
F1 valid: 0.6008671292580287
It's better !

- Epoch 008 -
Loss: 1.0331
F1 train: 0.6379921709749488
F1 valid: 0.5962404102914051
It's better !

- Epoch 009 -
Loss: 1.0263
F1 train: 0.6551096707680432
F1 valid: 0.6005420673174923
It's better !

- Epoch 010 -
Loss: 1.0216
F1 train: 0.6213508652823464

In [73]:
import random

def get_bagging_models(n_bagging, train_graphs):
    models = []
    for i in range(n_bagging):
    
        print('Bagging', i+1)
        pytorch_model = NodeClassifier(32, N_features)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor((1 - alpha) / alpha), reduction='mean')
        optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.01)
        model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
        keys = list(train_graphs.keys())
        bagging_train_graphs = {}
        bagging_validation_graphs = {}
        samples = random.choices(keys, k=len(keys))
        c_train = 0
        c_validation = 0
        for key in keys:
            if key in samples:
                bagging_train_graphs[c_train] = train_graphs[key]
                c_train += 1
            else:
                bagging_validation_graphs[c_validation] = train_graphs[key]
                c_validation += 1
        model.fit(bagging_train_graphs, bagging_validation_graphs, max_epochs=6, verbose=0)
        models.append(model)
        print('F1 score:', model.score(bagging_validation_graphs))
    return models

def predict_bagging(models, graphs_dict):
    result = {}
    for key, graph in graphs_dict.items():
        y_pred = 0
        for model in models:
            y_pred += model.predict({key: graph})[key]
        y_pred =  y_pred / len(models)
        y_pred = np.array((y_pred > 0.5).astype(int)).flatten()
        result[key] = y_pred
    return result



In [74]:
models = get_bagging_models(10, {**train_graphs, **validation_graphs})
test_prediction = predict_bagging(models, test_graphs)

Bagging 1
F1 score: 0.6004894427974068
Bagging 2
F1 score: 0.5405661662038173
Bagging 3
F1 score: 0.5913789909886713
Bagging 4
F1 score: 0.5797480448358324
Bagging 5
F1 score: 0.5891011636575166
Bagging 6
F1 score: 0.5985572465862873
Bagging 7
F1 score: 0.5725509234428762
Bagging 8
F1 score: 0.5185109392906673
Bagging 9
F1 score: 0.5533477511707815
Bagging 10
F1 score: 0.5494165920520147


In [75]:
aja.make_test_csv_submission_from_dict(test_prediction, 'bagging_alice')