In [10]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.append('../AJA')
import AJA as aja

In [11]:
# récupération des données 
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

In [12]:
# feature extraction

from sklearn.preprocessing import StandardScaler

# node

scaler = StandardScaler()
# sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

df_train_nodes['nb_occurences'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_train_nodes['nb_occurences'] = scaler.fit_transform(df_train_nodes['nb_occurences'].values.reshape(-1, 1))
df_test_nodes['nb_occurences'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test_nodes['nb_occurences'] = scaler.transform(df_test_nodes['nb_occurences'].values.reshape(-1, 1))


df_train_nodes['nb_words_more_5'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_5'] = scaler.fit_transform(df_train_nodes['nb_words_more_5'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_5'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 5 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_5'] = scaler.transform(df_test_nodes['nb_words_more_5'].values.reshape(-1, 1))


# speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)

# TFIDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
df_train_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_train_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

tfidf_matrix_test = tfidf_vectorizer.fit_transform(df_test_nodes['text'])
df_test_nodes['tfidf_sum'] = tfidf_matrix_test.sum(axis=1)
df_test_nodes['tfidf_max'] = tfidf_matrix_test.max(axis=1).toarray().flatten()

df_train_nodes['tfidf_sum'] = scaler.fit_transform(df_train_nodes['tfidf_sum'].values.reshape(-1,1))
df_test_nodes['tfidf_sum'] = scaler.transform(df_test_nodes['tfidf_sum'].values.reshape(-1,1))

df_train_nodes['tfidf_max'] = scaler.fit_transform(df_train_nodes['tfidf_max'].values.reshape(-1,1))
df_test_nodes['tfidf_max'] = scaler.transform(df_test_nodes['tfidf_max'].values.reshape(-1,1))

# Numbers of exclamation
def has_exclamation(text):
    # Check if '!' in the text is a digit
    return int(any(char == '!' for char in text))

df_train_nodes['has_exclamation'] = df_train_nodes['text'].apply(has_exclamation)
df_test_nodes['has_exclamation'] = df_test_nodes['text'].apply(has_exclamation)

# edge

new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)


In [13]:
# creation des graphs
train_graphs, test_graphs = aja.make_graphs(df_train_nodes, df_train_edges, df_test_nodes, df_test_edges)
N_features = train_graphs['ES2002a'].x.shape[1]
train_graphs, validation_graphs = aja.train_validation_split(train_graphs, 0.2)

In [14]:
N_features

394

In [15]:
class MultiChannelsGCN(torch.nn.Module):
    
    def __init__(self, channels, input_dim, post_conv_dim, output_dim, identity=False):
        super(MultiChannelsGCN, self).__init__()
        self.identity = identity
        self.channels = channels
        self.input_dim = input_dim
        self.post_conv_dim = post_conv_dim
        self.output_dim = output_dim
        self.GCN = nn.ModuleList([GCNConv(input_dim, post_conv_dim) for _ in range(channels)])
        if identity:
            self.dense = nn.Linear(post_conv_dim * (channels + 1), output_dim)
            self.denseID = nn.Linear(input_dim, post_conv_dim)
        else:
            self.dense = nn.Linear(post_conv_dim * channels, output_dim)

    def forward(self, nodes, edges):
        X = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.post_conv_dim)
            else:
                x = F.relu(self.GCN[k](nodes, edges[k]))
            X.append(x)
        if self.identity:
            X.append(F.relu(self.denseID(nodes)))
        concat = torch.cat(X, dim=1)
        return F.relu(self.dense(concat))

In [16]:
# on définie son plus beau modèle

class NodeClassifier(torch.nn.Module):
    def __init__(self, channels, input_dim):
        super(NodeClassifier, self).__init__()
        self.GCN1 = MultiChannelsGCN(channels, input_dim, 50, 20, identity=True)
        self.dense1 = nn.Linear(20,1)

    def forward(self, data):
        nodes, edges = data.x, data.edge_index
        x = self.GCN1(nodes, edges)
        x = self.dense1(x)
        x = torch.sigmoid(x)
        return x

    def predict(self, graph):
        self.eval()
        with torch.no_grad():
            logits = self.forward(graph)
        return np.array((logits > 0.5).int()).flatten()

In [17]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score

class AjaPyTorchWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def fit(self, train_graph_dict, validation_graph_dict, verbose=1, max_epochs=10):
        # Training logic using your PyTorch model
        # ...
                
        # Move the model and data to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(device)

        # Use DataLoader to create batches of data
        train_loader = DataLoader(list(train_graph_dict.values()), batch_size=1, shuffle=True)
        N_train = len(train_loader)
        validation_loader = DataLoader(list(validation_graph_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)

        if verbose > 0:
            print('Training on', N_train, 'graphs, validating on', N_validation, 'graphs')

        # Train the model
        model_name = "model_py_torch"
        best_f1_score = 0
        for epoch in range(max_epochs):
            if verbose > 0:
                print('- Epoch', f'{epoch + 1:03d}', '-')
            # training
            self.model.train()
            total_loss = 0
            for data in train_loader:
                data = data.to(device)
                self.optimizer.zero_grad()
                output = self.model(data).squeeze()
                loss = self.criterion(output, data.y.float())
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            average_loss = total_loss / N_train
            if verbose > 1:
                print('Loss:', f'{average_loss:.4f}')
            

            # Evaluate the model on the training set
            self.model.eval()
            f1_moyen_train = 0
            for data in train_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_train += f1
            f1_moyen_train /= N_train
            if verbose > 1:
                print('F1 train:', f1_moyen_train)

            # Evaluate the model on the validation set
            self.model.eval()
            f1_moyen_valid = 0
            for data in validation_loader:
                data = data.to(device)
                y_pred = self.model.predict(data)
                y_true = data.y.cpu().numpy()
                f1 = f1_score(y_true, y_pred)
                f1_moyen_valid += f1
            f1_moyen_valid /= N_validation
            if verbose > 1:
                print('F1 valid:', f1_moyen_valid)

            # callbacks ou autre
            if f1_moyen_valid > best_f1_score:
                if verbose > 1:
                    print('It\'s better !' )
                torch.save(self.model.state_dict(), "training_states/" + model_name + "-best.pth")
            else:
                self.optimizer.param_groups[0]['lr'] /= 2
                if verbose > 1:
                    print('Learning rate reduced to:', self.optimizer.param_groups[0]['lr'])
            if verbose > 1:
                print('')
        
        if verbose > 0:
            print('Training finished !')

        self.model.load_state_dict(torch.load("training_states/" + model_name + "-best.pth"))

    def predict(self, graphs_dict):
        # Prediction logic using your PyTorch model
        # ...
        self.model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        result = {}
        for key, graph in graphs_dict.items():
            data = graph.to(device)
            y_pred = self.model.predict(data)
            result[key] = y_pred
        return result


    def score(self, graphs_dict):
        # Scoring logic using your PyTorch model
        # ...
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        validation_loader = DataLoader(list(graphs_dict.values()), batch_size=1, shuffle=False)
        N_validation = len(validation_loader)
        self.model.eval()
        f1_moyen_valid = 0
        for data in validation_loader:
            data = data.to(device)
            y_pred = self.model.predict(data)
            y_true = data.y.cpu().numpy()
            f1 = f1_score(y_true, y_pred)
            f1_moyen_valid += f1
        f1_moyen_valid /= N_validation
        return f1_moyen_valid


In [18]:
from sklearn.metrics import f1_score

def f1_moyen(pred_dict, true_graphs_dict):
    f1_moyen = 0
    for key, pred in pred_dict.items():
        y_true = true_graphs_dict[key].y.numpy()
        f1_moyen += f1_score(y_true, pred)
    f1_moyen /= len(pred_dict)
    return f1_moyen

In [19]:
# Create an instance of your PyTorch model
pytorch_model = NodeClassifier(32, N_features)

# Define the loss function and optimizer
alpha = 0.2
gamma = 5
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor((1 - alpha) / alpha), reduction='mean')
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.01)

# Create an instance of the custom wrapper
model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)

# Fit, predict, and score using scikit-learn-like API
model.fit(train_graphs, validation_graphs, max_epochs=3,verbose=2)
y_pred = model.predict(test_graphs)
print(model.score(validation_graphs))

Training on 78 graphs, validating on 19 graphs
- Epoch 001 -
Loss: 0.9713
F1 train: 0.5661158314610242
F1 valid: 0.5912123177345279
It's better !

- Epoch 002 -
Loss: 0.9603
F1 train: 0.5634495313502981
F1 valid: 0.5974746753121714
It's better !

- Epoch 003 -
Loss: 0.9583
F1 train: 0.5823335544330736
F1 valid: 0.5823908371796195
It's better !

Training finished !
0.5823908371796195


In [20]:
import random

def get_bagging_models(n_bagging, train_graphs):
    models = []
    for i in range(n_bagging):
    
        print('Bagging', i+1)
        pytorch_model = NodeClassifier(32, N_features)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor((1 - alpha) / alpha), reduction='mean')
        optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.01)
        model = AjaPyTorchWrapper(pytorch_model, criterion, optimizer)
        keys = list(train_graphs.keys())
        bagging_train_graphs = {}
        bagging_validation_graphs = {}
        samples = random.choices(keys, k=len(keys))
        c_train = 0
        c_validation = 0
        for key in keys:
            if key in samples:
                bagging_train_graphs[c_train] = train_graphs[key]
                c_train += 1
            else:
                bagging_validation_graphs[c_validation] = train_graphs[key]
                c_validation += 1
        model.fit(bagging_train_graphs, bagging_validation_graphs, max_epochs=6, verbose=0)
        models.append(model)
        print('F1 score:', model.score(bagging_validation_graphs))
    return models

def predict_bagging(models, graphs_dict):
    result = {}
    for key, graph in graphs_dict.items():
        y_pred = 0
        for model in models:
            y_pred += model.predict({key: graph})[key]
        y_pred =  y_pred / len(models)
        y_pred = np.array((y_pred > 0.5).astype(int)).flatten()
        result[key] = y_pred
    return result



In [21]:
models = get_bagging_models(20, {**train_graphs, **validation_graphs})
test_prediction = predict_bagging(models, test_graphs)

Bagging 1
F1 score: 0.5418720827493562
Bagging 2
F1 score: 0.5760723471307322
Bagging 3
F1 score: 0.573934647793857
Bagging 4
F1 score: 0.5574754223449246
Bagging 5
F1 score: 0.5651889111887681
Bagging 6
F1 score: 0.563435467595367
Bagging 7
F1 score: 0.5657750517298114
Bagging 8
F1 score: 0.5923173802751973
Bagging 9
F1 score: 0.5563101020608077
Bagging 10
F1 score: 0.5628204850707317
Bagging 11
F1 score: 0.5779065429136968
Bagging 12
F1 score: 0.5857145705481172
Bagging 13
F1 score: 0.5490777230632956
Bagging 14
F1 score: 0.5885866699891155
Bagging 15
F1 score: 0.5824984894193863
Bagging 16
F1 score: 0.5720166011850949
Bagging 17
F1 score: 0.6004841394091989
Bagging 18
F1 score: 0.5772022298236318
Bagging 19
F1 score: 0.5764042544336382
Bagging 20
F1 score: 0.5908846427505574


In [22]:
aja.make_test_csv_submission_from_dict(test_prediction, 'bagging_has_exclamation')