In [2]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
from torch_geometric.data import Data
import torch.nn as nn
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.append('../AJA')
import AJA as aja

In [4]:
# récupération des données 
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

# feature extraction

from sklearn.preprocessing import StandardScaler

# node

scaler = StandardScaler()
# sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

df_train_nodes['nb_occurences'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_train_nodes['nb_occurences'] = scaler.fit_transform(df_train_nodes['nb_occurences'].values.reshape(-1, 1))
df_test_nodes['nb_occurences'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test_nodes['nb_occurences'] = scaler.transform(df_test_nodes['nb_occurences'].values.reshape(-1, 1))


df_train_nodes['nb_words_more_7'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_7'] = scaler.fit_transform(df_train_nodes['nb_words_more_7'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_7'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_7'] = scaler.transform(df_test_nodes['nb_words_more_7'].values.reshape(-1, 1))


# speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)

# TFIDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
df_train_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_train_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

tfidf_matrix = tfidf_vectorizer.fit_transform(df_test_nodes['text'])
df_test_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_test_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

df_train_nodes['tfidf_sum'] = scaler.fit_transform(df_train_nodes['tfidf_sum'].values.reshape(-1,1))
df_test_nodes['tfidf_sum'] = scaler.transform(df_test_nodes['tfidf_sum'].values.reshape(-1,1))

df_train_nodes['tfidf_max'] = scaler.fit_transform(df_train_nodes['tfidf_max'].values.reshape(-1,1))
df_test_nodes['tfidf_max'] = scaler.transform(df_test_nodes['tfidf_max'].values.reshape(-1,1))

# Numbers
df_train_nodes['has_number'] = df_train_nodes['text'].str.contains(r'\d').astype(int)
df_test_nodes['has_number'] = df_test_nodes['text'].str.contains(r'\d').astype(int)

# Scaling of bert
for i in range(384):  # Assurez-vous que la borne supérieure est correcte
    col_name = f'bert_{i}'
    df_train_nodes[col_name] = scaler.fit_transform(df_train_nodes[col_name].values.reshape(-1, 1))
    df_test_nodes[col_name] = scaler.transform(df_test_nodes[col_name].values.reshape(-1, 1))

# edge

new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)

In [5]:
# creation des graphs
train_graphs, test_graphs = aja.make_graphs(df_train_nodes, df_train_edges, df_test_nodes, df_test_edges)
N_features = train_graphs['ES2002a'].x.shape[1]
train_graphs, validation_graphs = aja.train_validation_split(train_graphs, 0.2)

In [6]:
class MultiChannelsGCN(torch.nn.Module):
    def __init__(self, channels, input_dim, post_conv_dim, output_dim, identity=False):
        super(MultiChannelsGCN, self).__init__()
        self.identity = identity
        self.channels = channels
        self.input_dim = input_dim
        self.post_conv_dim = post_conv_dim
        self.output_dim = output_dim
        self.GCN = nn.ModuleList([GCNConv(input_dim, post_conv_dim) for _ in range(channels)])
        if identity:
            self.dense = nn.Linear(post_conv_dim * (channels + 1), output_dim)
            self.denseID = nn.Linear(input_dim, post_conv_dim)
        else:
            self.dense = nn.Linear(post_conv_dim * channels, output_dim)

    def forward(self, nodes, edges):
        X = []
        for k in range(self.channels):
            if len(edges[k]) == 0:
                x = torch.zeros(nodes.shape[0], self.post_conv_dim)
            else:
                x = F.relu(self.GCN[k](nodes, edges[k]))
            X.append(x)
        if self.identity:
            X.append(F.relu(self.denseID(nodes)))
        concat = torch.cat(X, dim=1)
        return F.relu(self.dense(concat))

In [7]:
# on définie son plus beau modèle

class NodeClassifier(torch.nn.Module):
    def __init__(self, channels, input_dim):
        super(NodeClassifier, self).__init__()
        self.threshold = 0.5
        self.GCN1 = MultiChannelsGCN(channels, input_dim, 50, 20, identity=True) 
        self.dense1 = nn.Linear(20,1)

    def forward(self, data):
        nodes, edges = data.x, data.edge_index
        
        x = self.GCN1(nodes, edges)
        x = self.dense1(x)
        return x

    def set_threshold(self, t):
        self.threshold = t

    def predict(self, graph): #version sans threshold
        self.eval()
        with torch.no_grad():
            logits = self.forward(graph)
        return np.array((logits > 0.5).int()).flatten()

In [8]:
N_features = train_graphs['ES2002a'].x.shape[1]
N_features

394

In [9]:
# Move the instantiation of the model outside the training loop
model = NodeClassifier(32, N_features)
model_name='test'

# Move the model and data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Use DataLoader to create batches of data
train_loader = DataLoader(list(train_graphs.values()), batch_size=1, shuffle=True)
N_train = len(train_loader)

'''train_loader = DataLoader(list(train_graphs.values()) + list(validation_graphs.values()), batch_size=1, shuffle=True)
N_train = len(train_loader)'''

validation_loader = DataLoader(list(validation_graphs.values()), batch_size=1, shuffle=False)
N_validation = len(validation_loader)

# Define the loss function and optimizer
alpha = 0.2
gamma = 5
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor((1 - alpha) / alpha), reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

from sklearn.metrics import f1_score

# Train the model
best_f1_score = 0
for epoch in range(8):
    print('- Epoch', f'{epoch:03d}', '-')

    # training
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data).squeeze()
        loss = criterion(output, data.y.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / N_train
    print('Loss:', f'{average_loss:.4f}')    

    # Evaluate the model on the training set

    model.eval()
    f1_moyen_train = 0
    for data in train_loader:
        data = data.to(device)
        y_pred = model.predict(data)
        y_true = data.y.cpu().numpy()
        f1 = f1_score(y_true, y_pred)
        f1_moyen_train += f1
    f1_moyen_train /= N_train
    print('F1 train:', f1_moyen_train)

    # Evaluate the model on the validation set
    model.eval()
    f1_moyen_valid = 0
    for data in validation_loader:
        data = data.to(device)
        y_pred = model.predict(data)
        y_true = data.y.cpu().numpy()
        f1 = f1_score(y_true, y_pred)
        f1_moyen_valid += f1
    f1_moyen_valid /= N_validation
    print('F1 valid:', f1_moyen_valid)

    # callbacks ou autre
    if f1_moyen_valid > best_f1_score:
        torch.save(model.state_dict(), "training_states/" + model_name + "-best.pth")
    else:
        optimizer.param_groups[0]['lr'] /= 2
        print('Learning rate reduced to:', optimizer.param_groups[0]['lr'])
    print('')

model.load_state_dict(torch.load("training_states/" + model_name + "-best.pth"))

- Epoch 000 -
Loss: 0.9435
F1 train: 0.0
F1 valid: 0.0
Learning rate reduced to: 0.005

- Epoch 001 -
Loss: 0.7571
F1 train: 0.5609456675663822
F1 valid: 0.5600560804111948

- Epoch 002 -
Loss: 0.7179
F1 train: 0.5794802836829595
F1 valid: 0.5656761360426578

- Epoch 003 -
Loss: 0.6827
F1 train: 0.5968818132056519
F1 valid: 0.5593698555603713

- Epoch 004 -
Loss: 0.6566
F1 train: 0.6142832008986046
F1 valid: 0.5744332676163223

- Epoch 005 -
Loss: 0.6218
F1 train: 0.6213664038056581
F1 valid: 0.5747801651586235

- Epoch 006 -
Loss: 0.5962
F1 train: 0.6402600419288927
F1 valid: 0.5808028967030276

- Epoch 007 -
Loss: 0.5591
F1 train: 0.6612199797350832
F1 valid: 0.5673895419032122



<All keys matched successfully>

In [None]:
# quand on est content on fait une submission !
aja.make_test_csv_submission(model, test_graphs, 'has_number_part_data')