In [1]:
!pip install numpy
!pip install matplotlib
!pip install torch
!pip install torch-geometric
!pip install tqdm
!pip install pandas
!pip install scikit-learn
!pip install seaborn



In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
import torch.nn as nn
import pandas as pd

import data_manipulation
import model_definition

In [3]:
# Load all available data as pandas DataFrames: node and edge tables for both
# the train and the test split. Each node (one sentence) comes with its BERT
# embedding already computed (the bert_* columns).
(
    df_train_nodes,
    df_train_edges,
    df_test_nodes,
    df_test_edges,
) = data_manipulation.get_data()

In [4]:
# Peek at the first rows of the raw training node table.
df_train_nodes.head(n=5)

Unnamed: 0,transcription,line,speaker_int,speaker_text,text,label,bert_0,bert_1,bert_2,bert_3,...,bert_374,bert_375,bert_376,bert_377,bert_378,bert_379,bert_380,bert_381,bert_382,bert_383
0,ES2002a,0,0,PM,Okay,0,-0.057809,-0.085828,-0.03572,-0.011185,...,0.018063,-0.033183,-0.004249,-0.026428,0.074381,0.010209,0.085386,-0.014607,0.058432,-0.009739
1,ES2002a,1,0,PM,Right,0,-0.054862,0.047607,-0.032626,-0.010949,...,0.092259,0.034839,-0.02149,0.007297,0.027587,0.027128,0.14595,0.037911,0.073511,0.079932
2,ES2002a,2,0,PM,<vocalsound> Um well this is the kick-off meet...,1,-0.054665,-0.073837,-0.017161,-0.064276,...,0.035382,0.098955,-0.025984,0.077994,0.00358,0.03226,0.022304,0.059096,-0.036019,-0.00882
3,ES2002a,3,0,PM,Um <vocalsound> and um,0,-0.010416,-0.072719,-0.017206,-0.088992,...,0.006533,0.032185,0.010955,0.041298,-0.018026,0.050856,0.007696,0.041694,0.077368,-0.037393
4,ES2002a,4,0,PM,this is just what we're gonna be doing over th...,0,-0.028654,-0.015151,0.09591,-0.059113,...,0.108833,0.061266,-0.011521,-0.010543,0.010692,0.11778,-0.017561,-0.028903,0.007401,-0.005552


In [5]:
# Peek at the first rows of the raw training edge table.
df_train_edges.head(n=5)

Unnamed: 0,transcription,start,end,type_int,type_text
0,ES2002a,0,1,10,Continuation
1,ES2002a,1,2,10,Continuation
2,ES2002a,2,3,15,Explanation
3,ES2002a,3,4,2,Elaboration
4,ES2002a,4,5,10,Continuation


In [6]:
# Feature extraction on the node (sentence) tables, plus reverse-edge augmentation.
#
# Every statistic below is fitted on the TRAIN split only and then applied to the
# TEST split (scaler, TF-IDF vocabulary/IDF weights, one-hot column set), so that
# train and test features live in the same space.
#
# NOTE: this cell is not idempotent. It consumes the 'text', 'speaker_int' and
# 'speaker_text' columns, so re-running it without reloading the data raises a
# KeyError on 'text'.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Tokens counted by the 'nb_occurences' feature.
# NOTE(review): '<' only matches a standalone '<' token; tags such as
# '<vocalsound>' are single whitespace-delimited tokens and are NOT counted.
# Confirm this is intended.
FILLER_TOKENS = ['uh', 'um', 'okay', '<', 'ah', 'oh']

# Offset added to 'type_int' for reversed edges. Presumably the number of
# distinct relation types (the graphs built later report 32 channels) - TODO confirm.
REVERSE_TYPE_OFFSET = 16


def _count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


def _count_fillers(text):
    """Return how many tokens of `text` are exactly one of FILLER_TOKENS."""
    tokens = text.split()
    return sum(tokens.count(filler) for filler in FILLER_TOKENS)


def _count_long_words(text):
    """Return the number of tokens longer than 7 characters, ignoring '<vocalsound>'."""
    return sum(
        len(token) > 7 and token.lower() != '<vocalsound>'
        for token in text.split()
    )


def _fit_scale(train_values, test_values):
    """Standardise a single feature.

    The shared `scaler` is fitted on `train_values` and then applied to
    `test_values`. Both inputs may be 1-D or (n, 1) array-likes; both outputs
    are flat 1-D float arrays, returned as `(train_scaled, test_scaled)`.
    """
    train_column = np.asarray(train_values, dtype=float).reshape(-1, 1)
    test_column = np.asarray(test_values, dtype=float).reshape(-1, 1)
    train_scaled = scaler.fit_transform(train_column).ravel()
    test_scaled = scaler.transform(test_column).ravel()
    return train_scaled, test_scaled


def _with_reverse_edges(edges):
    """Return `edges` followed by a reversed copy of every edge.

    A reversed edge swaps 'start' and 'end', shifts 'type_int' by
    REVERSE_TYPE_OFFSET and appends '_reverse' to 'type_text'.
    """
    reversed_edges = pd.DataFrame({
        'transcription': edges['transcription'],
        'start': edges['end'],
        'end': edges['start'],
        'type_int': REVERSE_TYPE_OFFSET + edges['type_int'],
        'type_text': edges['type_text'] + "_reverse"
    })
    return pd.concat([edges, reversed_edges], ignore_index=True)


# --- Normalised count features derived from the raw text ---------------------
for column, counter in (
    ('sentence_length', _count_words),
    ('nb_occurences', _count_fillers),
    ('nb_words_more_7', _count_long_words),
):
    df_train_nodes[column], df_test_nodes[column] = _fit_scale(
        df_train_nodes['text'].apply(counter),
        df_test_nodes['text'].apply(counter),
    )


# --- Speaker one-hot encoding -------------------------------------------------
speakers_train = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
speakers_test = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
# Force the test split to expose exactly the train columns, in the same order:
# a speaker absent from the test split becomes an all-zero column, and a speaker
# never seen in training is dropped, so the feature width always matches the model.
speakers_test = speakers_test.reindex(columns=speakers_train.columns, fill_value=0)

df_train_nodes = pd.concat(
    [df_train_nodes.drop(columns=['speaker_int', 'speaker_text']), speakers_train],
    axis=1,
)
df_test_nodes = pd.concat(
    [df_test_nodes.drop(columns=['speaker_int', 'speaker_text']), speakers_test],
    axis=1,
)


# --- TF-IDF summary features --------------------------------------------------
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
# Only *transform* the test text: re-fitting here would build a different
# vocabulary and different IDF weights, making test features incomparable
# with the ones the model is trained on.
tfidf_matrix_test = tfidf_vectorizer.transform(df_test_nodes['text'])

# Per-sentence sum and max of the TF-IDF weights, standardised on train.
df_train_nodes['tfidf_sum'], df_test_nodes['tfidf_sum'] = _fit_scale(
    tfidf_matrix.sum(axis=1),
    tfidf_matrix_test.sum(axis=1),
)
df_train_nodes['tfidf_max'], df_test_nodes['tfidf_max'] = _fit_scale(
    tfidf_matrix.max(axis=1).toarray(),
    tfidf_matrix_test.max(axis=1).toarray(),
)


# --- The raw text is no longer needed once the features are extracted ---------
df_train_nodes = df_train_nodes.drop(columns='text')
df_test_nodes = df_test_nodes.drop(columns='text')


# --- Add a reversed copy of every edge, tagged with its own relation type -----
df_train_edges = _with_reverse_edges(df_train_edges)
df_test_edges = _with_reverse_edges(df_test_edges)

In [7]:
# Peek at the training node table after feature extraction.
df_train_nodes.head(n=5)

Unnamed: 0,transcription,line,label,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,...,bert_383,sentence_length,nb_occurences,nb_words_more_5,speaker_0,speaker_1,speaker_2,speaker_3,tfidf_sum,tfidf_max
0,ES2002a,0,0,-0.057809,-0.085828,-0.03572,-0.011185,0.062363,-0.023545,0.061487,...,-0.009739,-1.008131,-0.368253,-0.795267,1,0,0,0,-1.168929,1.439285
1,ES2002a,1,0,-0.054862,0.047607,-0.032626,-0.010949,-0.035741,-0.051808,0.052922,...,0.079932,-1.008131,-0.368253,-0.795267,1,0,0,0,-1.168929,1.439285
2,ES2002a,2,1,-0.054665,-0.073837,-0.017161,-0.064276,0.004937,0.062475,-0.030765,...,-0.00882,0.789302,-0.368253,0.930093,1,0,0,0,1.228067,-0.927421
3,ES2002a,3,0,-0.010416,-0.072719,-0.017206,-0.088992,-0.048035,0.051155,0.005855,...,-0.037393,-0.558773,1.365643,-0.795267,1,0,0,0,-0.514793,0.709401
4,ES2002a,4,0,-0.028654,-0.015151,0.09591,-0.059113,0.042067,0.033088,-0.07013,...,-0.005552,1.088874,-0.368253,0.354973,1,0,0,0,1.838146,-1.725435


In [8]:
# Peek at the training edge table after the reverse edges were appended
# (the reversed copies sit after the original rows, so they are not in the head).
df_train_edges.head(n=5)

Unnamed: 0,transcription,start,end,type_int,type_text
0,ES2002a,0,1,10,Continuation
1,ES2002a,1,2,10,Continuation
2,ES2002a,2,3,15,Explanation
3,ES2002a,3,4,2,Elaboration
4,ES2002a,4,5,10,Continuation


In [9]:
# Turn the node/edge DataFrames into PyTorch Geometric data objects, then read
# the model input sizes off one reference graph.
train_graphs, test_graphs = data_manipulation.make_graphs(
    df_train_nodes,
    df_train_edges,
    df_test_nodes,
    df_test_edges,
)

# The sizes are taken from the 'ES2002a' graph.
reference_graph = train_graphs['ES2002a']
N_features = reference_graph.x.shape[1]
N_chanels = len(reference_graph.edge_index)

print("Number of features per node: {}".format(N_features))
print("Number of channels: {}".format(N_chanels))

Number of features per node: 393
Number of channels: 32


In [10]:
# Split the training graphs into a train set and a validation set; 0.2 is the
# split argument passed to the helper (presumably the validation fraction - TODO confirm).
# NOTE: `train_graphs` is overwritten with the reduced train set, so re-running
# this cell without rebuilding the graphs splits the already reduced set again.
train_graphs, validation_graphs = data_manipulation.train_validation_split(train_graphs, 0.2)

In [11]:
# Report how many graphs ended up in each split.
n_train, n_validation, n_test = map(len, (train_graphs, validation_graphs, test_graphs))

print("number of train graphs: ", n_train)
print("number of validation graphs: ", n_validation)
print("number of test graphs: ", n_test)

number of train graphs:  78
number of validation graphs:  19
number of test graphs:  40


In [12]:
# Load the autoreload extension
%load_ext autoreload

# Configure autoreload to reload all modules before each cell is executed
%autoreload 2

In [13]:
# Train a single model on the train graphs and report its F1 score on the
# validation graphs.

SEED = 42       # fixed seed so the reported score can be reproduced
kappa = 6       # weight applied to the positive class in the loss
lr = 0.01       # Adam learning rate
nb_epochs = 3   # maximum number of training epochs

# Seed torch's global RNG before the model is built so that runs are repeatable.
# NOTE(review): only torch is seeded here; if the helper modules draw from
# `random` or numpy, those generators need seeding as well - TODO confirm.
torch.manual_seed(SEED)

pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
# pos_weight is given an explicit float dtype instead of relying on implicit
# int -> float promotion inside the loss.
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor([kappa], dtype=torch.float32),
    reduction='mean',
)
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)

model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)

model.fit(train_graphs, validation_graphs, max_epochs=nb_epochs, verbose=1)
validation_f1_score = model.score(validation_graphs)

print("validation f1 score: ", validation_f1_score)

Training on 78 graphs, validating on 19 graphs
- Epoch 001 -


- Epoch 002 -
- Epoch 003 -
Training finished !
validation f1 score:  0.5466082449367847


In [14]:
# Bagging: build several identically configured models, fit them together
# through BaggingModel and score the ensemble on the validation graphs.

# Hyper-parameters are set once, outside the loop, and `nb_epochs` is the value
# actually passed to `fit` (it used to be assigned but ignored in favour of a
# hard-coded 3).
n_models = 3
kappa = 6       # weight applied to the positive class in the loss
lr = 0.01       # Adam learning rate
nb_epochs = 3   # training epochs per ensemble member

bagging_models = []
for _ in range(n_models):
    # Every member gets its own network, loss and optimizer.
    pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]), reduction='mean')
    optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
    model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)
    bagging_models.append(model)

bagging_model = model_definition.BaggingModel(bagging_models)
bagging_model.fit(train_graphs, epochs=nb_epochs, verbose=1)
f1 = bagging_model.score(validation_graphs)
print('')
print("Validation f1 score with bagging: ", f1)


Model 1
F1 score (out of the bag): 0.5700533298752926
Model 2
F1 score (out of the bag): 0.5756144272735946
Model 3
F1 score (out of the bag): 0.5451594903284138

Validation f1 score with bagging:  0.5638027121088327


In [15]:
# Final model: train a bagging ensemble on all labelled graphs (train and
# validation merged), predict the test graphs and write the submission file.

# Hyper-parameters are set once, outside the loop, and `nb_epochs` is the value
# actually passed to `fit` (it used to be assigned but ignored in favour of a
# hard-coded 3).
n_models = 5
kappa = 6       # weight applied to the positive class in the loss
lr = 0.01       # Adam learning rate
nb_epochs = 3   # training epochs per ensemble member

bagging_models = []
for _ in range(n_models):
    # Every member gets its own network, loss and optimizer.
    pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]), reduction='mean')
    optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
    model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)
    bagging_models.append(model)

bagging_model_final = model_definition.BaggingModel(bagging_models)
# Merge the two mappings so that no labelled graph is held out for the final fit.
all_labelled_graphs = {**train_graphs, **validation_graphs}
bagging_model_final.fit(all_labelled_graphs, epochs=nb_epochs, verbose=1)
prediction = bagging_model_final.predict(test_graphs)
data_manipulation.make_test_csv_submission_from_dict(prediction, 'submission.csv')

Model 1


F1 score (out of the bag): 0.5301394144195753
Model 2
F1 score (out of the bag): 0.5830411822607184
Model 3
F1 score (out of the bag): 0.5593591361814546
Model 4
F1 score (out of the bag): 0.6013722407297538
Model 5
F1 score (out of the bag): 0.5885367683899632
