In [1]:
!pip install numpy
!pip install matplotlib
!pip install torch
!pip install torch-geometric
!pip install tqdm
!pip install pandas
!pip install scikit-learn
!pip install seaborn



In [33]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import torch
import torch.nn as nn
import pandas as pd

import data_manipulation
import model_definition

In [34]:
# lets get all the available data into pandas dataframes
# a bert embedding is also done for each node ie each sentence

df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = data_manipulation.get_data()

In [35]:
# lets visualize the data
df_train_nodes.head()

Unnamed: 0,transcription,line,speaker_int,speaker_text,text,label,bert_0,bert_1,bert_2,bert_3,...,bert_374,bert_375,bert_376,bert_377,bert_378,bert_379,bert_380,bert_381,bert_382,bert_383
0,ES2002a,0,0,PM,Okay,0,-0.057809,-0.085828,-0.03572,-0.011185,...,0.018063,-0.033183,-0.004249,-0.026428,0.074381,0.010209,0.085386,-0.014607,0.058432,-0.009739
1,ES2002a,1,0,PM,Right,0,-0.054862,0.047607,-0.032626,-0.010949,...,0.092259,0.034839,-0.02149,0.007297,0.027587,0.027128,0.14595,0.037911,0.073511,0.079932
2,ES2002a,2,0,PM,<vocalsound> Um well this is the kick-off meet...,1,-0.054665,-0.073837,-0.017161,-0.064276,...,0.035382,0.098955,-0.025984,0.077994,0.00358,0.03226,0.022304,0.059096,-0.036019,-0.00882
3,ES2002a,3,0,PM,Um <vocalsound> and um,0,-0.010416,-0.072719,-0.017206,-0.088992,...,0.006533,0.032185,0.010955,0.041298,-0.018026,0.050856,0.007696,0.041694,0.077368,-0.037393
4,ES2002a,4,0,PM,this is just what we're gonna be doing over th...,0,-0.028654,-0.015151,0.09591,-0.059113,...,0.108833,0.061266,-0.011521,-0.010543,0.010692,0.11778,-0.017561,-0.028903,0.007401,-0.005552


In [36]:
# lets visualize the data
df_train_edges.head()

Unnamed: 0,transcription,start,end,type_int,type_text
0,ES2002a,0,1,5,Continuation
1,ES2002a,1,2,5,Continuation
2,ES2002a,2,3,11,Explanation
3,ES2002a,3,4,1,Elaboration
4,ES2002a,4,5,5,Continuation


In [37]:
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


# sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

# Number of words with more than 7 letters
df_train_nodes['nb_words_more_7'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_7'] = scaler.fit_transform(df_train_nodes['nb_words_more_7'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_7'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_7'] = scaler.transform(df_test_nodes['nb_words_more_7'].values.reshape(-1, 1))


# speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)


# TFIDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
df_train_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_train_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

tfidf_matrix_test = tfidf_vectorizer.fit_transform(df_test_nodes['text'])
df_test_nodes['tfidf_sum'] = tfidf_matrix_test.sum(axis=1)
df_test_nodes['tfidf_max'] = tfidf_matrix_test.max(axis=1).toarray().flatten()

# Normalization of TFIDF feature
df_train_nodes['tfidf_sum'] = scaler.fit_transform(df_train_nodes['tfidf_sum'].values.reshape(-1,1))
df_test_nodes['tfidf_sum'] = scaler.transform(df_test_nodes['tfidf_sum'].values.reshape(-1,1))

df_train_nodes['tfidf_max'] = scaler.fit_transform(df_train_nodes['tfidf_max'].values.reshape(-1,1))
df_test_nodes['tfidf_max'] = scaler.transform(df_test_nodes['tfidf_max'].values.reshape(-1,1))

# Scaling Bert
for i in range(384):
    col_name = f'bert_{i}'
    df_train_nodes[col_name] = scaler.fit_transform(df_train_nodes[col_name].values.reshape(-1, 1))
    df_test_nodes[col_name] = scaler.transform(df_test_nodes[col_name].values.reshape(-1, 1))

# onomatopoeias count
df_train_nodes['nb_onomatopoeias'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh', 'yeah']))
df_train_nodes['nb_onomatopoeias'] = scaler.fit_transform(df_train_nodes['nb_onomatopoeias'].values.reshape(-1, 1))
df_test_nodes['nb_onomatopoeias'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh', 'yeah']))
df_test_nodes['nb_onomatopoeias'] = scaler.transform(df_test_nodes['nb_onomatopoeias'].values.reshape(-1, 1))


# dropping text
df_train_nodes = df_train_nodes.drop('text', axis=1)
df_test_nodes = df_test_nodes.drop('text', axis=1)

# chanels 0-15 correspond to the original edges (direct direction for every link type)
# we add chanels 16-31 that correspond to the reverse edges (reverse direction for every link type)

new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)

  df_train_nodes['nb_onomatopoeias'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh', 'yeah']))
  df_test_nodes['nb_onomatopoeias'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh', 'yeah']))


In [38]:
# lets visualize extracted features
df_train_nodes.head()

Unnamed: 0,transcription,line,label,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,...,bert_383,sentence_length,nb_words_more_7,speaker_0,speaker_1,speaker_2,speaker_3,tfidf_sum,tfidf_max,nb_onomatopoeias
0,ES2002a,0,0,-0.322028,-1.528408,-0.774572,0.004107,0.759986,-0.338124,0.537238,...,-0.353896,-1.008131,-0.647917,1,0,0,0,-1.168929,1.439285,-0.415549
1,ES2002a,1,0,-0.268435,1.241315,-0.697893,0.0096,-0.996062,-0.923351,0.385835,...,1.772475,-1.008131,-0.647917,1,0,0,0,-1.168929,1.439285,-0.415549
2,ES2002a,2,1,-0.264855,-1.279508,-0.314608,-1.232834,-0.267942,1.443019,-1.093486,...,-0.33211,0.789302,0.456915,1,0,0,0,1.228067,-0.927421,-0.415549
3,ES2002a,3,0,0.539704,-1.256303,-0.315716,-1.80867,-1.216124,1.20864,-0.446154,...,-1.009652,-0.558773,-0.647917,1,0,0,0,-0.514793,0.709401,1.210442
4,ES2002a,4,0,0.208098,-0.061348,2.487742,-1.112549,0.396674,0.834525,-1.789326,...,-0.254613,1.088874,-0.647917,1,0,0,0,1.838146,-1.725435,-0.415549


In [39]:
# lets visualize extracted features
df_train_edges.head()

Unnamed: 0,transcription,start,end,type_int,type_text
0,ES2002a,0,1,5,Continuation
1,ES2002a,1,2,5,Continuation
2,ES2002a,2,3,11,Explanation
3,ES2002a,3,4,1,Elaboration
4,ES2002a,4,5,5,Continuation


In [40]:
# lets transform the dataframes into pytorch geometric data objects

# convert the dataframes into pytorch geometric data objects
train_graphs, test_graphs = data_manipulation.make_graphs(df_train_nodes, df_train_edges, df_test_nodes, df_test_edges)

N_features = train_graphs['ES2002a'].x.shape[1]
N_chanels = len(train_graphs['ES2002a'].edge_index)
print("Number of features per node: {}".format(N_features))
print("Number of channels: {}".format(N_chanels))

Number of features per node: 393
Number of channels: 32


In [41]:
# lets split the train data into train and validation
train_graphs, validation_graphs = data_manipulation.train_validation_split(train_graphs, 0.2)

In [42]:
print("number of train graphs: ", len(train_graphs))
print("number of validation graphs: ", len(validation_graphs))
print("number of test graphs: ", len(test_graphs))

number of train graphs:  78
number of validation graphs:  19
number of test graphs:  40


In [43]:
# training and validation

kappa = 6
lr = 0.01
nb_epochs = 3

pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]), reduction='mean')
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)

model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)


# training
model.fit(train_graphs, validation_graphs, max_epochs=nb_epochs, verbose=1)

# validation
validation_f1_score  = model.score(validation_graphs)
print("validation f1 score: ", validation_f1_score)

Training on 78 graphs, validating on 19 graphs
- Epoch 001 -


- Epoch 002 -
- Epoch 003 -
Training finished !
validation f1 score:  0.5229109113245345


In [44]:
# bagging model

bagging_models = []
for _ in range(3):
    kappa = 6
    lr = 0.01
    nb_epochs = 3
    pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]), reduction='mean')
    optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
    model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)
    bagging_models.append(model)

bagging_model = model_definition.BaggingModel(bagging_models)

# training
bagging_model.fit(train_graphs, epochs=3, verbose=1)

# validation
f1 = bagging_model.score(validation_graphs)
print('')
print("Validation f1 score with bagging: ", f1)


Model 1


F1 score (out of the bag): 0.5492438021018607
Model 2
F1 score (out of the bag): 0.5281157006553542
Model 3
F1 score (out of the bag): 0.5790647730552669

Validation f1 score with bagging:  0.5406856266361317


In [45]:
# training on all data and prediction

bagging_models = []
for _ in range(5):
    kappa = 6
    lr = 0.01
    nb_epochs = 3
    pytorch_model = model_definition.NodeClassifier(N_chanels, N_features)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([kappa]), reduction='mean')
    optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=lr)
    model = model_definition.ModelWrapper(pytorch_model, criterion, optimizer)
    bagging_models.append(model)

bagging_model_final = model_definition.BaggingModel(bagging_models)

# training on all data
bagging_model_final.fit({**train_graphs, **validation_graphs}, epochs=3, verbose=1)

# prediction
prediction = bagging_model_final.predict(test_graphs)

# submission
data_manipulation.make_test_csv_submission_from_dict(prediction, 'submission.csv')

Model 1
F1 score (out of the bag): 0.5973205013354136
Model 2
F1 score (out of the bag): 0.5923413731605103
Model 3
F1 score (out of the bag): 0.5795302272678795
Model 4
F1 score (out of the bag): 0.5324012691735934
Model 5
F1 score (out of the bag): 0.5132476099366469
