# Random Forest Model in order to compare with our best 

## import libraries

In [1]:
import json
from pathlib import Path
from jsonargparse import CLI
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Definition of data_set and vectorisation

In [2]:
def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("training")
path_to_test = Path("test")

#####
# training and test sets of transcription ids
#####
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

In [3]:
total_text = []

list_speaker = ['PM','ME','ID','UI']

with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

X_train_text = {}   #dic of the text for every discussion for the train
X_train_speaker = {}
y_train = {}

X_val_text = {}
X_val_speaker = {}
y_val = {}

#all the val set which represent 1/5 of training_set
val_set = ['ES2002a', 'ES2005b', 'ES2006c', 'ES2007d', 'ES2009a', 'ES2010b', 'ES2012c', 'ES2013d', 'ES2016a', 'IS1000b', 'IS1001c', 'IS1002d', 'IS1004a', 'IS1005b', 'IS1006c', 'IS1007d', 'TS3008a', 'TS3009b', 'TS3010c', 'TS3011d']
train_set = []  #the other 4/5 of the training_set

for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    X_train_text_transcription = []   #tab of the text for this discussion
    X_train_speaker_transcription = []   #tab of one_hot_encending of speaker for this discussion
    for utterance in transcription: 
        X_train_text_transcription.append(utterance["text"])
        total_text.append(utterance["text"])
        speaker_one_hot = [0,0,0,0]
        speaker_one_hot[list_speaker.index(utterance["speaker"])] = 1
        X_train_speaker_transcription.append(speaker_one_hot)
    if transcription_id in val_set:
        y_val[transcription_id] = training_labels[transcription_id]
        X_val_text[transcription_id] = X_train_text_transcription
        X_val_speaker[transcription_id] = X_train_speaker_transcription
    else:
        train_set.append(transcription_id)
        y_train[transcription_id] = training_labels[transcription_id]
        X_train_text[transcription_id] = X_train_text_transcription
        X_train_speaker[transcription_id] = X_train_speaker_transcription

X_test_text = {}
X_test_speaker = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)
    X_test_text_transcription = []
    X_test_speaker_transcription = []
    for utterance in transcription:
        X_test_text_transcription.append(utterance["text"])
        total_text.append(utterance["text"])
        speaker_one_hot = [0,0,0,0]
        speaker_one_hot[list_speaker.index(utterance["speaker"])] = 1
        X_test_speaker_transcription.append(speaker_one_hot)
    X_test_text[transcription_id] = X_test_text_transcription
    X_test_speaker[transcription_id] = X_test_speaker_transcription

In [4]:
X_train_text_before = {}
X_train_text_after = {}

list_description = ["Parallel", "Correction", "Q-Elab", "Conditional", "Alternation", "Narration", "Background","Continuation", "Explanation", "Elaboration" , "Acknowledgement", "Comment", "Result", "Question-answer_pair", "Contrast", "Clarification_question"]

X_train_description_before = {}
X_train_description_after = {}
for transcription_id in train_set:
    with open(path_to_training / f"{transcription_id}.txt", "r") as file:
        transcription = file.readlines()

    X_train_text_before_transcription = ["BEGGINING"]
    total_text.append("BEGGINING")

    X_train_text_after_transcription = [""] * (len(transcription) + 1)
    
    vector = [0] * 16
    X_train_description_after_transcription = [vector.copy() for i in range(len(transcription) + 1)]

    description_one_hot = [0] * (len(list_description) + 1)
    description_one_hot[0] = 1
    X_train_description_before_transcription = [description_one_hot]

    for line in transcription:
        tab = line.split()

        description_one_hot = [0] * (len(list_description) + 1)
        description_one_hot[list_description.index(tab[1]) + 1] = 1
        X_train_description_before_transcription.append(description_one_hot)

        X_train_description_after_transcription[int(tab[0])][list_description.index(tab[1])] += 1
        
        X_train_text_before_transcription.append(X_train_text[transcription_id][int(tab[0])])

        X_train_text_after_transcription[int(tab[0])] += X_train_text[transcription_id][int(tab[2])]

    X_train_text_before[transcription_id] = X_train_text_before_transcription
    X_train_text_after[transcription_id] = X_train_text_after_transcription
    X_train_description_before[transcription_id] = X_train_description_before_transcription
    X_train_description_after[transcription_id] = X_train_description_after_transcription


X_val_text_before = {}
X_val_text_after = {}
X_val_description_before = {}
X_val_description_after = {}
for transcription_id in val_set:
    with open(path_to_training / f"{transcription_id}.txt", "r") as file:
        transcription = file.readlines()

    X_val_text_before_transcription = ["BEGGINING"]
    total_text.append("BEGGINING")

    X_val_text_after_transcription = [""] * (len(transcription) + 1)

    vector = [0] * 16
    X_val_description_after_transcription = [vector.copy() for i in range(len(transcription) + 1)]

    description_one_hot = [0] * (len(list_description) + 1)
    description_one_hot[0] = 1
    X_val_description_before_transcription = [description_one_hot]

    for line in transcription:
        tab = line.split()

        description_one_hot = [0] * (len(list_description) + 1)
        description_one_hot[list_description.index(tab[1]) + 1] = 1
        X_val_description_before_transcription.append(description_one_hot)

        X_val_description_after_transcription[int(tab[0])][list_description.index(tab[1])] += 1
        
        X_val_text_before_transcription.append(X_val_text[transcription_id][int(tab[0])])

        X_val_text_after_transcription[int(tab[0])] += X_val_text[transcription_id][int(tab[2])]

    X_val_text_before[transcription_id] = X_val_text_before_transcription
    X_val_text_after[transcription_id] = X_val_text_after_transcription
    X_val_description_before[transcription_id] = X_val_description_before_transcription
    X_val_description_after[transcription_id] = X_val_description_after_transcription


X_test_text_before = {}
X_test_text_after = {}
X_test_description_before = {}
X_test_description_after = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.txt", "r") as file:
        transcription = file.readlines()

    X_test_text_before_transcription = ["BEGGINING"]
    total_text.append("BEGGINING")

    X_test_text_after_transcription = [""] * (len(transcription) + 1)

    t = [0] * 16
    X_test_description_after_transcription = [t.copy() for i in range(len(transcription) + 1)]

    description_one_hot = [0] * (len(list_description) + 1)
    description_one_hot[0] = 1
    X_test_description_before_transcription = [description_one_hot]

    for line in transcription:
        tab = line.split()

        description_one_hot = [0] * (len(list_description) + 1)
        description_one_hot[list_description.index(tab[1]) + 1] = 1
        X_test_description_before_transcription.append(description_one_hot)

        X_test_description_after_transcription[int(tab[0])][list_description.index(tab[1])] += 1

        X_test_text_before_transcription.append(X_test_text[transcription_id][int(tab[0])])

        X_test_text_after_transcription[int(tab[0])] += X_test_text[transcription_id][int(tab[2])]

    X_test_text_before[transcription_id] = X_test_text_before_transcription
    X_test_text_after[transcription_id] = X_test_text_after_transcription
    X_test_description_before[transcription_id] = X_test_description_before_transcription
    X_test_description_after[transcription_id] = X_test_description_after_transcription

print(X_train_text["TS3012d"][1])
print(X_train_speaker["TS3012d"][1])
print(X_train_text_before["TS3012d"][1])
print(X_train_text_after["TS3012d"][1])
print(X_train_description_before["TS3012d"][1])
print(X_train_description_before["TS3012d"][1])

Uh we don't have any changes ,
[0, 0, 0, 1]
Can I close this ?
do we ?
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [5]:
X_train_text_tab = []
X_train_speaker_tab = []
X_train_text_before_tab = []
X_train_text_after_tab = []
X_train_description_before_tab = []
X_train_description_after_tab = []
y_train_tab = []

for transcription_id in train_set:
    X_train_text_tab.extend(X_train_text[transcription_id])
    X_train_speaker_tab.extend(X_train_speaker[transcription_id])
    X_train_text_before_tab.extend(X_train_text_before[transcription_id])
    X_train_text_after_tab.extend(X_train_text_after[transcription_id])
    X_train_description_before_tab.extend(X_train_description_before[transcription_id])
    X_train_description_after_tab.extend(X_train_description_after[transcription_id])
    y_train_tab.extend(y_train[transcription_id])

X_val_text_tab = []
X_val_speaker_tab = []
X_val_text_before_tab = []
X_val_text_after_tab = []
X_val_description_before_tab = []
X_val_description_after_tab = []
y_val_tab = []

for transcription_id in val_set:
    X_val_text_tab.extend(X_val_text[transcription_id])
    X_val_speaker_tab.extend(X_val_speaker[transcription_id])
    X_val_text_before_tab.extend(X_val_text_before[transcription_id])
    X_val_text_after_tab.extend(X_val_text_after[transcription_id])
    X_val_description_before_tab.extend(X_val_description_before[transcription_id])
    X_val_description_after_tab.extend(X_val_description_after[transcription_id])
    y_val_tab.extend(y_val[transcription_id])

X_test_text_tab = []
X_test_speaker_tab = []
X_test_text_before_tab = []
X_test_text_after_tab = []
X_test_description_before_tab = []
X_test_description_after_tab = []

for transcription_id in test_set:
    X_test_text_tab.extend(X_test_text[transcription_id])
    X_test_speaker_tab.extend(X_test_speaker[transcription_id])
    X_test_text_before_tab.extend(X_test_text_before[transcription_id])
    X_test_text_after_tab.extend(X_test_text_after[transcription_id])
    X_test_description_before_tab.extend(X_test_description_before[transcription_id])
    X_test_description_after_tab.extend(X_test_description_after[transcription_id])

In [6]:
tokenizer = get_tokenizer("basic_english")

words=[]
num_words = 1500

total_text.append("First")

for text in total_text:
    tokens=tokenizer(text)
    words.extend(tokens)

top = dict(Counter(words).most_common(1500))
vocab = torchtext.vocab.vocab(top, specials = ['<unk>', '<pad>'])

vocab.set_default_index(vocab['<unk>'])

In [7]:
max_len=60

def vectorize_sentences(reviews, max_len):
    vectors=[]
    for text in reviews:
        tokens=tokenizer(text)
        v=vocab.forward(tokens)
        if len(v) > max_len : v = v[:max_len]
        if len(v) < max_len : #padding
            tmp = np.full(max_len, vocab['<pad>'])
            tmp[0:len(v)]=v 
            v = tmp
        vectors.append(np.array(v))
    return np.array(vectors)

In [8]:
X_tr_text_vector = vectorize_sentences(X_train_text_tab, max_len)
X_tr_speaker_vector = np.array(X_train_speaker_tab)
X_tr_text_before_vector = vectorize_sentences(X_train_text_before_tab, max_len)
X_tr_text_after_vector = vectorize_sentences(X_train_text_after_tab, max_len)
X_tr_description_before_vector = np.array(X_train_description_before_tab)
X_tr_description_after_vector = np.array(X_train_description_after_tab)
y_tr_vector = np.array(y_train_tab).reshape(-1,1)

X_va_text_vector = vectorize_sentences(X_val_text_tab, max_len)
X_va_speaker_vector = np.array(X_val_speaker_tab)
X_va_text_before_vector = vectorize_sentences(X_val_text_before_tab, max_len)
X_va_text_after_vector = vectorize_sentences(X_val_text_after_tab, max_len)
X_va_description_before_vector = np.array(X_val_description_before_tab)
X_va_description_after_vector = np.array(X_val_description_after_tab)
y_va_vector = np.array(y_val_tab).reshape(-1,1)

## Random forest Model

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [10]:
X_train_rf = np.concatenate((X_tr_text_vector, X_tr_speaker_vector,
                          X_tr_text_before_vector, X_tr_text_after_vector,
                          X_tr_description_before_vector,
                          X_tr_description_after_vector), axis=1)



# Initialisation du modèle de Random Forest avec la métrique F1-score
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Entraînement du modèle
model_rf.fit(X_train_rf, y_tr_vector)


y_pred_train = model_rf.predict(X_train_rf)
y_pred_train_class = (y_pred_train>0.3).astype(int)
f1 = f1_score(y_tr_vector, y_pred_train_class)

print(f"F1-score sur les données d'entraînement : {f1}")

  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

In [None]:
X_val_rf = np.concatenate((X_va_text_vector, X_va_speaker_vector,
                          X_va_text_before_vector, X_va_text_after_vector,
                          X_va_description_before_vector,
                          X_va_description_after_vector), axis=1)
y_pred_valid = model_rf.predict(X_val_rf)

y_pred_valid_class = (y_pred_valid>0.3).astype(int)
f1 = f1_score(y_va_vector, y_pred_valid_class)
print(f"F1-score sur les données de validation : {f1}")

F1-score sur les données d'entraînement : 0.5421530479896238


## SVM model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score


svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')  

svm_model.fit(X_train_rf, y_tr_vector)  

# Prédiction sur les données d'entraînement pour calculer le F1-score
y_pred_train = svm_model.predict(X_train_rf)
f1 = f1_score(y_tr_vector, y_pred_train)

print(f"F1-score sur les données d'entraînement : {f1}")


  y = column_or_1d(y, warn=True)


F1-score sur les données d'entraînement : 0.5252324037184595


In [None]:
y_pred_valid = svm_model.predict(X_val_rf)

f1 = f1_score(y_va_vector, y_pred_valid)
print(f"F1-score sur les données de validation : {f1}")

F1-score sur les données de validation : 0.23408879230052776


## XG-Boost model

In [None]:
from xgboost import XGBRegressor


xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)  


xgb_model.fit(X_train_rf, y_tr_vector.ravel())


y_pred_train = xgb_model.predict(X_train_rf)

y_pred_train_class = (y_pred_train>0.3).astype(int)
f1 = f1_score(y_tr_vector, y_pred_train_class)

print(f"F1-score sur les données d'entraînement : {f1}")


F1-score sur les données d'entraînement : 0.6453870707226089


In [None]:
y_pred_valid = xgb_model.predict(X_val_rf)

y_pred_valid_class = (y_pred_valid>0.3).astype(int)
f1 = f1_score(y_va_vector, y_pred_valid_class)
print(f"F1-score sur les données d'entraînement : {f1}")

F1-score sur les données d'entraînement : 0.5620813210780495
