In [2]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

import sys
sys.path.append('../AJA')
import AJA as aja

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Data recovery
df_train_nodes, df_train_edges, df_test_nodes, df_test_edges = aja.get_data()

In [4]:
# Feature extraction

from sklearn.preprocessing import StandardScaler

# Node

scaler = StandardScaler()
# Sentence length normalized
df_train_nodes['sentence_length'] = df_train_nodes['text'].apply(lambda s: len(s.split()))
df_train_nodes['sentence_length'] = scaler.fit_transform(df_train_nodes['sentence_length'].values.reshape(-1, 1))
df_test_nodes['sentence_length'] = df_test_nodes['text'].apply(lambda s: len(s.split()))
df_test_nodes['sentence_length'] = scaler.transform(df_test_nodes['sentence_length'].values.reshape(-1, 1))

df_train_nodes['nb_occurences'] = df_train_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_train_nodes['nb_occurences'] = scaler.fit_transform(df_train_nodes['nb_occurences'].values.reshape(-1, 1))
df_test_nodes['nb_occurences'] = df_test_nodes['text'].apply(lambda x: sum(x.split().count(mot) for mot in ['uh', 'um', 'okay', '<', 'ah', 'oh']))
df_test_nodes['nb_occurences'] = scaler.transform(df_test_nodes['nb_occurences'].values.reshape(-1, 1))


df_train_nodes['nb_words_more_7'] = df_train_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_train_nodes['nb_words_more_7'] = scaler.fit_transform(df_train_nodes['nb_words_more_7'].values.reshape(-1, 1))
df_test_nodes['nb_words_more_7'] = df_test_nodes['text'].apply(lambda x: sum(len(mot) > 7 and mot.lower() != '<vocalsound>' for mot in x.split()))
df_test_nodes['nb_words_more_7'] = scaler.transform(df_test_nodes['nb_words_more_7'].values.reshape(-1, 1))


# Speaker hot-one encoding
one_hot_encoded = pd.get_dummies(df_train_nodes['speaker_int'], prefix='speaker', dtype=int)
df_train_nodes = df_train_nodes.drop('speaker_int', axis=1)
df_train_nodes = df_train_nodes.drop('speaker_text', axis=1)
df_train_nodes = pd.concat([df_train_nodes, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_test_nodes['speaker_int'], prefix='speaker', dtype=int)
df_test_nodes = df_test_nodes.drop('speaker_int', axis=1)
df_test_nodes = df_test_nodes.drop('speaker_text', axis=1)
df_test_nodes = pd.concat([df_test_nodes, one_hot_encoded], axis=1)

# TFIDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train_nodes['text'])
df_train_nodes['tfidf_sum'] = tfidf_matrix.sum(axis=1)
df_train_nodes['tfidf_max'] = tfidf_matrix.max(axis=1).toarray().flatten()

tfidf_matrix_test = tfidf_vectorizer.fit_transform(df_test_nodes['text'])
df_test_nodes['tfidf_sum'] = tfidf_matrix_test.sum(axis=1)
df_test_nodes['tfidf_max'] = tfidf_matrix_test.max(axis=1).toarray().flatten()

df_train_nodes['tfidf_sum'] = scaler.fit_transform(df_train_nodes['tfidf_sum'].values.reshape(-1,1))
df_test_nodes['tfidf_sum'] = scaler.transform(df_test_nodes['tfidf_sum'].values.reshape(-1,1))

df_train_nodes['tfidf_max'] = scaler.fit_transform(df_train_nodes['tfidf_max'].values.reshape(-1,1))
df_test_nodes['tfidf_max'] = scaler.transform(df_test_nodes['tfidf_max'].values.reshape(-1,1))

# Numbers
df_train_nodes['has_number'] = df_train_nodes['text'].str.contains(r'\d').astype(int)
df_test_nodes['has_number'] = df_test_nodes['text'].str.contains(r'\d').astype(int)

# Edge

new_df = pd.DataFrame({
        'transcription': df_train_edges['transcription'],
        'start': df_train_edges['end'],
        'end': df_train_edges['start'],
        'type_int': 16 + df_train_edges['type_int'],
        'type_text': df_train_edges['type_text'] + "_reverse"
    })
df_train_edges = pd.concat([df_train_edges, new_df], ignore_index=True)

new_df = pd.DataFrame({
        'transcription': df_test_edges['transcription'],
        'start': df_test_edges['end'],
        'end': df_test_edges['start'],
        'type_int': 16 + df_test_edges['type_int'],
        'type_text': df_test_edges['type_text'] + "_reverse"
    })
df_test_edges = pd.concat([df_test_edges, new_df], ignore_index=True)


In [5]:
df_train_nodes.head()

Unnamed: 0,transcription,line,text,label,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,...,sentence_length,nb_occurences,nb_words_more_7,speaker_0,speaker_1,speaker_2,speaker_3,tfidf_sum,tfidf_max,has_number
0,ES2002a,0,Okay,0,-0.057809,-0.085828,-0.03572,-0.011185,0.062363,-0.023545,...,-1.008131,-0.368253,-0.647917,1,0,0,0,-1.168929,1.439285,0
1,ES2002a,1,Right,0,-0.054862,0.047607,-0.032626,-0.010949,-0.035741,-0.051808,...,-1.008131,-0.368253,-0.647917,1,0,0,0,-1.168929,1.439285,0
2,ES2002a,2,<vocalsound> Um well this is the kick-off meet...,1,-0.054665,-0.073837,-0.017161,-0.064276,0.004937,0.062475,...,0.789302,-0.368253,0.456915,1,0,0,0,1.228067,-0.927421,0
3,ES2002a,3,Um <vocalsound> and um,0,-0.010416,-0.072719,-0.017206,-0.088992,-0.048035,0.051155,...,-0.558773,1.365643,-0.647917,1,0,0,0,-0.514793,0.709401,0
4,ES2002a,4,this is just what we're gonna be doing over th...,0,-0.028654,-0.015151,0.09591,-0.059113,0.042067,0.033088,...,1.088874,-0.368253,-0.647917,1,0,0,0,1.838146,-1.725435,0


In [6]:
X = df_train_nodes.drop(['transcription','text','label'],axis=1)
y = df_train_nodes[['label']]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
# Initialisation
model = KNeighborsClassifier(n_neighbors=9)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nResults :")
print("Accuracy: ", accuracy)
print("F1-score: ", f1)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

conf_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

print("\nConfusion matrix :")
print(conf_df)

  return self._fit(X, y)



Results :
Accuracy:  0.8353872633390705
F1-score:  0.43965315209749234

Confusion matrix :
          Predicted 0  Predicted 1
Actual 0        11196          691
Actual 1         1700          938


In [11]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Initialisation
model = RandomForestClassifier(n_estimators=5,criterion='entropy')

# Train the model
with tqdm(total=1, desc="Training", unit="iteration") as pbar:
    model.fit(X_train, y_train)
    pbar.update(1)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nResults :")
print("Accuracy: ", accuracy)
print("F1-score: ", f1)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

conf_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

print("\nConfusion matrix :")
print(conf_df)

  return fit_method(estimator, *args, **kwargs)
Training: 100%|██████████| 1/1 [00:15<00:00, 15.06s/iteration]


Results :
Accuracy:  0.8117728055077452
F1-score:  0.3993848857644991

Confusion matrix :
          Predicted 0  Predicted 1
Actual 0        10882         1005
Actual 1         1729          909





In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [49]:
# Initialisation
model = GradientBoostingClassifier(learning_rate=0.01)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nResults :")
print("Accuracy: ", accuracy)
print("F1-score: ", f1)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

conf_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

print("\nConfusion matrix :")
print(conf_df)

  y = column_or_1d(y, warn=True)



Results :
Accuracy:  0.818382099827883
F1-score:  0.0

Confusion matrix :
          Predicted 0  Predicted 1
Actual 0        11887            0
Actual 1         2638            0


In [33]:
from sklearn.svm import SVC

In [34]:
# Initialisation
model = SVC()

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nResults :")
print("Accuracy: ", accuracy)
print("F1-score: ", f1)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

conf_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

print("\nConfusion matrix :")
print(conf_df)

  y = column_or_1d(y, warn=True)



Results :
Accuracy:  0.8492254733218588
F1-score:  0.48783910196445274

Confusion matrix :
          Predicted 0  Predicted 1
Actual 0        11292          595
Actual 1         1595         1043
