In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from textwrap import wrap
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint name kept in one place so the tokenizer and the model can never
# be loaded from two different checkpoints by accident.
MODEL_NAME = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The model is only used for inference (embedding extraction), so put it in
# evaluation mode explicitly; .eval() returns the module itself.
model = AutoModel.from_pretrained(MODEL_NAME).eval()

Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
@torch.no_grad()
def encode_sentence_and_extract_position(sentence, position, tok=None, bert=None):
    """Return the final-layer embedding of the token(s) at ``position``.

    Parameters
    ----------
    sentence : str
        Text to encode. It is tokenised with ``tok.encode`` exactly as
        ``find_position_word`` does, so positions from that function line up.
    position : int or list/tuple
        Either a single token index, or ``[start, end]`` (end exclusive) for
        a multi-token word, in which case the sub-token embeddings are
        averaged.
    tok, bert : optional
        Tokenizer and model to use. They default to the notebook-level
        ``tokenizer`` and ``model``.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(1, hidden_size)``.

    Raises
    ------
    TypeError
        If ``position`` is neither an int nor a list/tuple.
    ValueError
        If the ``[start, end]`` span selects no tokens (the mean of an empty
        span would otherwise silently produce NaN).
    """
    tok = tokenizer if tok is None else tok
    bert = model if bert is None else bert

    input_ids = torch.tensor(tok.encode(sentence)).unsqueeze(0)
    # Only `last_hidden_state` is read, so no extra output flags are needed.
    # (`encoder_hidden_states` is a cross-attention *input*, not a flag.)
    output = bert(input_ids)
    # Drop only the batch axis; a bare squeeze() would also drop any other
    # axis that happens to have size 1.
    token_embeddings = output['last_hidden_state'].squeeze(0)

    if isinstance(position, int):
        return token_embeddings[position].unsqueeze(0)

    if isinstance(position, (list, tuple)):
        start, end = position[0], position[1]
        span = token_embeddings[start:end]
        if span.shape[0] == 0:
            raise ValueError(
                f"empty token span [{start}, {end}) for sentence {sentence!r}"
            )
        return span.mean(dim=0).unsqueeze(0)

    raise TypeError(
        f"position must be an int or a [start, end] pair, got {type(position).__name__}"
    )

def find_position_word(sentence, word, tok=None):
    """Locate ``word`` in the tokenised ``sentence``.

    The word's sub-tokens are searched as one *contiguous* run, so a piece
    such as ``##en`` that also occurs in an earlier word can no longer give
    an inverted or overlong span.

    Parameters
    ----------
    sentence : str
        Sentence to search; tokenised with ``tok.encode``.
    word : str
        Target word, tokenised the same way.
    tok : optional
        Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns
    -------
    int or list
        The token index when the word is a single token, otherwise
        ``[start, end]`` with ``end`` exclusive.

    Raises
    ------
    ValueError
        If the word's tokens do not occur in the sentence.
    """
    tok = tokenizer if tok is None else tok

    # encode() output is trimmed by one token on each side for the word: for
    # a BERT-style tokenizer those are the special tokens it adds.
    word_tokens = tok.convert_ids_to_tokens(tok.encode(word))[1:-1]
    sentence_tokens = tok.convert_ids_to_tokens(tok.encode(sentence))
    if not word_tokens:
        raise ValueError(f"{word!r} produced no tokens")

    width = len(word_tokens)
    matches = [
        start
        for start in range(len(sentence_tokens) - width + 1)
        if sentence_tokens[start:start + width] == word_tokens
    ]
    if not matches:
        raise ValueError(
            f"{word!r} (tokens {word_tokens}) not found in sentence {sentence!r}"
        )

    def _is_whole_word(start):
        # A following '##' piece means the match is only the prefix of a
        # longer word (assumes WordPiece-style continuation markers).
        nxt = start + width
        return nxt >= len(sentence_tokens) or not sentence_tokens[nxt].startswith('##')

    # Prefer a whole-word occurrence; otherwise keep the first match.
    start = next((s for s in matches if _is_whole_word(s)), matches[0])
    return start if width == 1 else [start, start + width]

def get_embeddings_from_sentences(word, sentences):
    """Embed ``word`` in every row of ``sentences``.

    ``sentences`` is a DataFrame with a ``'sentence'`` column. For each row
    the word is located with ``find_position_word`` and its embedding is
    extracted with ``encode_sentence_and_extract_position``.

    Returns a list with one embedding per row, in row order.
    """
    def _embed(text):
        # Locate the word first, then pull out the embedding at that position.
        return encode_sentence_and_extract_position(
            text, find_position_word(text, word)
        )

    return [_embed(row['sentence']) for _, row in sentences.iterrows()]

def extend_df_with_pca(df, matrix=None):
    """Return a copy of ``df`` with three PCA coordinates added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is not modified.
    matrix : array-like of shape (n_rows, n_features), optional
        Data to project, one row per row of ``df``. When omitted, the
        notebook-level ``matrix_np`` is used (the original behaviour). Pass
        it explicitly to avoid depending on which loading cell ran last.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with columns ``'x'``, ``'y'`` and ``'z'`` inserted at
        positions 1, 2 and 3.

    Raises
    ------
    ValueError
        If ``matrix`` and ``df`` do not have the same number of rows.
    """
    if matrix is None:
        # Legacy fallback: read the matrix from the notebook namespace.
        matrix = matrix_np

    if len(matrix) != len(df):
        raise ValueError(
            f"matrix has {len(matrix)} rows but df has {len(df)}; "
            "they must describe the same sentences"
        )

    components = PCA(n_components=3).fit_transform(matrix)

    df_new = df.copy()
    for offset, axis in enumerate('xyz'):
        df_new.insert(offset + 1, axis, components[:, offset])

    return df_new

# Load data

## Verbs

In [4]:
# 'dragen': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_dragen = pd.read_excel('data/data_dragen_net.xlsx')

embeddings = get_embeddings_from_sentences('dragen', df_dragen)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_dragen)

In [22]:
# 'broeden': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_broeden = pd.read_excel('data/data_broeden_net.xlsx')

embeddings = get_embeddings_from_sentences('broeden', df_broeden)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_broeden)

In [23]:
# 'vechten': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_vechten = pd.read_excel('data/data_vechten_net.xlsx')

embeddings = get_embeddings_from_sentences('vechten', df_vechten)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_vechten)

## Nouns

In [24]:
# 'breuk': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_breuk = pd.read_excel('data/data_breuk_net.xlsx')

embeddings = get_embeddings_from_sentences('breuk', df_breuk)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_breuk)

In [25]:
# 'golf': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_golf = pd.read_excel('data/data_golf_net.xlsx')

embeddings = get_embeddings_from_sentences('golf', df_golf)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_golf)



In [26]:
# 'weg': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_weg = pd.read_excel('data/data_weg_net.xlsx')

embeddings = get_embeddings_from_sentences('weg', df_weg)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_weg)

In [27]:
# 'hoofd': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_hoofd = pd.read_excel('data/data_hoofd_net.xlsx')

embeddings = get_embeddings_from_sentences('hoofd', df_hoofd)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_hoofd)

In [28]:
# 'mes': read the sentences, embed the target word in each one and add
# the PCA coordinates. `embeddings`, `emb_matrix`, `matrix_np` and `df_pca`
# are shared names that every loading cell overwrites.
df_mes = pd.read_excel('data/data_mes_net.xlsx')

embeddings = get_embeddings_from_sentences('mes', df_mes)
emb_matrix = torch.cat(embeddings)  # dim defaults to 0: one row per sentence

# extend_df_with_pca picks up matrix_np from the notebook namespace,
# so it has to be assigned before the call.
matrix_np = emb_matrix.detach().cpu().numpy()
df_pca = extend_df_with_pca(df_mes)

# Visualisation

In [None]:
#2D
# First two principal components of whichever word is currently in df_pca,
# coloured by the M/L column; hovering shows the sentence.
fig = px.scatter(
    df_pca,
    x='x',
    y='y',
    color='M/L',
    color_discrete_map=dict(M='red', L='blue'),
    hover_data='sentence',
)
fig.show()

In [None]:
#3D
# Same view as the 2D plot, with the third principal component on the z axis.
fig = px.scatter_3d(
    df_pca,
    x='x',
    y='y',
    z='z',
    color='M/L',
    color_discrete_map=dict(M='red', L='blue'),
    hover_data='sentence',
)
fig.show()

# Clustering

In [7]:
# Word under analysis. Every loading cell overwrites the shared `matrix_np`
# and `df_pca`, so after a top-to-bottom run they belong to the word that was
# loaded last, not to the frame selected here. Re-derive them for this word so
# that `df`, `matrix_np` and `df_pca` always describe the same sentences.
# To analyse another word, change the frame and the word together.
df = df_dragen.copy()  # copy: cluster columns added below stay out of df_dragen

embeddings = get_embeddings_from_sentences('dragen', df)
emb_matrix = torch.cat(embeddings, dim=0)

matrix_np = emb_matrix.cpu().detach().numpy()

df_pca = extend_df_with_pca(df)

## Two clusters

In [None]:
# Fail fast on hidden state: `matrix_np` is overwritten by every loading
# cell, so it may belong to a different word than `df`.
if len(df) != len(matrix_np):
    raise ValueError(
        f"df has {len(df)} rows but matrix_np has {len(matrix_np)}; "
        "re-run the loading cell for the word selected in df"
    )

# Try 50 seeds and keep the one whose k-means solution scores best.
best_cluster = None
best_score = -np.inf
best_random_state = None

for random_state in range(1, 51):
    kmeans = KMeans(n_clusters=2, n_init="auto", random_state=random_state)
    cluster = kmeans.fit(matrix_np)
    # KMeans.score is the negated k-means objective, so higher is better.
    score = cluster.score(matrix_np)

    if score > best_score:
        best_cluster = cluster
        best_score = score
        best_random_state = random_state

# One k-means model (best seed) plus the four agglomerative linkages.
kmeans = KMeans(n_clusters=2, n_init="auto", random_state=best_random_state)
aggl_ward = AgglomerativeClustering(n_clusters=2)
aggl_complete = AgglomerativeClustering(n_clusters=2, linkage="complete")
aggl_average = AgglomerativeClustering(n_clusters=2, linkage="average")
aggl_single = AgglomerativeClustering(n_clusters=2, linkage="single")


clusters = [kmeans, aggl_ward, aggl_complete, aggl_average, aggl_single]

for cluster in clusters:
    cluster = cluster.fit(matrix_np)
    # The estimator object itself is the column label; later cells look the
    # columns up with the same objects (df[cluster], symbol=kmeans).
    df[cluster] = cluster.labels_
    df_pca[cluster] = cluster.labels_
df

In [None]:
#visualisation
# Colour shows the M/L column, marker symbol shows the k-means cluster
# (the column keyed by the `kmeans` estimator object).
fig = px.scatter(
    df_pca,
    x='x',
    y='y',
    color='M/L',
    color_discrete_map=dict(M='red', L='blue'),
    symbol=kmeans,
    hover_data='sentence',
)
fig.show()

In [None]:
#evaluation
# Fold the two finer-grained annotations into 'M' before comparing the
# M/L column with the cluster assignments.
df['M/L'] = df['M/L'].replace(
    to_replace=['M (part of expression)', 'metonymy'], value='M'
)

# One [ARI, V-measure] pair per clustering, keyed by the estimator object.
scores = {}

for cluster in clusters:
    ARI = metrics.adjusted_rand_score(labels_true=df["M/L"], labels_pred=df[cluster])
    Vmeasure = metrics.v_measure_score(labels_true=df["M/L"], labels_pred=df[cluster])
    scores[cluster] = [ARI, Vmeasure]
df_scores = pd.DataFrame(data=scores, index=['ARI', 'Vmeasure'])

df_scores

## Dendrogram

In [12]:
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted agglomerative clustering model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_`` (one row per merge). If it
        also exposes ``distances_`` (e.g. fitted with
        ``compute_distances=True``), the real merge distances set the branch
        heights; otherwise the merge order is used as a stand-in, as before.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    children = model.children_
    n_merges = children.shape[0]
    n_samples = getattr(model, 'n_leaves_', n_merges + 1)

    # Number of original observations below each merge node. An index below
    # n_samples is a leaf; anything else refers to an earlier merge.
    counts = np.zeros(n_merges)
    for node, merge in enumerate(children):
        counts[node] = sum(
            1 if child < n_samples else counts[child - n_samples]
            for child in merge
        )

    distances = getattr(model, 'distances_', None)
    if distances is None:
        # No distances were computed at fit time: fall back to merge order,
        # which keeps the tree shape but not the true branch heights.
        distances = np.arange(n_merges)

    linkage_matrix = np.column_stack([children, distances, counts]).astype(float)
    return dendrogram(linkage_matrix, **kwargs)

In [13]:
# Model whose merge tree is drawn in the dendrogram below.
cluster = aggl_ward

In [14]:
# One leaf label per row, "<M/L value>: <sentence>", hard-wrapped at
# 100 characters so long sentences fit next to the dendrogram.
labels = [
    '\n'.join(wrap(text, width=100))
    for text in df['M/L'].values + ': ' + df['sentence'].values
]

In [None]:
# Tall figure: one horizontal leaf per sentence.
plt.figure(figsize=(8, 27))
# The word matches the frame selected for clustering (df = df_dragen); the
# linkage is read from the estimator so the title cannot go stale when
# `cluster` is switched to another model.
plt.title(f"Dendrogram 'dragen', linkage = {cluster.linkage}")

plot_dendrogram(cluster, labels=labels, orientation='left', leaf_font_size=10)

plt.show()


## Silhouette analysis

In [None]:
# Project the embeddings onto three principal components. The silhouette
# analysis clusters this projection, and the later k-means cell reuses
# `components`.
pca = PCA(n_components=3)
components = pca.fit_transform(matrix_np)

X = components

range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # Left panel: silhouette plot. Right panel: clusters on the first two PCs.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Silhouette coefficients lie in [-1, 1].
    ax1.set_xlim([-1, 1])

    # Room for every sample plus a 10-row gap between cluster bands.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # n_init="auto" matches every other KMeans call in this notebook and
    # avoids depending on the version-specific default.
    clusterer = KMeans(
        n_clusters=n_clusters, n_init="auto", random_state=best_random_state
    )
    cluster_labels = clusterer.fit(X).labels_

    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Boolean indexing returns a copy, so the in-place sort below does
        # not reorder sample_silhouette_values.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number at the vertical middle of its band.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band, after the 10-row gap.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])
    ax1.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Same colour scheme as the silhouette bands.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )

    centers = clusterer.cluster_centers_

    # White discs at the cluster centres, numbered in the loop below.
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("")
    ax2.set_ylabel("")

    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

plt.show()

## k-means with more than two clusters

In [None]:
# Four-cluster k-means on the PCA projection (`components` comes from the
# silhouette-analysis cell); fit() returns the estimator itself.
kmeans_n = KMeans(
    n_clusters=4, n_init="auto", random_state=best_random_state
).fit(components)

# The estimator object is used as the column label, as for the two-cluster models.
df[kmeans_n] = kmeans_n.labels_
df_pca[kmeans_n] = kmeans_n.labels_

df

In [None]:
# Colour shows the M/L column, marker symbol shows the four-cluster k-means
# assignment (the column keyed by the `kmeans_n` estimator object).
fig = px.scatter(
    df_pca,
    x='x',
    y='y',
    color='M/L',
    color_discrete_map=dict(M='red', L='blue'),
    symbol=kmeans_n,
    hover_data='sentence',
)
fig.show()

In [None]:
# Score the four-cluster k-means against the M/L column and add it to the
# score table built for the two-cluster models.
ARI = metrics.adjusted_rand_score(labels_true=df["M/L"], labels_pred=df[kmeans_n])
Vmeasure = metrics.v_measure_score(labels_true=df["M/L"], labels_pred=df[kmeans_n])
scores[kmeans_n] = [ARI, Vmeasure]
df_scores = pd.DataFrame(data=scores, index=['ARI', 'Vmeasure'])

df_scores