# Self Order Score

Algoritmo em quatro passos: (1) ideias vizinhas da mesma categoria são agrupadas -- a posição do grupo é a posição do seu primeiro elemento; (2) os grupos são ordenados conforme a sua posição no texto; (3) os grupos são agregados por categoria (cada grupo é agregado na primeira posição em que aparece) e conta-se quantas trocas foram necessárias para cada agregação; (4) a soma de todas as trocas é o índice `self order score`.


### Associações

* pathophysiology: 1
* epidemiology: 2
* etiology: 3
* history: 4
* physical: 5
* exams: 6
* differential: 7
* therapeutic: 8

### Dicionários e funções auxiliares
* `categories`: labels de categorias (`category`) para ids (`category_id`)
* `inverted_categories`: ids de categorias (`category_id`) para labels (`category`)
* `groups_to_text`: representação de vetor `[[category_id, quantity],...]` para string `'category: quantity; ...'`
  representação de vetor usa índice de categoria (`category_id`) e string usa label de categoria (`category`)

In [None]:
import annotation_metrics as am

In [None]:
# Sample annotations keyed by category label. Each entry has three elements;
# presumably [label, position, quantity] — confirm against annotation_metrics.
cats = [['epidemiology',65,1], ['etiology',65,1],['epidemiology',89,1], ['etiology',89,1],['history',35,1],['history',35,1]]

# Labels -> ids, printed so the mapping can be compared with the table above.
cats_inv = am.categories_labels_to_ids(cats)
print(cats_inv)

# Ids -> labels on the converted data; expected to mirror `cats` if the two
# helpers are inverses of each other (not verified here).
cats_r = am.categories_ids_to_labels(cats_inv)
print(cats_r)

# Text rendering of the id-based representation.
print(am.categories_ids_to_text(cats_inv))

## Agrupamento de labels adjacentes

### `self_order_groups()`

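* input: id de categorias e posições - `[[category_id, position], ...]`; os exemplos abaixo também passam entradas com um terceiro elemento, `[category_id, position, quantity]`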
* output: id de categorias e quantidade de itens agrupados - `[[category_id, quantity], ...]`

Categorias (`category_id`) são representadas por índices numéricos.

In [None]:
# Sample annotations as [label, position] pairs (two-element form of the input).
cats = [['epidemiology',65], ['etiology',65],['epidemiology',89], ['etiology',89],['history',35],['history',35]]

# Labels -> ids, since self_order_groups is documented above as taking category ids.
cats_inv = am.categories_labels_to_ids(cats)
print(cats_inv)

# Group the id-based entries; the markdown above documents the result as
# [[category_id, quantity], ...].
cats_groups = am.self_order_groups(cats_inv)
print(cats_groups)

# Same groups with ids converted back to labels, for readability.
cats_r = am.categories_ids_to_labels(cats_groups)
print(cats_r)

# Text rendering of the grouped result.
print(am.categories_ids_to_text(cats_groups))

In [None]:
# Sample inputs for am.self_order_groups, in both the two-element and the
# three-element entry forms; one result is printed per sample, in order.
_group_samples = (
    [[1, 10], [2, 20], [1, 20], [2, 30]],
    [[1, 10, 2], [2, 20, 3], [1, 25, 5], [2, 30, 1]],
    [[1, 10], [2, 20], [1, 20], [3, 20], [1, 30], [2, 30], [3, 30]],
    [[2, 71, 2], [2, 96, 5], [3, 98, 3], [2, 100, 8], [5, 120, 2], [5, 130, 3], [5, 135, 4], [3, 140, 1], [5, 180, 2]],
    [[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]],
    [[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]],
    [[2, 65, 1], [3, 65, 1], [2, 89, 1], [3, 89, 1], [4, 35, 1], [4, 35, 1]],
)
for _sample in _group_samples:
    print(am.self_order_groups(_sample))

In [None]:
# Each sample is grouped with am.self_order_groups and the grouped result is
# scored with am.self_order_score; one score is printed per sample, in order.
_score_samples = (
    [[1, 10], [2, 20], [1, 20], [2, 30]],
    [[1, 10, 2], [2, 20, 3], [1, 25, 5], [2, 30, 1]],
    [[1, 10], [2, 20], [1, 20], [3, 20], [1, 30], [2, 30], [3, 30]],
    [[2, 71, 2], [2, 96, 5], [3, 98, 3], [2, 100, 8], [5, 120, 2], [5, 130, 3], [5, 135, 4], [3, 140, 1], [5, 180, 2]],
    [[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]],
    [[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]],
    [[2, 65, 1], [3, 65, 1], [2, 89, 1], [3, 89, 1], [4, 35, 1], [4, 35, 1]],
)
for _sample in _score_samples:
    _grouped = am.self_order_groups(_sample)
    print(am.self_order_score(_grouped))

In [None]:
# Same samples as the raw-score cell, scored with
# am.normalized_self_order_score after grouping; one value printed per sample.
_normalized_samples = (
    [[1, 10], [2, 20], [1, 20], [2, 30]],
    [[1, 10, 2], [2, 20, 3], [1, 25, 5], [2, 30, 1]],
    [[1, 10], [2, 20], [1, 20], [3, 20], [1, 30], [2, 30], [3, 30]],
    [[2, 71, 2], [2, 96, 5], [3, 98, 3], [2, 100, 8], [5, 120, 2], [5, 130, 3], [5, 135, 4], [3, 140, 1], [5, 180, 2]],
    [[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]],
    [[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]],
    [[2, 65, 1], [3, 65, 1], [2, 89, 1], [3, 89, 1], [4, 35, 1], [4, 35, 1]],
)
for _sample in _normalized_samples:
    _grouped = am.self_order_groups(_sample)
    print(am.normalized_self_order_score(_grouped))

# Clustering in Free Recall

Algoritmo de category clustering em free recall conforme descrito em https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3665324/

In [None]:
# Sample sequences for am.clustering_free_recall; entries are two-element
# lists (presumably [category_id, position] — confirm against the module).
# One result is printed per sample, in order.
_recall_samples = (
    [[2, 1], [4, 2], [4, 3], [3, 4], [2, 5], [3, 6], [1, 7], [4, 8], [4, 9]],
    [[3, 1], [4, 2], [4, 3], [3, 4], [1, 5], [1, 6], [3, 7], [1, 8], [1, 9], [2, 10], [2, 11], [2, 12], [4, 13], [4, 14], [3, 15]],
    [[2, 1], [2, 2], [3, 3], [1, 4], [1, 5], [1, 6], [1, 7], [2, 8], [3, 9], [3, 10], [2, 11], [1, 12], [4, 13], [4, 14], [4, 15], [4, 16], [2, 17], [2, 18], [3, 19], [1, 20]],
    [[5, 135], [2, 100], [2, 71], [2, 96], [5, 130], [3, 98], [5, 180], [5, 120], [3, 140]],
    [[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [5, 130], [5, 135], [3, 140], [5, 180]],
    [[2, 71], [2, 96], [3, 98], [2, 98], [5, 98], [7, 98], [5, 130], [5, 135], [3, 140], [5, 180]],
    [[2, 1], [5, 2], [2, 3], [3, 4], [1, 5], [5, 6]],
    [[1, 1], [1, 2], [1, 3], [2, 4]],
    [[7, 1], [3, 2], [7, 3], [3, 4], [2, 5], [2, 6]],
    [[1, 1], [1, 2], [5, 3], [2, 4], [8, 5], [1, 6], [1, 7]],
    [[5, 1], [5, 2], [5, 3]],
)
for _sample in _recall_samples:
    print(am.clustering_free_recall(_sample))

# Annotation Metrics

## `annotation_metrics()`

### Inputs:

#### Annotation Categories Ordered file (`categories_order_csv`)
  * `annotation id`: unique identifier for the annotation
  * `categories ordered`: `category:position/quantity; ...`

#### Sentence Scores file (`sentence_scores_csv`)
* `annotation id`: unique identifier for the annotation
* `objective test score`: score of the objective test (progression test)
* `organization level`: level of the organization attributed by the annotator
* `global score`: global score attributed by the annotator

### Outputs:

#### Clustered Annotations file (`annotation_metrics_csv`)

#### Statistics of Clusters file (`annotation_stats_csv`)

In [None]:
import annotation_metrics as am

# File paths passed positionally to am.annotation_metrics, in call order.
# Going by the file names these look like two inputs (groups, scores) followed
# by three outputs (metrics, summary, stats) — confirm against the signature.
_metrics_paths = (
  'annotations-dpoc-medical_specialist_groups.csv',
  'annotations-dpoc-medical_specialist_scores.csv',
  'annotations-dpoc-medical_specialist_metrics.csv',
  'annotations-dpoc-medical_specialist_summary.csv',
  'annotations-dpoc-medical_specialist_stats.csv',
)
am.annotation_metrics(*_metrics_paths)

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import annotation_metrics as am


In [None]:

# Load the metrics CSV. The file name matches the third path passed to
# am.annotation_metrics earlier in this notebook (presumably its output).
annotations = pd.read_csv('annotations-dpoc-medical_specialist_metrics.csv')

# Encode the 'self order grouped' column with a sentence-embedding model.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
self_order_sentences = annotations['self order grouped'].tolist()
embeddings = model.encode(self_order_sentences)

# Reduce dimensionality with t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(embeddings)

# Reduce dimensionality with PCA
# NOTE(review): PCA is created without random_state; if a randomized solver is
# selected the projection may vary between runs — confirm whether that matters.
pca = PCA(n_components=2)
pca_results = pca.fit_transform(embeddings)

# Second feature space: the columns named by am.categories_labels (presumably
# one column per category — confirm). The same `tsne` and `pca` objects are
# re-fit on it, so after this cell they hold the category-space fit.
category_dimensions = annotations[am.categories_labels].values
tsne_results_cat = tsne.fit_transform(category_dimensions)

pca_results_cat = pca.fit_transform(category_dimensions)

## Comparison of Llama against the medical specialist's clusters

In [None]:
# Load the two metrics files being compared: medical specialist ("fer") and Llama.
fer_annotations = pd.read_csv('../teste-progresso/06-ml-comparison/medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv')
llama_annotations = pd.read_csv('../teste-progresso/06-ml-comparison/llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv')

# NOTE(review): `model` is loaded here but not used anywhere in this cell.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Feature matrices built from the columns named by am.categories_labels.
fer_category_dimensions = fer_annotations[am.categories_labels].values
llama_category_dimensions = llama_annotations[am.categories_labels].values

# `tsne` and `pca` are not created in this cell; they must already exist from
# an earlier cell. Each source is projected by its own fit_transform call, so
# the specialist and Llama 2-D coordinates come from independent fits.
fer_tsne_results_cat = tsne.fit_transform(fer_category_dimensions)
llama_tsne_results_cat = tsne.fit_transform(llama_category_dimensions)

fer_pca_results_cat = pca.fit_transform(fer_category_dimensions)
llama_pca_results_cat = pca.fit_transform(llama_category_dimensions)

## Utilizing distiluse-base-multilingual-cased-v2
Comparison of Llama against the medical specialist's clusters

In [None]:
# Reload the two metrics files being compared: medical specialist ("fer") and Llama.
fer_annotations = pd.read_csv('../teste-progresso/06-ml-comparison/medical_specialist/aligned_annotations-dpoc-medical_specialist_metrics_435.csv')
llama_annotations = pd.read_csv('../teste-progresso/06-ml-comparison/llama/annotations-dpoc-llm_10_tf_idf_custom_shot_metrics.csv')

# This cell switches to the v2 multilingual model and replaces the global `model`.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

# Sentences to embed: the 'self order grouped' column of each source.
fer_self_order_sentences = fer_annotations['self order grouped'].tolist()
llama_self_order_sentences = llama_annotations['self order grouped'].tolist()

fer_embeddings = model.encode(fer_self_order_sentences)
llama_embeddings = model.encode(llama_self_order_sentences)

# Reduce dimensionality with t-SNE
# Each source gets its own fit_transform call, i.e. an independent projection.
tsne = TSNE(n_components=2, random_state=42)
fer_tsne_results = tsne.fit_transform(fer_embeddings)
llama_tsne_results = tsne.fit_transform(llama_embeddings)

# Reduce dimensionality with PCA
# As with t-SNE, the two sources are fit separately.
pca = PCA(n_components=2)
fer_pca_results = pca.fit_transform(fer_embeddings)
llama_pca_results = pca.fit_transform(llama_embeddings)

# Disabled: category-dimension projections. As written they would fit both
# sources on the single-source `category_dimensions` matrix.
# category_dimensions = annotations[am.categories_labels].values

# fer_tsne_results_cat = tsne.fit_transform(category_dimensions)
# llama_tsne_results_cat = tsne.fit_transform(category_dimensions)

# fer_pca_results_cat = pca.fit_transform(category_dimensions)
# llama_pca_results_cat = pca.fit_transform(category_dimensions)

In [None]:
# One figure per projection (t-SNE, then PCA). Each figure overlays the
# specialist points ('^') and the Llama points ('x'), both coloured by the
# 'cluster gmm' column of their own frame. The Llama points are shifted by
# +0.1 on both axes before plotting.
for _fer_xy, _llama_xy, _x_label, _y_label in (
    (fer_tsne_results, llama_tsne_results, 't-SNE 1', 't-SNE 2'),
    (fer_pca_results, llama_pca_results, 'PCA 1', 'PCA 2'),
):
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(
        _fer_xy[:, 0], _fer_xy[:, 1],
        c=fer_annotations['cluster gmm'], cmap='Accent',
        marker='^', alpha=0.5, label='Cluster Fer',
    )
    ax.scatter(
        _llama_xy[:, 0]+0.1, _llama_xy[:, 1]+0.1,
        c=llama_annotations['cluster gmm'], cmap='Accent',
        marker='x', alpha=0.7, label='Cluster Llama',
    )
    ax.set_xlabel(_x_label)
    ax.set_ylabel(_y_label)
    ax.set_title('Sentence Embeddings Clusters (GMM) - medical specialist x Llama')
    ax.legend()
    plt.show()

## Sentence Embedding and Category Cluster - Comparing against medical specialist x Llama x BioBERT

In [None]:
# Side-by-side category-cluster figures.
#
# For each clustering column (GMM, then K-means) and each 2-D projection
# (Llama PCA, specialist PCA, Llama t-SNE, specialist t-SNE) one figure with
# two panels is drawn. Both panels plot the SAME coordinates; the left panel
# is coloured by the Llama cluster labels and the right panel by the medical
# specialist's labels, so the two assignments can be compared point by point.
# NOTE(review): this assumes both frames have the same number of rows in the
# same order (the specialist file name says "aligned") — confirm.
#
# Fix: the third GMM figure used to draw on the axes of the previously shown
# figure (no new plt.subplots call) and repeated the Llama PCA coordinates, so
# the Llama t-SNE GMM view was never produced. The loop below creates a new
# figure for every combination, matching the order already used for K-means.
for _cluster_column, _method_label in (
    ('cluster gmm cat', 'GMM'),
    ('cluster kmeans n cat', 'kmeans n'),
):
    for _coords, _axis_label in (
        (llama_pca_results_cat, 'PCA'),
        (fer_pca_results_cat, 'PCA'),
        (llama_tsne_results_cat, 'tsne'),
        (fer_tsne_results_cat, 'tsne'),
    ):
        fig, axs = plt.subplots(1, 2, figsize=(20, 8))

        # Left panel: coloured by the Llama cluster assignment.
        axs[0].scatter(_coords[:, 0], _coords[:, 1],
                       c=llama_annotations[_cluster_column], cmap='Accent', alpha=0.6)
        axs[0].set_title(f'Categories Clusters ({_method_label}) - Llama')

        # Right panel: same points, coloured by the specialist's assignment.
        scatter = axs[1].scatter(_coords[:, 0], _coords[:, 1],
                                 c=fer_annotations[_cluster_column], cmap='viridis', alpha=0.6)
        axs[1].set_title(f'Categories Clusters ({_method_label}) - Medical Specialist')

        for _panel in axs:
            _panel.set_xlabel(f'{_axis_label} 1')
            _panel.set_ylabel(f'{_axis_label} 2')

        # The colorbar is built from the right-hand scatter only (viridis).
        fig.colorbar(scatter, ax=axs, label='Cluster')
        plt.show()