In [116]:
# NOTE(review): `!cd` runs in a temporary subshell, so it cannot change the
# kernel's working directory; the recorded output also shows that ../src does
# not exist from here. Use `%cd <dir>` if a directory change is really intended.
!cd ../src

zsh:cd:1: no such file or directory: ../src


In [117]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [118]:
%autoreload 2

In [119]:
import pandas as pd
import numpy as np
import spacy

In [120]:
# Load the large English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any of the cells visible in this
# section — confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_lg")

### Parameters

In [121]:
from src.utils import plots_dir, clusters_dir, features_dir
import matplotlib.pyplot as plt

# Plots
font = {
    'size': 16
}
plt.rc('font', **font)

# Experiment
group = "combined"  # "controlled" or "free" or "combined"
pos = "adjectives"  # "nouns" or "adjectives"

colors = {
    "controlled": "#97121f",
    "free": "#1271d1",
    "combined": "#0b4539"
}

# Embeddings
embeddings_model = "glove"  # "bert" or "glove"

# Clustering
# Hard-coded number of clusters for every supported (group, pos) experiment.
N_CLUSTERS_BY_EXPERIMENT = {
    ("controlled", "nouns"): 9,
    ("controlled", "adjectives"): 10,
    ("free", "nouns"): 12,
    ("free", "adjectives"): 11,
    ("combined", "nouns"): 17,
    ("combined", "adjectives"): 19,
}
# Fail loudly on an unsupported combination: otherwise N_CLUSTERS would stay
# undefined on a fresh kernel, or silently keep a stale value from an earlier
# run of this cell.
if (group, pos) not in N_CLUSTERS_BY_EXPERIMENT:
    raise ValueError(
        f"Unsupported experiment: group={group!r}, pos={pos!r}; "
        f"expected one of {sorted(N_CLUSTERS_BY_EXPERIMENT)}"
    )
N_CLUSTERS = N_CLUSTERS_BY_EXPERIMENT[(group, pos)]

# Base output paths
# All outputs share one base file name; cluster and feature outputs also carry
# the cluster count as an "_n<k>" suffix.
base_filename = f"{group}_{pos}_2_{embeddings_model}_agglo"
plots_path = str(plots_dir() / base_filename)
clusters_path = str(clusters_dir() / base_filename) + f"_n{N_CLUSTERS}"
features_path = str(features_dir() / "linguistic" / base_filename) + f"_n{N_CLUSTERS}"

In [122]:
def get_concept_id(index):
    """Build the concept identifier for a zero-based concept index.

    The prefix is selected from the notebook-level ``group`` and ``pos``
    settings and the numeric suffix is ``index + 1``; for example index 0 with
    group "combined" and pos "adjectives" gives "CombAdj1".

    Args:
        index: Zero-based position of the concept.

    Returns:
        The identifier string, e.g. "ConstNoun3" or "FreeAdj11".

    Raises:
        ValueError: If the current ``group``/``pos`` combination is not one of
            the six supported experiments. Previously this case fell through
            and returned ``None`` silently.
    """
    prefixes = {
        ("controlled", "nouns"): "ConstNoun",
        ("controlled", "adjectives"): "ConstAdj",
        ("free", "nouns"): "FreeNoun",
        ("free", "adjectives"): "FreeAdj",
        ("combined", "nouns"): "CombNoun",
        ("combined", "adjectives"): "CombAdj",
    }
    try:
        prefix = prefixes[(group, pos)]
    except KeyError:
        raise ValueError(
            f"Unsupported experiment: group={group!r}, pos={pos!r}"
        ) from None
    return prefix + str(index + 1)

### Load descriptors

In [123]:
from src.utils import descriptors_dir

# Read the descriptor list for the selected experiment as an array of strings.
descriptors_file = descriptors_dir() / f"{group}_{pos}.txt"
descriptors = np.genfromtxt(descriptors_file, dtype=str)
# Show how many descriptors were loaded, followed by the descriptors themselves.
len(descriptors), list(descriptors)

(80,
 ['first',
  'rhythmic',
  'excited',
  'much',
  'other',
  'anxious',
  'different',
  'constant',
  'more',
  'like',
  'sweaty',
  'hard',
  'stuck',
  'slow',
  'slight',
  'little',
  'normal',
  'pleasant',
  'sticky',
  'uncomfortable',
  'strong',
  'fast',
  'fun',
  'funny',
  'happy',
  'nice',
  'wrong',
  'rough',
  'light',
  'electrical',
  'small',
  'tiny',
  'heavy',
  'smooth',
  'textured',
  'whole',
  'aggressive',
  'good',
  'long',
  'annoying',
  'most',
  'same',
  'aware',
  'alert',
  'big',
  'last',
  'least',
  'bad',
  'soft',
  'few',
  'easy',
  'deep',
  'gentle',
  'subtle',
  'electric',
  'low',
  'slippery',
  'intense',
  'wet',
  'actual',
  'second',
  'super',
  'quick',
  'calming',
  'high',
  'angry',
  'short',
  'weird',
  'natural',
  'similar',
  'sure',
  'cold',
  'continuous',
  'satisfied',
  'specific',
  'okay',
  'frequent',
  'quiet',
  'difficult',
  'calm'])

### Get static embeddings for descriptors

In [124]:
# Compute one embedding vector per descriptor with the selected model.
if embeddings_model == "bert":
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    embeddings = model.encode(descriptors, convert_to_numpy=True, show_progress_bar=True)
elif embeddings_model == "glove":
    from src.glove import get_embeddings
    embeddings_dict = get_embeddings(n_dim=300)
    # Indexing raises KeyError for any descriptor missing from the GloVe lookup.
    embeddings = np.array([embeddings_dict[word] for word in descriptors])
else:
    # A typo used to fall through to GloVe silently, while the output file
    # names still carried the misspelled model name.
    raise ValueError(
        f"Unsupported embeddings_model {embeddings_model!r}; expected 'bert' or 'glove'"
    )

print(embeddings.shape)

(80, 300)


### Compute clusters and save

In [125]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only works with Euclidean distances, which is also the
# estimator's default, so the distance argument is omitted. The former
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# leaving it out keeps this cell working on both older and newer releases.
labels = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                 linkage="ward"
                                 ).fit_predict(embeddings)

concept_to_desc = {} # k: concept_idx, v: list of descriptors
desc_to_concept = {} # k: desc_idx, v: concept_idx

for concept_idx in range(N_CLUSTERS):
    # Indices of all descriptors assigned to this cluster.
    desc_indices = np.flatnonzero(labels == concept_idx)
    concept_to_desc[concept_idx] = [descriptors[desc_idx] for desc_idx in desc_indices]

    for desc_idx in desc_indices:
        desc_to_concept[desc_idx] = concept_idx

In [126]:
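# Save as JSON (currently disabled).
# NOTE: re-enabling this cell requires `import json`, which is not among the
# imports in the cells shown in this notebook section.
# path = clusters_path + ".json"
# with open(path, "w") as fp:
#     json.dump(concept_to_desc, fp)
#
# print(f"See clusters at {path}")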

In [127]:
# Save as CSV
path = clusters_dir() / f"{group}_{pos}.csv"
# One row per concept: its identifier (numbered by position in the dict)
# followed by its descriptors joined with single spaces.
concepts = [
    [get_concept_id(position), " ".join(members)]
    for position, members in enumerate(concept_to_desc.values())
]
header = [ [ "Concept", "Descriptors" ] ]
# Stack the header row on top of the concept rows and write comma-separated text.
concepts = np.vstack((header, concepts))
np.savetxt(path, concepts, delimiter=",", fmt="%s")

### Identify cluster centers and save


In [128]:
from sklearn.metrics.pairwise import euclidean_distances

# For every cluster, pick the member descriptor whose summed Euclidean distance
# to all members of the same cluster is smallest.
concept_centers = []
for concept_idx in range(N_CLUSTERS):
    member_indices = np.flatnonzero(labels == concept_idx)
    member_embeddings = embeddings[member_indices]
    pairwise = euclidean_distances(member_embeddings, member_embeddings)
    # Row sums give each member's total distance to the rest of the cluster.
    closest_pos = np.argmin(pairwise.sum(axis=1))
    concept_centers.append(descriptors[member_indices[closest_pos]])

# np.savetxt(clusters_path + "_centers.csv", concept_centers, delimiter=",", fmt="%s")

### From clusters to concept matrix

1. Replace words in normalized pattern descriptions with the concept they are associated with.

In [129]:
from src.data import get_merged_norm_descriptions

merged_norm_descriptions = get_merged_norm_descriptions(group) # 32 patterns

# Map every descriptor to the index of its first occurrence. Building this once
# replaces the per-word array construction and linear scan, and keeps the same
# "first match wins" behaviour.
first_index_of = {}
for desc_idx, descriptor in enumerate(descriptors):
    first_index_of.setdefault(str(descriptor), desc_idx)

# One string per description, holding a "concept_<idx>" token for every word
# that is a known descriptor; other words are dropped.
concepts = []
for description in merged_norm_descriptions:
    concept_names = [
        f"concept_{desc_to_concept[first_index_of[word]]}"
        for word in description.split()
        if word in first_index_of
    ]
    concepts.append(" ".join(concept_names))

### TF-IDF

1. Compute TF-IDF feature matrix on the new pattern descriptions that include concept ids instead of words.
2. Save feature matrix to disk.

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF with default settings on `concepts`, which holds one
# space-separated string of "concept_<idx>" tokens per pattern description.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(concepts)

In [131]:
# Derive the 1-based pattern ids from the matrix itself instead of hard-coding
# 32 rows, so the id column always matches the number of vectorized descriptions.
n_patterns = X_tfidf.shape[0]
ids = np.arange(1, n_patterns + 1).reshape(-1, 1)
# toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
features = np.hstack((ids, X_tfidf.toarray()))
# First column is the pattern id, followed by one column per vocabulary term.
columns = ["id", *vectorizer.get_feature_names_out()]
df = pd.DataFrame(features, columns=columns)
df = df.round(4)
# hstack promoted the ids to float alongside the TF-IDF weights; restore ints.
df = df.astype({ "id": int })
# df.to_csv(features_path + "_tfidf.csv", index=False)

### Count Vectorizer

1. Count occurrences of concepts.
2. Save feature matrix to disk.

In [132]:
from sklearn.feature_extraction.text import CountVectorizer

# Count concept occurrences per pattern description using default settings.
# Note: this rebinds `vectorizer`, replacing the TF-IDF vectorizer defined earlier.
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(concepts)

In [133]:
# Derive the 1-based pattern ids from the matrix itself instead of hard-coding
# 32 rows, so the id column always matches the number of vectorized descriptions.
n_patterns = X_count.shape[0]
ids = np.arange(1, n_patterns + 1).reshape(-1, 1)
# toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
features = np.hstack((ids, X_count.toarray()))
# First column is the pattern id, followed by one column per vocabulary term.
columns = ["id", *vectorizer.get_feature_names_out()]
df = pd.DataFrame(features, columns=columns)
df = df.astype({ "id": int })
# df.to_csv(features_path + "_count.csv", index=False)
# df.to_csv(features_path + "_count.csv", index=False)