### Implementação de Fuzzy C-means utilizando GloVe

In [328]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import skfuzzy as fuzz

In [329]:
def load_glove(file_path):
    """Load word vectors from a GloVe-style plain-text embeddings file.

    Every non-empty line is split on whitespace: the first token is the word,
    the remaining tokens are parsed as the components of its vector.

    Parameters
    ----------
    file_path : str or path-like
        Path to the embeddings text file; it is read as UTF-8.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` NumPy array. When a word occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be converted to a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. an extra newline at the end of the file)
            # yields no tokens; indexing values[0] would raise IndexError.
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def words_to_vectors(words, embeddings, dimension=50):
    """Look up the embedding of every word and stack the results row-wise.

    Parameters
    ----------
    words : iterable of str
        Words to embed, in the order the output rows should follow.
    embeddings : dict
        Mapping from word to its 1-D vector (as returned by ``load_glove``).
    dimension : int, default 50
        Length of the zero vector substituted for words missing from
        ``embeddings``; it must equal the length of the stored vectors.

    Returns
    -------
    numpy.ndarray
        Array with one row per input word. Words absent from ``embeddings``
        are reported on stdout and represented by an all-zero row. An empty
        ``words`` yields an array of shape ``(0, dimension)``.
    """
    vectors = []
    for word in words:
        vector = embeddings.get(word)
        if vector is None:
            print(f"'{word}' not found in GloVe vocabulary. Using zero vector.")
            # float32 matches the dtype load_glove stores; a default float64
            # zero vector would silently upcast the whole result matrix.
            vector = np.zeros(dimension, dtype=np.float32)
        vectors.append(vector)

    if not vectors:
        # Keep the result two-dimensional even when there is nothing to embed.
        return np.empty((0, dimension), dtype=np.float32)
    return np.array(vectors)

# GloVe vectors file, relative to the notebook's working directory
# (the file name suggests the 50-dimensional "6B" release).
glove_file_path = './GloVe/glove.6B.50d.txt'

# Parse the file once; later cells look words up in this dict.
glove_embeddings = load_glove(file_path=glove_file_path)


In [330]:
# Sanity check: embed a few sample words and print the raw vectors.
words = ['harvard', 'learning', 'intelligence']
word_vectors = words_to_vectors(words=words, embeddings=glove_embeddings)

print(word_vectors)

[[-8.5970e-01  1.1120e+00 -2.9970e-01 -1.1093e+00  1.5653e-01 -1.3244e-01
  -1.0520e+00 -9.2620e-01 -5.2920e-01 -2.4501e-01 -2.2653e-01  2.5299e-01
  -9.9125e-02 -4.0640e-01  9.7853e-04 -3.5808e-02 -1.8689e-01  7.1157e-01
  -4.4480e-01  8.6651e-01  5.4339e-01  5.9826e-01 -3.1584e-02 -4.6351e-01
  -8.5038e-02 -1.8902e+00  1.1140e-01 -7.5604e-01 -1.6965e+00 -3.9752e-01
   1.2976e+00 -3.4127e-01 -2.2890e-01 -1.4524e+00 -2.9855e-01 -2.0297e-01
  -4.4211e-01  1.1521e+00  1.5059e+00 -4.8819e-01 -2.1176e-01 -3.6186e-01
  -9.1108e-02  9.5266e-01  2.0254e-01  1.0068e-01  6.9316e-01  2.6215e-01
  -9.0986e-01  5.9507e-01]
 [ 2.0461e-01  4.8659e-01 -5.5308e-01 -2.7019e-01  2.6336e-01  1.5751e-01
  -2.8994e-01 -5.1824e-01  5.1829e-02  3.6225e-01  3.7077e-01  1.3220e-01
  -6.1377e-02 -5.3606e-01 -3.4733e-01 -4.3981e-02 -8.6744e-02  7.8305e-01
   4.1422e-01  2.7996e-02  2.3433e-01  9.8844e-01 -4.1049e-01  6.2060e-01
   1.3966e+00 -6.5427e-01 -1.8221e-01 -1.0293e+00 -1.4741e-02 -2.5384e-01
   3.2270e+

In [331]:
# WordSim-353 relatedness file: tab-separated with no header row, so pandas
# names the columns 0, 1 (the word pair) and 2 (a numeric score).
wordsim_path_file = './WordSim_353/wordsim_relatedness_goldstandard.txt'
df = pd.read_csv(wordsim_path_file, sep='\t', header=None)
print(df.head())

# Vocabulary = every distinct lower-cased word from both pair columns.
# The index is reset because concat keeps each column's own row labels, which
# leaves duplicated, non-contiguous labels after drop_duplicates; a clean
# 0..n-1 index keeps words[i] aligned with row i of word_vectors.
words = (
    pd.concat([df[0], df[1]])
    .str.lower()
    .drop_duplicates()
    .reset_index(drop=True)
)
word_vectors = words_to_vectors(words, glove_embeddings)
print('#words = ', len(words))

           0          1     2
0   computer   keyboard  7.62
1  Jerusalem     Israel  8.46
2     planet     galaxy  8.11
3     canyon  landscape  7.53
4       OPEC    country  5.63
#words =  346


In [332]:
# Disabled experiment, kept for reference: sweep the t-SNE perplexity over
# np.arange(10, 300, 10), record each fit's KL divergence and plot the curve.
# perplexity = np.arange(10, 300, 10)
# divergence = []

# for i in perplexity:
#     model = TSNE(n_components=2, init="pca", perplexity=i)
#     reduced = model.fit_transform(word_vectors)
#     divergence.append(model.kl_divergence_)
# fig = px.line(x=perplexity, y=divergence, markers=True)
# fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
# fig.update_traces(line_color="red", line_width=1)
# fig.show()

In [333]:
# Project the word vectors to 2-D; the fixed random_state keeps the layout
# identical between runs.
tsne = TSNE(
    n_components=2,
    perplexity=20,
    init='pca',
    random_state=0,
)
word_vectors_tsne = tsne.fit_transform(word_vectors)

# Final KL divergence of the fit, shown as the cell output.
tsne.kl_divergence_

1.2116248607635498

In [334]:
# Scatter the 2-D t-SNE coordinates, labelling every point with its word.
tsne_x = word_vectors_tsne[:, 0]
tsne_y = word_vectors_tsne[:, 1]
layout_options = dict(
    title="t-SNE visualization of WordSim_353 dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
    width=800,
    height=500,
)

fig = px.scatter(x=tsne_x, y=tsne_y, text=words)
fig.update_layout(**layout_options)
fig.show()

In [335]:
n_clusters = 8
top_n = 100  # how many of the highest-membership words to list per cluster

# The word vectors are passed transposed, so the membership matrix returned at
# fcm_model[1] has one row per cluster and one column per word.
# seed=0 fixes the random initialisation (same convention as the t-SNE
# random_state); without it cluster numbering and memberships change per run.
fcm_model = fuzz.cluster.cmeans(
    word_vectors.T,
    c=n_clusters,
    m=1.1,
    error=0.05,
    maxiter=1000,
    init=None,
    seed=0,
)
membership = fcm_model[1]

# Hard assignment: index of the cluster with the highest membership per word.
cluster_membership = np.argmax(membership, axis=0)

# Numeric membership table (rows = words, columns = clusters). It is built
# once and kept numeric so later cells can compute with full precision.
cluster_columns = [f"Cluster {i}" for i in range(n_clusters)]
df_membership = pd.DataFrame(membership.T, index=words, columns=cluster_columns)

top_words_per_cluster = {}
for column in cluster_columns:
    # Select the top_n words ordered by their degree of membership in this cluster.
    top_entries = df_membership[column].sort_values(ascending=False).head(top_n)
    top_words_per_cluster[column] = [
        f"{word} ({value:.4f})" for word, value in top_entries.items()
    ]

df_top_words = pd.DataFrame.from_dict(top_words_per_cluster, orient="index")

display(df_top_words)

# Format to 4 decimals for display only; df_membership itself stays numeric.
display(df_membership.map(lambda x: f"{x:.4f}"))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
Cluster 0,professor (0.9952),architecture (0.9919),scientist (0.9858),chemistry (0.9847),physics (0.9815),library (0.9803),institution (0.9743),psychology (0.9719),journal (0.9389),culture (0.9308),...,serial (0.0482),jerusalem (0.0482),paper (0.0473),word (0.0472),recognition (0.0467),similarity (0.0455),wealth (0.0436),accommodation (0.0430),inmate (0.0416),keyboard (0.0415)
Cluster 1,shore (0.9985),sea (0.9949),area (0.9919),proximity (0.9861),canyon (0.9856),coast (0.9810),mars (0.9656),forest (0.9577),observation (0.9529),voyage (0.9491),...,arrangement (0.0614),group (0.0610),astronomer (0.0605),fbi (0.0572),string (0.0565),glass (0.0561),development (0.0554),deposit (0.0545),troops (0.0539),ecology (0.0538)
Cluster 2,baby (0.9975),lover (0.9953),popcorn (0.9914),soap (0.9861),drink (0.9845),closet (0.9834),shower (0.9831),lad (0.9757),cabbage (0.9681),smile (0.9679),...,lesson (0.0654),lawyer (0.0594),fertility (0.0590),music (0.0585),galaxy (0.0561),atmosphere (0.0532),paper (0.0521),accommodation (0.0491),book (0.0476),cd (0.0469)
Cluster 3,government (0.9990),plan (0.9963),withdrawal (0.9900),israel (0.9887),ministry (0.9854),president (0.9853),secretary (0.9842),planning (0.9839),wednesday (0.9839),peace (0.9821),...,food (0.0823),drought (0.0801),drug (0.0791),weapon (0.0719),street (0.0658),association (0.0589),flood (0.0550),year (0.0545),string (0.0510),line (0.0499)
Cluster 4,hardware (0.9981),phone (0.9969),internet (0.9968),computer (0.9967),software (0.9949),video (0.9933),network (0.9918),card (0.9778),telephone (0.9777),tool (0.9633),...,flight (0.0448),preparation (0.0447),theater (0.0430),people (0.0418),crew (0.0414),warning (0.0409),wealth (0.0388),citizen (0.0383),evidence (0.0368),deposit (0.0362)
Cluster 5,round (1.0000),match (0.9999),team (0.9996),season (0.9994),game (0.9989),cup (0.9981),record (0.9971),competition (0.9968),victory (0.9968),football (0.9939),...,love (0.0128),confidence (0.0126),combination (0.0126),board (0.0123),senate (0.0122),war (0.0122),holy (0.0118),opec (0.0118),kilometer (0.0114),money (0.0114)
Cluster 6,prejudice (0.9950),abuse (0.9904),isolation (0.9877),morality (0.9867),precedent (0.9793),sex (0.9779),disability (0.9712),anxiety (0.9674),gender (0.9658),treatment (0.9587),...,stroke (0.1244),seepage (0.1192),property (0.1177),racket (0.1165),fbi (0.1154),secret (0.1102),freud (0.1093),number (0.1007),size (0.1007),development (0.0892)
Cluster 7,price (1.0000),stock (0.9999),market (0.9999),profit (0.9997),trading (0.9995),dollar (0.9995),interest (0.9994),investor (0.9987),currency (0.9982),credit (0.9979),...,atmosphere (0.0070),prominence (0.0068),equipment (0.0067),minister (0.0064),governor (0.0064),registration (0.0064),fertility (0.0061),depression (0.0059),senate (0.0057),galaxy (0.0057)


Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7
computer,0.0020,0.0001,0.0002,0.0001,0.9967,0.0001,0.0004,0.0004
jerusalem,0.0482,0.1965,0.0114,0.6824,0.0092,0.0038,0.0458,0.0028
planet,0.0153,0.9261,0.0301,0.0016,0.0170,0.0018,0.0076,0.0005
canyon,0.0042,0.9856,0.0064,0.0007,0.0014,0.0006,0.0010,0.0003
opec,0.0053,0.0156,0.0218,0.2357,0.0125,0.0118,0.0198,0.6776
...,...,...,...,...,...,...,...,...
voyage,0.0082,0.9491,0.0189,0.0063,0.0036,0.0073,0.0060,0.0005
string,0.0349,0.0565,0.1404,0.0510,0.1859,0.3887,0.1369,0.0057
smile,0.0015,0.0060,0.9679,0.0010,0.0057,0.0022,0.0155,0.0003
cucumber,0.0035,0.0227,0.9521,0.0017,0.0073,0.0066,0.0040,0.0022


In [336]:

# t-SNE layout again, now coloured by each word's highest-membership cluster.
fig = px.scatter(x=word_vectors_tsne[:, 0], y=word_vectors_tsne[:, 1], text=words, color=cluster_membership)
fig.update_layout(
    title="t-SNE visualization of WordSim_353 dataset with FCM",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
    width=1500,
    height=900
)
fig.show()

# Raw (unrounded) memberships with one row per word and one column per cluster.
membership_by_word = fcm_model[1].T

# Word-by-word similarity: inner product of the membership rows, in the
# original word order.
fcm_im = membership_by_word @ membership_by_word.T
fig = px.imshow(fcm_im, color_continuous_scale='plasma')
fig.update_layout(width=1200, height=900)
fig.show()

# Same matrix with the words grouped by their hard cluster assignment.
# The ordering is taken from the raw membership matrix rather than from
# df_membership, so no precision is lost to display formatting and the
# table shown by the previous cell is not mutated. A stable sort keeps the
# original word order inside each cluster.
cluster_order = np.argsort(cluster_membership, kind='stable')
df_membership_ord = membership_by_word[cluster_order]
fcm_im_ord = df_membership_ord @ df_membership_ord.T

fig = px.imshow(fcm_im_ord, color_continuous_scale='plasma')
fig.update_layout(width=1200, height=900)
fig.show()