### Implementação de Fuzzy C-means utilizando GloVe

In [2]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import skfuzzy as fuzz

In [3]:
def load_glove(file_path):
    """Load word embeddings from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by its vector
    components, all separated by whitespace (``word v1 v2 ... vn``).

    Parameters
    ----------
    file_path : str or path-like
        Location of the embedding file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each word (str) to a 1-D ``float32`` NumPy array. If a word
        appears on several lines, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # carry no entry; skip them instead of failing on values[0].
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

def words_to_vectors(words, embeddings, dimension=50):
    """Look up the embedding of each word and stack the results row-wise.

    Words missing from ``embeddings`` are reported on stdout and represented
    by an all-zero vector. The zero vector's length is taken from the vectors
    that were found, so the rows always stack into a rectangular array;
    ``dimension`` is only used when none of the words could be found.

    Parameters
    ----------
    words : iterable of str
        Words to look up, in output-row order.
    embeddings : dict
        Mapping from word to its 1-D vector.
    dimension : int, optional
        Fallback zero-vector length when no word has an embedding.

    Returns
    -------
    numpy.ndarray
        One row per input word, in input order.
    """
    vectors = []
    for word in words:
        vector = embeddings.get(word)
        if vector is None:
            print(f"'{word}' not found in GloVe vocabulary. Using zero vector.")
        # Keep None as a placeholder until the true vector length is known.
        vectors.append(vector)

    # Size the zero vectors like the embeddings actually retrieved; a
    # mismatching hard-coded length would produce a ragged, unstackable list.
    reference = next((v for v in vectors if v is not None), None)
    if reference is not None:
        dimension = len(reference)

    return np.array([np.zeros(dimension) if v is None else v for v in vectors])

# Pre-trained GloVe vectors; the file name indicates the 50-dimensional set.
glove_file_path = './GloVe/glove.6B.50d.txt'

# Parse the embedding file once into a {word: vector} lookup for later cells.
glove_embeddings = load_glove(file_path=glove_file_path)


In [4]:
# Sanity check: look up a few hand-picked words and inspect their raw vectors.
words = ['harvard', 'learning', 'intelligence']

word_vectors = words_to_vectors(words=words, embeddings=glove_embeddings)
print(word_vectors)

[[-8.5970e-01  1.1120e+00 -2.9970e-01 -1.1093e+00  1.5653e-01 -1.3244e-01
  -1.0520e+00 -9.2620e-01 -5.2920e-01 -2.4501e-01 -2.2653e-01  2.5299e-01
  -9.9125e-02 -4.0640e-01  9.7853e-04 -3.5808e-02 -1.8689e-01  7.1157e-01
  -4.4480e-01  8.6651e-01  5.4339e-01  5.9826e-01 -3.1584e-02 -4.6351e-01
  -8.5038e-02 -1.8902e+00  1.1140e-01 -7.5604e-01 -1.6965e+00 -3.9752e-01
   1.2976e+00 -3.4127e-01 -2.2890e-01 -1.4524e+00 -2.9855e-01 -2.0297e-01
  -4.4211e-01  1.1521e+00  1.5059e+00 -4.8819e-01 -2.1176e-01 -3.6186e-01
  -9.1108e-02  9.5266e-01  2.0254e-01  1.0068e-01  6.9316e-01  2.6215e-01
  -9.0986e-01  5.9507e-01]
 [ 2.0461e-01  4.8659e-01 -5.5308e-01 -2.7019e-01  2.6336e-01  1.5751e-01
  -2.8994e-01 -5.1824e-01  5.1829e-02  3.6225e-01  3.7077e-01  1.3220e-01
  -6.1377e-02 -5.3606e-01 -3.4733e-01 -4.3981e-02 -8.6744e-02  7.8305e-01
   4.1422e-01  2.7996e-02  2.3433e-01  9.8844e-01 -4.1049e-01  6.2060e-01
   1.3966e+00 -6.5427e-01 -1.8221e-01 -1.0293e+00 -1.4741e-02 -2.5384e-01
   3.2270e+

In [5]:
# WordSim-353 gold standard: tab-separated with no header row, so the columns
# are addressed by position (0 and 1 are read as words; see the printed head).
wordsim_path_file = './WordSim_353/wordsim_relatedness_goldstandard.txt'
df = pd.read_csv(wordsim_path_file, header=None, sep='\t')
print(df.head())

# Vocabulary: every distinct lower-cased word from either of the two columns.
words = (
    pd.concat([df[0], df[1]])
    .str.lower()
    .drop_duplicates()
)
word_vectors = words_to_vectors(words, glove_embeddings)
print('#words = ', len(words))

           0          1     2
0   computer   keyboard  7.62
1  Jerusalem     Israel  8.46
2     planet     galaxy  8.11
3     canyon  landscape  7.53
4       OPEC    country  5.63
#words =  346


In [6]:
# perplexity = np.arange(10, 300, 10)
# divergence = []

# for i in perplexity:
#     model = TSNE(n_components=2, init="pca", perplexity=i)
#     reduced = model.fit_transform(word_vectors)
#     divergence.append(model.kl_divergence_)
# fig = px.line(x=perplexity, y=divergence, markers=True)
# fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
# fig.update_traces(line_color="red", line_width=1)
# fig.show()

In [7]:
# Project the word vectors to 2-D; random_state is pinned for repeatable runs.
tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=20,
    random_state=0,
)
word_vectors_tsne = tsne.fit_transform(word_vectors)

# KL divergence reported by the fitted model (shown as the cell output).
tsne.kl_divergence_

1.2116248607635498

In [8]:
# 2-D scatter of the t-SNE projection, each point labelled with its word.
fig = px.scatter(
    x=word_vectors_tsne[:, 0],
    y=word_vectors_tsne[:, 1],
    text=words,
)
fig.update_layout(
    width=800,
    height=500,
    title="t-SNE visualization of WordSim_353 dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [9]:
n_clusters = 8

# skfuzzy's cmeans takes the data as (features, samples), hence the transpose.
# The seed pins the random initial partition so the clustering (and the
# cluster numbering) is reproducible, matching the fixed random_state of t-SNE.
fcm_model = fuzz.cluster.cmeans(word_vectors.T, c=n_clusters, m=1.1, error=0.05, maxiter=1000, init=None, seed=0)

# The second item of the result is the final membership matrix:
# one row per cluster, one column per word.
membership = fcm_model[1]

# Hard label per word = cluster with the highest membership degree.
cluster_membership = np.argmax(membership, axis=0)

cluster_columns = [f"Cluster {i}" for i in range(n_clusters)]

# Numeric membership table (words x clusters). It is kept numeric so that any
# later computation works on full-precision values.
df_membership = pd.DataFrame(membership.T, index=words, columns=cluster_columns)

top_n = 100

# For each cluster, select the top_n words ranked by their membership degree.
top_words_per_cluster = {}
for column in cluster_columns:
    top_entries = df_membership[column].sort_values(ascending=False).head(top_n)
    top_words_per_cluster[column] = [f"{word} ({value:.4f})" for word, value in top_entries.items()]

df_top_words = pd.DataFrame.from_dict(top_words_per_cluster, orient="index")

display(df_top_words)

# Format to four decimals for display only; df_membership itself is unchanged.
display(df_membership.map(lambda x: f"{x:.4f}"))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
Cluster 0,shore (0.9975),sea (0.9907),canyon (0.9901),area (0.9818),proximity (0.9801),mars (0.9767),coast (0.9708),graveyard (0.9600),observation (0.9476),kilometer (0.9429),...,phone (0.0731),project (0.0677),group (0.0677),network (0.0659),category (0.0578),fingerprint (0.0567),family (0.0566),prominence (0.0541),lad (0.0504),concert (0.0496)
Cluster 1,round (1.0000),match (0.9999),team (0.9995),season (0.9993),game (0.9988),record (0.9976),cup (0.9975),competition (0.9973),victory (0.9968),football (0.9924),...,lesson (0.0139),kilometer (0.0137),lad (0.0135),opec (0.0131),money (0.0130),board (0.0129),hospital (0.0123),constellation (0.0122),maker (0.0120),atmosphere (0.0120)
Cluster 2,government (0.9990),plan (0.9955),israel (0.9889),withdrawal (0.9879),wednesday (0.9865),issue (0.9863),delay (0.9838),president (0.9827),announcement (0.9817),secretary (0.9809),...,reason (0.0995),focus (0.0993),drug (0.0948),car (0.0938),flood (0.0910),holy (0.0885),drought (0.0765),television (0.0737),problem (0.0729),street (0.0676)
Cluster 3,popcorn (0.9992),coffee (0.9991),drink (0.9989),egg (0.9965),sugar (0.9963),cabbage (0.9945),seafood (0.9936),liquid (0.9893),shower (0.9885),cucumber (0.9804),...,tiger (0.0177),wealth (0.0176),viewer (0.0175),astronomer (0.0165),landscape (0.0164),line (0.0163),hundred (0.0155),live (0.0146),depression (0.0139),galaxy (0.0128)
Cluster 4,lover (0.9973),love (0.9959),movie (0.9941),film (0.9869),man (0.9866),magician (0.9842),brother (0.9801),girl (0.9781),wizard (0.9718),mother (0.9705),...,fbi (0.0813),hospital (0.0807),tiger (0.0797),closet (0.0794),direction (0.0789),group (0.0784),soap (0.0781),proton (0.0771),constellation (0.0771),series (0.0755)
Cluster 5,price (1.0000),stock (1.0000),market (1.0000),profit (0.9998),trading (0.9996),interest (0.9994),dollar (0.9993),credit (0.9993),investor (0.9988),currency (0.9977),...,flood (0.0092),racket (0.0091),century (0.0088),recess (0.0086),prominence (0.0086),development (0.0085),atmosphere (0.0083),hypertension (0.0079),depression (0.0079),association (0.0078)
Cluster 6,prejudice (0.9967),morality (0.9881),isolation (0.9874),precedent (0.9865),abuse (0.9809),anxiety (0.9643),gender (0.9598),disability (0.9583),racism (0.9582),liability (0.9396),...,stroke (0.1166),holy (0.1149),proton (0.1087),recess (0.1082),oracle (0.0970),size (0.0925),history (0.0920),property (0.0912),focus (0.0874),secret (0.0741)
Cluster 7,communication (0.9879),library (0.9769),tool (0.9754),architecture (0.9684),journal (0.9666),laboratory (0.9651),institution (0.9617),computer (0.9582),physics (0.9498),software (0.9459),...,secret (0.0797),laundering (0.0792),energy (0.0762),atmosphere (0.0727),television (0.0704),criterion (0.0685),depression (0.0677),theater (0.0672),maker (0.0656),lesson (0.0655)


Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7
computer,0.0024,0.0017,0.0017,0.0025,0.0185,0.0093,0.0057,0.9582
jerusalem,0.2035,0.0039,0.6886,0.0028,0.0297,0.0030,0.0432,0.0254
planet,0.9320,0.0014,0.0015,0.0065,0.0388,0.0005,0.0068,0.0124
canyon,0.9901,0.0005,0.0006,0.0023,0.0030,0.0003,0.0010,0.0021
opec,0.0159,0.0131,0.2499,0.0677,0.0079,0.6133,0.0249,0.0074
...,...,...,...,...,...,...,...,...
voyage,0.9404,0.0070,0.0061,0.0031,0.0338,0.0006,0.0057,0.0033
string,0.0486,0.3045,0.0432,0.0217,0.4328,0.0055,0.1228,0.0210
smile,0.0196,0.0058,0.0032,0.1332,0.7855,0.0009,0.0489,0.0029
cucumber,0.0079,0.0021,0.0006,0.9804,0.0058,0.0008,0.0017,0.0008


In [10]:

# t-SNE projection again, now coloured by each word's hard FCM cluster label.
fig = px.scatter(x=word_vectors_tsne[:, 0], y=word_vectors_tsne[:, 1], text=words, color=cluster_membership)
fig.update_layout(
    title="t-SNE visualization of WordSim_353 dataset with FCM",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
    width=1500,
    height=900
)
fig.show()

# Word-by-word co-membership matrix: entry (i, j) is the dot product of the
# membership vectors of words i and j, with words in their original order.
fcm_im = fcm_model[1].T @ fcm_model[1]
fig = px.imshow(fcm_im, color_continuous_scale='plasma')
fig.update_layout(width=1200, height = 900)
fig.show()

# Same matrix with the words grouped by hard cluster label, so that words of
# one cluster are adjacent. It is built directly from the raw membership
# matrix: this keeps full precision regardless of how df_membership was
# formatted for display, and leaves df_membership unmodified. The stable sort
# preserves the original word order inside each cluster.
cluster_order = np.argsort(cluster_membership, kind='stable')
df_membership_ord = fcm_model[1].T[cluster_order]
fcm_im_ord = df_membership_ord @ df_membership_ord.T

fig = px.imshow(fcm_im_ord, color_continuous_scale='plasma')
fig.update_layout(width=1200, height = 900)
fig.show()