In [10]:
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.express as px
import hdbscan


In [11]:
# Universal Sentence Encoder model - embeds whole sentences
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODEL_URL)


In [12]:
# Read the 'Superanotator' sheet, discard the annotation columns that are not
# used below, then drop every row that still contains a missing value.
# NOTE: dropna() keeps the original row labels, so the resulting index may
# have gaps.
df_processed = (
    pd.read_excel('cyberbullying_train_complete_sentences.xlsx', sheet_name='Superanotator')
    .drop(columns=["label1", "label2", "label3", "1. Sarkazm uszczypliwy, dokuczliwy -> mowa nienawiści"])
    .dropna()
)


In [13]:
# Generate sentence embeddings - the encoder is called on a plain list of sentences
sentence_embeddings = np.array(embed(df_processed['text'].tolist()))

# Standardize every embedding dimension (zero mean, unit variance)
standardized_data = StandardScaler().fit_transform(sentence_embeddings)

# Reduce the embeddings to 2 dimensions with t-SNE (fixed seed for reproducibility).
# NOTE(review): the name `tfidf_tsne` is historical - the input here is USE
# embeddings, not TF-IDF; the name is kept because later cells reference it.
tfidf_tsne = TSNE(n_components=2, random_state=0, perplexity=50).fit_transform(standardized_data)

# Build a DataFrame with the t-SNE coordinates plus the label and text of each point.
# df_tsne gets a fresh 0..n-1 index, whereas df_processed keeps its original
# row labels (with gaps after dropna). Assigning a Series would align on those
# labels and attach the wrong label/text to points (and NaN to some), so the
# values are assigned positionally instead.
df_tsne = pd.DataFrame(tfidf_tsne, columns=["Dim_1", "Dim_2"])
assert len(df_tsne) == len(df_processed), "t-SNE output and source rows differ in length"
df_tsne['label'] = df_processed['LABEL'].to_numpy()
df_tsne['text'] = df_processed['text'].to_numpy()


In [14]:
# Interactive scatter plot of the t-SNE projection, colored by label;
# hovering over a point shows its text and label.
fig = px.scatter(
    df_tsne,
    x='Dim_1',
    y='Dim_2',
    color='label',
    hover_data={'text': True, 'label': True},
    width=900,
    height=600,
)

# Larger, semi-transparent markers so overlapping points stay visible
marker_style = {'size': 10, 'opacity': 0.7}
fig.update_traces(marker=marker_style, selector={'mode': 'markers'})
fig.show()


In [15]:
# Cluster the 2-D t-SNE coordinates with HDBSCAN and store the cluster id
# assigned to each point in df_tsne
clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
clusterer.fit(tfidf_tsne)
df_tsne['cluster'] = clusterer.labels_


In [16]:
# Scatter plot of the t-SNE projection colored by HDBSCAN cluster id;
# hovering over a point shows its text.
fig = px.scatter(
    df_tsne,
    x='Dim_1',
    y='Dim_2',
    color='cluster',
    hover_data=['text'],
    title='Klasteryzacja HDBSCAN dla osadzeń USE',
)

fig.show()
