In [5]:
import gensim
import numpy as np
import pandas as pd
import re
# import stopwordsiso as stopwords
import spacy
import json
from sklearn.manifold import TSNE
import plotly.express as px

In [6]:
# ! pip install plotly
# ! pip install spacy
# ! pip install scikit-learn
# ! pip install pandas
# ! pip install gensim
# ! pip install numpy

In [7]:
# ! python -m spacy download pl_core_news_sm

In [8]:
file_path = 'data/fragments_classification.jsonl'  
data = pd.read_json(file_path, lines=True)

In [9]:
data

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, odwrócenie]]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, wzmocnienie]]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
795,@USER Wszystkiego najlepszego z okazji urodzin...,"[[5, 29, wzmocnienie]]"
796,"@USER widzę, że pewne tweety działają jak magn...",[]
797,"@USER @USER Chociaż futro ma z jenota,\nTo nie...","[[43, 52, odwrócenie], [55, 67, wzmocnienie], ..."
798,@USER Ty aby nie zacząleś ćpać przez wydumane ...,"[[13, 25, odwrócenie], [37, 54, wzmocnienie]]"


# Preprocessing

In [10]:
def adjust_labels_to_words(row):
    text = row['text']
    labels = row['label']
    adjusted_labels = []
    
    for start_idx, end_idx, label in labels:
        while start_idx > 0 and text[start_idx-1].isalnum():
            start_idx -= 1
        
        while end_idx < len(text) and text[end_idx].isalnum():
            end_idx += 1
        
        adjusted_labels.append((start_idx, end_idx, label))
    
    return adjusted_labels

In [11]:
nlp = spacy.load("pl_core_news_sm")

# polish_stopwords = stopwords.stopwords("pl")

def preprocess_text(text):
    text = re.sub(r'@\w+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower().strip()

    doc = nlp(text)

    # return [token.lemma_ for token in doc if token.lemma_.lower() not in polish_stopwords]
    return [token.lemma_ for token in doc]

In [12]:
def get_labels(row):
    text = row['text']
    return [(preprocess_text(text[start_idx : end_idx]), label) for start_idx, end_idx, label in row['label']]


In [13]:
data['label'] = data.apply(adjust_labels_to_words, axis=1)

In [14]:
data['tokens'] = data['text'].apply(preprocess_text)

In [15]:
data['labels2'] = data.apply(get_labels, axis=1)

# Word2Vec

In [16]:
with open('data/full_text_classification.jsonl', 'r', encoding='utf-8') as file:
    texts = [preprocess_text(json.loads(line)['text']) for line in file]

texts.extend(data['tokens'])

In [17]:
w2v = gensim.models.Word2Vec(texts, vector_size=300, window=5, min_count=1, workers=4)

In [18]:
fragment_texts = set()
fragments = []
for labels in data['labels2']:
    for label in labels:
        text = ' '.join(label[0])
        if text in fragment_texts:
            continue

        fragment_texts.add(text)
        vectors = []
        for word in label[0]:
            if word in w2v.wv:
                vectors.append(w2v.wv[word])
        if vectors:
            fragments.append([np.mean(vectors, axis=0), text, label[1]])

df_fragments = pd.DataFrame(fragments, columns=['vector', 'text', 'label'])

In [19]:
tsne = TSNE(n_components=2, random_state=42, perplexity=10)
tsne_results = tsne.fit_transform(np.vstack(df_fragments['vector'].values))

tsne_df = pd.DataFrame(tsne_results, columns=['x', 'y'])
tsne_df['label'] = df_fragments['label']
tsne_df['text'] = df_fragments['text']

fig = px.scatter(tsne_df, x='x', y='y', color='label', 
                 title='Wizualizacja osadzeń Word2Vec przy użyciu t-SNE',
                 labels={'label': 'Wydźwięk'},
                 hover_name=tsne_df['text'], 
                 color_discrete_sequence=px.colors.qualitative.Vivid)

fig.update_traces(marker=dict(size=10),
                  selector=dict(mode='markers'))

fig.show()

In [20]:
df_fragments

Unnamed: 0,vector,text,label
0,"[0.03208522, 0.11183627, 0.08850198, 0.0559627...",nie uzny,odwrócenie
1,"[0.015718201, 0.046250414, 0.03262996, 0.02251...",wiecej polski,wzmocnienie
2,"[0.03477292, 0.11591163, 0.09144242, 0.0584718...",nie decydować,odwrócenie
3,"[0.063273594, 0.2084809, 0.16154782, 0.1059608...",nie mieć,odwrócenie
4,"[0.033696067, 0.11345251, 0.08600411, 0.056021...",nie obudzić,odwrócenie
...,...,...,...
598,"[0.0347093, 0.11659486, 0.08906367, 0.05726533...",nie pimp,odwrócenie
599,"[0.006372989, 0.010344596, 0.009473213, 0.0035...",zwyc ciota,wzmocnienie
600,"[0.003098149, 0.007902853, 0.0052166553, 0.001...",dziwka,wzmocnienie
601,"[0.034013852, 0.112634905, 0.08568529, 0.05553...",nie zacząleś,odwrócenie


In [21]:
df_fragments.to_csv('vectors.csv', index=False, encoding='utf-8')

In [22]:
from itertools import product

def top_k_similars(vec_list: list[np.array], k: int = 2, df2: pd.DataFrame = df_fragments):
    
    cosine_similarity = lambda v, w: np.dot(v, w) / (np.linalg.norm(v) * np.linalg.norm(w))
    
    pairs = list(product(vec_list, vec_list))
    pairs = [pair for pair in pairs if not np.array_equal(pair[0], pair[1])]
    cosines = [cosine_similarity(v, w) for v, w in pairs if not np.array_equal(v, w)]
    vecs1 = [tuple(pair[0]) for pair in pairs]
    vecs2 = [pair[1] for pair in pairs]
    pd_dict = {'vector_1': vecs1, 'vector_2': vecs2, 'cs': cosines}
    df = pd.DataFrame(pd_dict)
    
    vec_list = []
    sims_list = []
    cos_list = []
    for vec in df['vector_1'].unique():
        df_filter = df[df['vector_1'] == vec]
        df_sorted = df_filter.sort_values(by = 'cs', ascending = False).head(k)
        vec_list.append(np.array(vec))
        sims_list.append(df_sorted['vector_2'].values)
        cos_list.append(df_sorted['cs'].values)
        
    final_df = pd.DataFrame({'vector': vec_list, 'top_vectors': sims_list, 'cos_sim': cos_list})
    df2['vector'] = df2['vector'].apply(lambda x: tuple(x))
    final_df['vector'] = final_df['vector'].apply(lambda x: tuple(x))
    merged = pd.merge(df2, final_df, on = 'vector', how = 'inner')
    merged['vector'] = merged['vector'].apply(lambda x: np.array(x))
    dct = dict(zip(merged.text.values, merged.vector.values))
    
    top_vectors = merged['top_vectors'].values
    
    def find_key_by_value(dictionary, target_value):
        return next((key for key, value in dictionary.items() if np.array_equal(value, target_value)), None)
    
    ll = []
    for item in top_vectors:
        lll = []
        for vec in item:
            lll.append(find_key_by_value(dct, vec))
        ll.append(lll)
    
    merged['top_words'] = ll
    
    return merged

In [30]:
similarities = top_k_similars(vec_list=df_fragments['vector'].to_list())

In [31]:
similarities

Unnamed: 0,vector,text,label,top_vectors,cos_sim,top_words
0,"[0.03208522, 0.11183627, 0.08850198, 0.0559627...",nie uzny,odwrócenie,"[(0.06592008, 0.2236279, 0.17347991, 0.1114765...","[0.9999854, 0.9999854]","[nie, nie]"
1,"[0.015718201, 0.046250414, 0.03262996, 0.02251...",wiecej polski,wzmocnienie,"[(0.012964179, 0.04487863, 0.033291247, 0.0244...","[0.99983716, 0.9998052]","[polski mord, weć z]"
2,"[0.03477292, 0.11591163, 0.09144242, 0.0584718...",nie decydować,odwrócenie,"[(0.06592008, 0.2236279, 0.17347991, 0.1114765...","[0.9999862, 0.9999862]","[nie, nie]"
3,"[0.063273594, 0.2084809, 0.16154782, 0.1059608...",nie mieć,odwrócenie,"[(0.067248136, 0.2147036, 0.16824406, 0.109277...","[0.99999034, 0.99999017]","[nie być, a nie]"
4,"[0.033696067, 0.11345251, 0.08600411, 0.056021...",nie obudzić,odwrócenie,"[(0.06592008, 0.2236279, 0.17347991, 0.1114765...","[0.99998665, 0.99998665]","[nie, nie]"
...,...,...,...,...,...,...
598,"[0.0347093, 0.11659486, 0.08906367, 0.05726533...",nie pimp,odwrócenie,"[(0.06592008, 0.2236279, 0.17347991, 0.1114765...","[0.99998665, 0.99998665]","[nie, nie]"
599,"[0.006372989, 0.010344596, 0.009473213, 0.0035...",zwyc ciota,wzmocnienie,"[(0.020286871, 0.064928256, 0.04803827, 0.0326...","[0.99519885, 0.9951795]","[chyba wybierzemy, chyba jakiś]"
600,"[0.003098149, 0.007902853, 0.0052166553, 0.001...",dziwka,wzmocnienie,"[(0.011763301, 0.030984126, 0.02600205, 0.0149...","[0.9902191, 0.99014634]","[zbyt mocno, no blagać]"
601,"[0.034013852, 0.112634905, 0.08568529, 0.05553...",nie zacząleś,odwrócenie,"[(0.06592008, 0.2236279, 0.17347991, 0.1114765...","[0.9999853, 0.9999853]","[nie, nie]"
