In [95]:
import gensim
import numpy as np
import pandas as pd
import re
import stopwordsiso as stopwords
import spacy
import json
from sklearn.manifold import TSNE
import plotly.express as px

In [18]:
#! python -m spacy download pl_core_news_sm

Collecting pl-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_sm-3.8.0/pl_core_news_sm-3.8.0-py3-none-any.whl (20.2 MB)
     ---------------------------------------- 0.0/20.2 MB ? eta -:--:--
     ------- -------------------------------- 3.9/20.2 MB 29.4 MB/s eta 0:00:01
     ----------------------- --------------- 12.1/20.2 MB 34.3 MB/s eta 0:00:01
     --------------------------------------  20.2/20.2 MB 36.4 MB/s eta 0:00:01
     --------------------------------------- 20.2/20.2 MB 35.4 MB/s eta 0:00:00
Installing collected packages: pl-core-news-sm
Successfully installed pl-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_sm')


In [7]:
# Load the fragment-level dataset; lines=True reads the file as JSON Lines
# (one JSON object per line).
file_path = 'data/fragments_classification.jsonl'  
data = pd.read_json(file_path, lines=True)

In [8]:
# Show the loaded frame as the cell output.
data

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, odwrócenie]]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, wzmocnienie]]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
595,"@USER Standard, uciekły po ujawniono ich manip...","[[6, 14, osłabienie]]"
596,@USER @USER No ale dwie naraz to nie jest norm...,"[[33, 41, odwrócenie]]"
597,@USER Wciąż przemyka Nowym Światem wartkim kro...,[]
598,@USER @USER mniejsza ? A na euro 12 w ktorym m...,"[[62, 77, wzmocnienie]]"


# Preprocessing

In [78]:
def adjust_labels_to_words(row):
    """Widen every labelled span of a row so it covers whole words.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) where ``row['text']``
            is the text and ``row['label']`` is an iterable of
            ``(start, end, label)`` triples with ``end`` exclusive.

    Returns:
        A list of ``(start, end, label)`` tuples, in input order, where each
        span has been grown to the left while the preceding character is
        alphanumeric and to the right while the character at ``end`` is
        alphanumeric.
    """
    source = row['text']
    spans = row['label']

    def _grow(left, right):
        # Walk outwards until a non-alphanumeric character or a text edge.
        while left > 0 and source[left - 1].isalnum():
            left -= 1
        while right < len(source) and source[right].isalnum():
            right += 1
        return left, right

    widened = []
    for left, right, tag in spans:
        widened.append((*_grow(left, right), tag))
    return widened

In [51]:
# Polish spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("pl_core_news_sm")

# Polish stop-word list, used by preprocess_text when remove_stopwords=True.
polish_stopwords = stopwords.stopwords("pl")

def preprocess_text(text, remove_stopwords=False):
    """Clean a raw text and return the list of its lemmas.

    Cleaning steps, in order: drop ``@mentions``, drop every character that
    is neither a word character nor whitespace, drop digit runs, collapse
    whitespace, lower-case and strip. The result is run through the spaCy
    pipeline and each token's lemma is collected.

    Args:
        text: Raw input string.
        remove_stopwords: When True, lemmas found in ``polish_stopwords``
            are filtered out. Defaults to False (keep everything).

    Returns:
        A list of lemma strings; whitespace-only tokens are never included.
    """
    text = re.sub(r'@\w+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # The removals above leave runs of spaces behind, which spaCy would emit
    # as standalone whitespace tokens; collapse them before tokenising.
    text = re.sub(r'\s+', ' ', text).lower().strip()

    doc = nlp(text)

    # is_space guards against any whitespace token that still slips through.
    lemmas = [token.lemma_ for token in doc if not token.is_space]
    if remove_stopwords:
        lemmas = [lemma for lemma in lemmas if lemma.lower() not in polish_stopwords]
    return lemmas

In [50]:
def get_labels(row):
    """Pair the lemmas of every labelled fragment with its label.

    Args:
        row: Mapping-like object where ``row['text']`` is the text and
            ``row['label']`` is an iterable of ``(start, end, label)`` triples.

    Returns:
        A list of ``(lemmas, label)`` tuples, one per span, where ``lemmas``
        is ``preprocess_text`` applied to ``text[start:end]``.
    """
    source = row['text']
    pairs = []
    for left, right, tag in row['label']:
        fragment = source[left:right]
        pairs.append((preprocess_text(fragment), tag))
    return pairs


In [79]:
# Overwrite the 'label' column with spans widened to whole-word boundaries.
data['label'] = data.apply(adjust_labels_to_words, axis=1)

In [80]:
# Store the preprocess_text output for each full text in a new 'tokens' column.
data['tokens'] = data['text'].apply(preprocess_text)

In [81]:
# Per row: a list of (fragment lemmas, label) pairs built from the spans in 'label'.
data['labels2'] = data.apply(get_labels, axis=1)

# Word2Vec

In [82]:
# Build the Word2Vec training corpus: one lemma list per JSONL record.
with open('data/full_text_classification.jsonl', 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. stray trailing newlines); json.loads would raise on them.
    texts = [preprocess_text(json.loads(line)['text']) for line in file if line.strip()]

# Add the token lists of the fragment dataset to the corpus as well.
texts.extend(data['tokens'])

In [83]:
# Fix the seed so the run is as repeatable as the seeded t-SNE below.
# NOTE: gensim only guarantees fully deterministic training with workers=1
# (and a fixed PYTHONHASHSEED); with several workers the seed still fixes the
# initial weights, but thread scheduling can cause small differences.
w2v = gensim.models.Word2Vec(texts, vector_size=300, window=5, min_count=1, workers=4, seed=42)

In [108]:
# Build one averaged Word2Vec vector per distinct fragment text.
fragment_texts = set()   # fragment strings already processed
fragments = []           # rows of [mean vector, fragment text, label]
for row_labels in data['labels2']:
    for lemmas, sentiment in row_labels:
        joined = ' '.join(lemmas)
        if joined not in fragment_texts:
            # First occurrence of this fragment text: record it and embed it.
            fragment_texts.add(joined)
            known_vectors = [w2v.wv[lemma] for lemma in lemmas if lemma in w2v.wv]
            # Fragments with no in-vocabulary lemma are left out.
            if known_vectors:
                fragments.append([np.mean(known_vectors, axis=0), joined, sentiment])

df_fragments = pd.DataFrame(fragments, columns=['vector', 'text', 'label'])

In [132]:
# Project the fragment vectors to two dimensions with t-SNE.
fragment_matrix = np.vstack(df_fragments['vector'].values)
tsne = TSNE(n_components=2, random_state=42, perplexity=10)
tsne_results = tsne.fit_transform(fragment_matrix)

# 2-D coordinates plus the columns used for colouring and hover text.
tsne_df = pd.DataFrame(tsne_results, columns=['x', 'y']).assign(
    label=df_fragments['label'],
    text=df_fragments['text'],
)

scatter_options = {
    'x': 'x',
    'y': 'y',
    'color': 'label',
    'title': 'Wizualizacja osadzeń Word2Vec przy użyciu t-SNE',
    'labels': {'label': 'Wydźwięk'},
    'hover_name': tsne_df['text'],
    'color_discrete_sequence': px.colors.qualitative.Vivid,
}
fig = px.scatter(tsne_df, **scatter_options)

# Enlarge the markers of the scatter traces.
fig.update_traces(marker={'size': 10}, selector={'mode': 'markers'})

fig.show()