In [43]:
import re

from definitions import project_folders
import pandas as pd
from text_processing.sentencize import sentencize_dataframe
from text_processing.import_docx import DocumentToDataFrame
import hu_core_news_lg
from tqdm import tqdm
from utils.helpers import get_stopwords

tqdm.pandas()

In [2]:
# Instantiate the project's DocumentToDataFrame helper, pointing it at the 'data' folder.
corpus_generator = DocumentToDataFrame(data_folder='data')

In [3]:
# Build the raw corpus; it is handed to sentencize_dataframe in the next step.
corpus = corpus_generator.generate_corpus()

# Sentencizing and Preprocessing of documents

In [4]:
# Split the corpus into sentences. This is slow: the recorded run took about 11.5 minutes.
# Later cells read the 'sentences' and 'text' columns of the result.
corpus_sented = sentencize_dataframe(corpus)

100%|██████████| 2772/2772 [11:33<00:00,  4.00it/s]


In [5]:
# Load the hu_core_news_lg language pipeline; it is used below for tokenization and lemmatization.
nlp = hu_core_news_lg.load()

In [6]:
# Use only the pipeline's tokenizer component for the token counts below.
tokenizer = nlp.tokenizer

In [7]:
# Count tokens per sentence after removing '.', ',', '?' and '!' and collapsing runs of
# spaces, so punctuation does not inflate the count.
# The patterns are compiled once instead of on every row; the original character class
# carried a stray trailing '|' (an empty alternative that matched nothing useful) and
# unnecessary escapes inside the class.
_punctuation_pattern = re.compile(r'[.,?!]')
_multi_space_pattern = re.compile(r' {2,}')

corpus_sented['sentence_token_count'] = corpus_sented['sentences'].apply(
    lambda x: len(tokenizer(_multi_space_pattern.sub(' ', _punctuation_pattern.sub('', x)))))

In [8]:
# Token count of the whole source text ('text' column). Unlike the sentence-level count,
# punctuation is not stripped before tokenizing here.
corpus_sented['token_count'] = corpus_sented['text'].apply(lambda x: len(tokenizer(x)))

In [9]:
# Persist the sentence-level frame to the project's 'work' folder.
# Note: this snapshot is written before the short-sentence filter in the following cell is applied.
corpus_sented.to_csv(project_folders['work'] / 'corpus_sented.csv')

In [10]:
# Keep only sentences with more than 4 tokens.
has_enough_tokens = corpus_sented['sentence_token_count'] > 4
corpus_sented = corpus_sented[has_enough_tokens]

In [95]:
# corpus_sented = pd.read_csv(project_folders['work'] / 'corpus_sented.csv')

In [11]:
# Sanity check: confirm corpus_sented is still a DataFrame after filtering.
print(type(corpus_sented))

<class 'pandas.core.frame.DataFrame'>


# Classification

In [11]:
# Use the transformers `pipeline` high-level helper to build a text-classification
# pipeline from the "NYTK/sentiment-hts5-hubert-hungarian" model.
# Later cells request 5 labels per sentence and rename LABEL_0..LABEL_4 to '-2'..'2'.
from transformers import pipeline

sentence_classifier = pipeline("text-classification", model="NYTK/sentiment-hts5-hubert-hungarian")

In [12]:
def categorize_labels(text):
    """Score ``text`` with ``sentence_classifier``, requesting the top 5 labels.

    This is best-effort: if the classifier raises, the error type and message
    are printed and ``None`` is returned instead of propagating the exception.

    Args:
        text: The input passed unchanged to ``sentence_classifier``.

    Returns:
        Whatever ``sentence_classifier(text, top_k=5)`` returns, or ``None``
        when the call failed.
    """
    try:
        scores = sentence_classifier(text, top_k=5)
    except Exception as e:
        # Report and carry on so one bad sentence does not abort the whole run.
        print(f"An error occurred: {type(e).__name__} - {e}")
        scores = None
    return scores

In [15]:
# Classify every sentence (slow: about 11 minutes in the recorded run).
label_scores = corpus_sented['sentences'].progress_apply(categorize_labels)

# One row per (sentence, label): explode the per-sentence result lists, expand each
# item into columns, and expose the original row index as an 'index' column.
scores_long = label_scores.explode().apply(pd.Series).reset_index()

# Back to one row per sentence with one score column per label.
scores_wide = scores_long.pivot(index='index', columns='label', values='score')

# Attach the score columns to the sentence frame (aligned on the index).
corpus_sented = pd.concat([corpus_sented, scores_wide], axis=1)

100%|██████████| 4375/4375 [10:54<00:00,  6.69it/s]


In [17]:
# Map the classifier's LABEL_0..LABEL_4 columns onto the '-2'..'2' scale
# (LABEL_0 -> '-2', LABEL_2 -> '0', LABEL_4 -> '2').
label_to_scale = {f'LABEL_{position}': str(position - 2) for position in range(5)}
corpus_sented = corpus_sented.rename(columns=label_to_scale)

In [18]:
#introducing a constant shift compared to the 0 solution to bias towards the norm:
for col in ['-2', '-1', '1', '2']:
    corpus_sented[col] = corpus_sented[col] - 0.3

In [19]:
corpus_sented['most_probable'] = corpus_sented[['-2', '-1', '0', '1', '2']].idxmax(axis=1)

In [20]:
for col in ['-2', '-1', '1', '2']:
    corpus_sented[col] = corpus_sented[col] + 0.3

In [7]:
# Sentences whose most probable label is negative ('-2' or '-1').
# Exact membership replaces the regex substring match: it states the intent directly and
# yields a plain boolean mask even if a label is missing.
negative = corpus_sented[corpus_sented['most_probable'].isin(['-2', '-1'])]

In [8]:
# Sentences whose most probable label is positive ('1' or '2').
# Exact membership replaces the anchored-regex match ('^1|^2'): same rows for the five
# valid labels, without regex semantics and with a plain boolean mask.
positive = corpus_sented[corpus_sented['most_probable'].isin(['1', '2'])]

In [9]:
# Sentences at the extremes of the scale. Exact equality replaces the regex matches
# ('-2' as a substring, '^2' as an anchored pattern) and always yields a boolean mask.
very_negative = corpus_sented[corpus_sented['most_probable'].eq('-2')]
very_positive = corpus_sented[corpus_sented['most_probable'].eq('2')]

In [21]:
# Give every distinct source text an integer id (group number of its 'text' value),
# so sentences can be grouped back by document.
corpus_sented['text_id'] = corpus_sented.groupby('text').ngroup()

In [32]:
# Rebuild each document's 'text' from the sentences still present in the frame, joined
# with single spaces. Sentences removed by the earlier token-count filter are therefore
# no longer part of 'text'.
corpus_sented['text'] = corpus_sented.groupby('text_id')['sentences'].transform(' '.join)

In [34]:
# Per-sentence boolean flags derived from the most probable label.
# All four use exact label comparison; previously two of them used regex matching via
# str.contains, which is inconsistent with the other two and returns a non-boolean
# result when a label is missing.
corpus_sented['negative_sentences'] = corpus_sented['most_probable'].isin(['-1', '-2'])
corpus_sented['very_negative_sentences'] = corpus_sented['most_probable'].eq('-2')
corpus_sented['positive_sentences'] = corpus_sented['most_probable'].isin(['1', '2'])
corpus_sented['very_positive_sentences'] = corpus_sented['most_probable'].eq('2')

In [35]:
# Lift each sentence-level flag to the document level: a document "has" a sentiment
# when the group-wise maximum of the flag over its sentences is truthy.
for sentiment in ('negative', 'very_negative', 'positive', 'very_positive'):
    corpus_sented[f'has_{sentiment}'] = (
        corpus_sented.groupby('text_id')[f'{sentiment}_sentences'].transform('max')
    )

In [40]:
# Persist the sentence-level frame with sentiment scores and flags, keeping the index
# as a column in the CSV.
corpus_sented.to_csv('results/sentiment_predicted.csv', index=True)

In [None]:
# corpus_sented = pd.read_csv('results/sentiment_predicted.csv')

In [55]:
# Collapse to one row per document: keep the first row of every text_id and drop the
# columns that only make sense at sentence level.
sentence_level_columns = [
    'sentences',
    'sentence_token_count',
    'negative_sentences',
    'very_negative_sentences',
    'positive_sentences',
    'very_positive_sentences',
]
corpus = corpus_sented.drop_duplicates('text_id').drop(columns=sentence_level_columns)

In [61]:
# Give the document frame a fresh 0..n-1 index; the previous index is kept as an
# 'index' column. Later cells use the index labels to select rows of text_embeddings.
corpus = corpus.reset_index()

# Embeddings of subsets - to be tokenized

In [41]:
from text_processing.embedding import SentenceEmbedder

In [42]:
# Instantiate the project's SentenceEmbedder with its default settings.
embedder = SentenceEmbedder()

In [56]:
# Compute embeddings for the document-level 'text' column. They are computed rather
# than loaded from file, and persisted to embedding_text.npy in the 'work' folder.
# The recorded run took about 26 minutes.
# NOTE(review): later cells index text_embeddings with corpus index labels, which assumes
# the rows come back in corpus order under a 0..n-1 index — confirm.
text_embeddings = embedder.retrieve_embeddings(
    corpus,
    text_varname='text',
    load_from_file=False,
    persist=True,
    embedding_file_path=project_folders['work'] / 'embedding_text.npy')

100%|██████████| 130/130 [26:01<00:00, 12.01s/it]


# Preparing for Topic Modeling - Lemmatization for c-TF-IDF

In [64]:
import hu_core_news_lg

# Load the hu_core_news_lg pipeline for lemmatization. The same load already happens
# near the top of the notebook, so this rebinds `nlp` to a fresh instance.
nlp = hu_core_news_lg.load()

In [65]:
# Run the project stopword file plus the get_stopwords() list through the pipeline as
# one space-joined document, so each stopword can be lemmatized afterwards.
with open('work_files/stop_words.txt', mode='r', encoding='UTF-8') as file:
    raw_stopwords = file.readlines() + get_stopwords()
    stripped_stopwords = [entry.strip() for entry in raw_stopwords]
    document = nlp(' '.join(stripped_stopwords))

In [66]:
#lemmatizing stopwords
interview_stopwords = list({word.lemma_ for word in tqdm(document)})

100%|██████████| 299/299 [00:00<00:00, 315178.91it/s]


In [67]:
# Display the lemmatized stopword list for a visual check.
interview_stopwords

['tényleg',
 'gondol',
 'szeret',
 'néha',
 'köszi',
 'mellett',
 'új',
 'kicsi',
 'mikor',
 'lesz',
 'amúgy',
 'ill',
 'cikk',
 'sem',
 'vagy',
 'ahogy',
 'írtátl',
 'illetve',
 'ok',
 'több',
 'vissza',
 'előtt',
 'számára',
 'én',
 'nehéz',
 'emilyen',
 'miért',
 'vagyis',
 'újra',
 'ne',
 'nincs',
 'fú',
 'szép',
 'míg',
 'van',
 'tök',
 'annyira',
 'kér',
 'elso',
 'ison',
 'aztán',
 'lehet',
 'felír',
 'oket',
 'oda',
 'elég',
 'egyes',
 'szerinte',
 'dolog',
 'ők',
 'ő',
 'rossz',
 'közben',
 'továbbá',
 'miatt',
 'úgy',
 'megoszt',
 'se',
 'mivel',
 'ellen',
 'nagy',
 'néhány',
 's',
 'közül',
 'és',
 'maga',
 'olyan',
 'keres',
 'valid',
 'hogyan',
 'már',
 'fel',
 'ilyen',
 'szét',
 'semmi',
 'majd',
 'bár',
 'igazából',
 'valaki',
 'aki',
 'azért',
 'volna',
 'át',
 'sokkal',
 'oké',
 'teljes',
 'ide',
 'szerint',
 'milyen',
 'nyis',
 'mindig',
 'itt',
 'ma',
 'amíg',
 'te',
 'ért',
 'pozitív',
 'egyszerre',
 'kell',
 'között',
 'nem',
 'ha',
 'azután',
 'köszön',
 'valójába

In [53]:
# Plain Python list of the sentence frame's 'text' values, plus the chunk size used to
# split it into batches in the next cell.
corpus_list = corpus_sented['text'].tolist()
batch_size = 700

In [54]:
# Split corpus_list into consecutive chunks of at most batch_size items.
batch_starts = range(0, len(corpus_list), batch_size)
batches = [corpus_list[start:start + batch_size] for start in batch_starts]

In [71]:
# Lemmatize every document: join the lemma of each token with spaces, then remove the
# space that the join leaves before commas and full stops.
# Both replacements are literal and say so with regex=False. The previous ' \\.' pattern
# relied on the old pandas default regex=True; under pandas >= 2.0 (default regex=False)
# it is treated as a literal backslash sequence and never matches.
corpus['lemmatized'] = (
    corpus['text']
    .progress_apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))
    .str.replace(' ,', ',', regex=False)
    .str.replace(' .', '.', regex=False)
)

  3%|▎         | 67/2066 [00:06<03:28,  9.59it/s]

KeyboardInterrupt



In [187]:
# Persist the document-level frame (including the 'lemmatized' column) without the index.
corpus.to_csv('work_files/corpus_lemmatized.csv', index=False)

# Topic Modeling

In [74]:
from text_processing.topic_modeling import *

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [76]:
# Columns needed for topic modelling: identifiers, both text variants and the
# document-level sentiment flags.
topic_columns = [
    'doc_id', 'id', 'lemmatized', 'text',
    'has_negative', 'has_positive', 'has_very_negative', 'has_very_positive',
]
df_topic = corpus[topic_columns]

In [128]:
# Settings for the dimensionality reducer and the clusterer used on the negative subset.
umap_params = dict(
    n_neighbors=3,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
hdbscan_params = dict(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [129]:
# Inputs for the negative subset: documents flagged has_negative, their lemmatized text,
# and the embedding rows selected by those documents' index labels.
negative_docs = df_topic[df_topic.has_negative]
text_container = TextContainer(corpus_input=negative_docs, text_varname='lemmatized')
embeddings_container = EmbeddingsContainer(embeddings=text_embeddings[negative_docs.index])
param_container = ParamContainer(
    dimensionality_reducer_params=umap_params,
    clusterer_params=hdbscan_params,
    stopwords=interview_stopwords,
)

In [130]:
# Wire the parameters into a ModelContainer and combine it with the text and embedding
# containers into a TopicEstimator.
model_container = ModelContainer(param_container=param_container)
estimator = TopicEstimator(text_container=text_container,
                           embeddings_container=embeddings_container,
                           applied_models=model_container)

In [131]:
# Run the estimation; only the third element of the returned triple (the topic model) is kept.
_, _, topic_model = estimator.estimate_topic_model()

In [133]:
# Topic summary for the negative subset. This must run before `topic_model` is rebound
# by the positive-subset run below.
negative_results = topic_model.get_topic_info()

In [0]:
# Same procedure as for the negative subset, applied to documents flagged has_positive,
# with a smaller minimum cluster size (4).
umap_params = dict(
    n_neighbors=3,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
hdbscan_params = dict(
    min_cluster_size=4,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

positive_docs = df_topic[df_topic.has_positive]
text_container = TextContainer(corpus_input=positive_docs, text_varname='lemmatized')
embeddings_container = EmbeddingsContainer(embeddings=text_embeddings[positive_docs.index])
param_container = ParamContainer(
    dimensionality_reducer_params=umap_params,
    clusterer_params=hdbscan_params,
    stopwords=interview_stopwords,
)

model_container = ModelContainer(param_container=param_container)
estimator = TopicEstimator(
    text_container=text_container,
    embeddings_container=embeddings_container,
    applied_models=model_container,
)

# Only the third element of the returned triple (the topic model) is kept.
_, _, topic_model = estimator.estimate_topic_model()

In [138]:
# Topic summary for the positive subset (`topic_model` now refers to the positive run).
positive_results = topic_model.get_topic_info()

In [140]:
from pandas import ExcelWriter

In [147]:
# Write both topic summaries into one workbook, one sheet per sentiment.
# sheet_name is passed by keyword: pandas deprecates positional arguments to to_excel
# after the writer. Pairing names and frames with zip avoids indexing by position.
sheet_names = ['negative', 'positive']
with ExcelWriter(project_folders['result'] / 'results.xlsx') as writer:
    for sheet_name, topic_info in zip(sheet_names, [negative_results, positive_results]):
        topic_info.to_excel(writer, sheet_name=f'{sheet_name} results')

In [25]:
# Parameter grid for the search: 2 x 3 x 1 reducer settings and 4 clusterer settings.
umap_params_list = [{'n_neighbors': n, 'n_components': c, 'min_dist': d, 'metric': 'cosine', 'random_state': 42}
                    for n in [5, 20]
                    for c in [2, 3, 5]
                    for d in [.1]]
hdbscan_params_list = [
    {'min_cluster_size': m, 'metric': 'euclidean', 'cluster_selection_method': 'eom', 'prediction_data': True}
    for m in [5, 15, 30, 50]]

# Build the negative-subset inputs explicitly. Running the notebook top to bottom, the
# shared `text_container` / `embeddings_container` names hold the POSITIVE subset at this
# point, so reusing them here would write positive results into files named "negative".
# NOTE(review): text_varname='lemmatized' mirrors the earlier negative-subset cell — confirm
# that is the intended text variant for the grid search.
negative_docs = df_topic[df_topic.has_negative]
negative_text_container = TextContainer(corpus_input=negative_docs, text_varname='lemmatized')
negative_embeddings_container = EmbeddingsContainer(embeddings=text_embeddings[negative_docs.index])

iterate_topic_estimator = IterateTopicEstimator(param_container=param_container,
                                                embeddings_container=negative_embeddings_container,
                                                text_container=negative_text_container,
                                                paramgrid_filename=project_folders[
                                                                       'result'] / 'negative_full_param_grid.xlsx',
                                                reduced_embeddings_folder='negative_reduced_embeddings',
                                                dimensionality_reducer_params_to_iterate=umap_params_list,
                                                clusterer_params_to_iterate=hdbscan_params_list)

In [26]:
# Judging by the method names: build the parameter grid, estimate and store a model for
# every combination, then write the collected topic tables to the configured Excel path.
iterate_topic_estimator.set_param_grid()
iterate_topic_estimator.estimate_and_store_over_param_set(optimized=True, persist=True)
save_xls(iterate_topic_estimator.topic_tables, iterate_topic_estimator.paramgrid_result_path)

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:20<01:00, 20.32s/it][A
 50%|█████     | 2/4 [00:31<00:29, 14.94s/it][A
 75%|███████▌  | 3/4 [00:46<00:14, 14.94s/it][A
100%|██████████| 4/4 [01:10<00:00, 17.70s/it][A
 17%|█▋        | 1/6 [01:58<09:53, 118.70s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:15<00:46, 15.62s/it][A
 50%|█████     | 2/4 [00:31<00:31, 15.61s/it][A
 75%|███████▌  | 3/4 [00:49<00:16, 16.83s/it][A
100%|██████████| 4/4 [01:04<00:00, 16.04s/it][A
 33%|███▎      | 2/6 [03:36<07:06, 106.63s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:17<00:53, 17.69s/it][A
 50%|█████     | 2/4 [00:34<00:34, 17.13s/it][A
 75%|███████▌  | 3/4 [00:48<00:15, 15.94s/it][A
100%|██████████| 4/4 [01:01<00:00, 15.27s/it][A
 50%|█████     | 3/6 [05:09<05:00, 100.21s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:12<00:37, 12.52s/it][A
 50%|█████     | 2/4

In [195]:
# Debug inspection: show the vectorizer parameters held by the current param_container.
param_container.vectorizer_params

In [196]:
# Debug scratch object: a ParamContainer built with the raw get_stopwords() list, used in
# the next cell to inspect what it stores.
asd = ParamContainer(stopwords=get_stopwords())

In [198]:
# Debug inspection: show the stopwords stored on the scratch ParamContainer.
asd.stopwords

['azonban',
 'számára',
 'ison',
 'által',
 'őket',
 'éppen',
 'csak',
 'egyéb',
 'amelyet',
 'sok',
 'persze',
 'ilyenkor',
 'egy',
 'valami',
 'több',
 'elég',
 'a',
 'teljes',
 'hiszen',
 'hát',
 'voltak',
 'aztán',
 'inkább',
 'belül',
 'eddig',
 'ekkor',
 'sem',
 'ilyen',
 'közül',
 'mindenki',
 'ellen',
 'elso',
 'cikk',
 'lett',
 'előtt',
 'nekem',
 'stb.',
 'való',
 'ebben',
 'melyek',
 'kívül',
 'pedig',
 'akár',
 'szerint',
 'eloször',
 'amelyeket',
 'amelynek',
 'jó',
 'ugyanis',
 'valamint',
 'tovább',
 'egész',
 'le',
 'hanem',
 'hogyan',
 'ki',
 'lehetett',
 'ma',
 'vagy',
 'mit',
 'mi',
 'o',
 'jól',
 'lenne',
 'ok',
 'nagy',
 'sokat',
 'össze',
 'aki',
 'legyen',
 'minden',
 'amíg',
 'volt',
 'sokkal',
 'ezen',
 'között',
 'vagyis',
 'de',
 'újra',
 'ha',
 'akkor',
 's',
 'magát',
 'ismét',
 'ő',
 'vannak',
 'maga',
 'azt',
 'ehhez',
 'is',
 'mely',
 'valaki',
 'ennek',
 'volna',
 'keressünk',
 'szét',
 'igen',
 'ti',
 'mindent',
 'ezért',
 'túl',
 'ez',
 'ahogy',
 'cik

In [175]:
# Inputs for the positive-subset grid search: the raw 'text' column (not the lemmatized
# one), the matching embedding rows, and the raw get_stopwords() list.
positive_docs = df_topic[df_topic.has_positive]
text_container = TextContainer(corpus_input=positive_docs, text_varname='text')
embeddings_container = EmbeddingsContainer(embeddings=text_embeddings[positive_docs.index])
param_container = ParamContainer(stopwords=get_stopwords())

In [176]:
# Grid-search estimator for the positive subset. It reuses umap_params_list and
# hdbscan_params_list from the negative grid cell and writes to positive_* outputs.
iterate_topic_estimator = IterateTopicEstimator(param_container=param_container,
                                                embeddings_container=embeddings_container,
                                                text_container=text_container,
                                                paramgrid_filename=project_folders[
                                                                       'result'] / 'positive_full_param_grid.xlsx',
                                                reduced_embeddings_folder='positive_reduced_embeddings',
                                                dimensionality_reducer_params_to_iterate=umap_params_list,
                                                clusterer_params_to_iterate=hdbscan_params_list)

In [177]:
# Judging by the method names: build the parameter grid, then estimate and store a model
# for every combination. The Excel export happens in a separate cell below.
iterate_topic_estimator.set_param_grid()
iterate_topic_estimator.estimate_and_store_over_param_set(optimized=True, persist=True)

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:10<00:31, 10.56s/it][A
 50%|█████     | 2/4 [00:12<00:11,  5.67s/it][A
 75%|███████▌  | 3/4 [00:14<00:03,  3.99s/it][A
100%|██████████| 4/4 [00:16<00:00,  4.22s/it][A
 17%|█▋        | 1/6 [00:29<02:25, 29.16s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:02<00:06,  2.31s/it][A
 50%|█████     | 2/4 [00:04<00:04,  2.02s/it][A
 75%|███████▌  | 3/4 [00:05<00:01,  1.91s/it][A
100%|██████████| 4/4 [00:08<00:00,  2.06s/it][A
 33%|███▎      | 2/6 [00:40<01:15, 18.95s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:02<00:06,  2.16s/it][A
 50%|█████     | 2/4 [00:04<00:04,  2.08s/it][A
 75%|███████▌  | 3/4 [00:06<00:02,  2.03s/it][A
100%|██████████| 4/4 [00:08<00:00,  2.00s/it][A
 50%|█████     | 3/6 [00:53<00:48, 16.06s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:02<00:05,  2.00s/it][A
 50%|█████     | 2/4 [0

In [178]:
# Write the topic tables collected by the positive grid search to its configured result path.
save_xls(iterate_topic_estimator.topic_tables, iterate_topic_estimator.paramgrid_result_path)

In [180]:
# Debug: wrap the raw stopword list in a Series so it can be queried with .str methods.
sw = pd.Series(get_stopwords())

In [185]:
# Debug: True if any stopword contains the substring 'és'. This is a substring match,
# not an exact-equality check.
sw.str.contains('és').max()

True

In [188]:
# Debug: inspect the vectorizer attached to the grid search's estimator
# (the recorded output was NoneType).
iterate_topic_estimator.estimator.vectorizer

NoneType

In [95]:
# Debug: display the lemmatized stopword list that is passed to CountVectorizer below.
interview_stopwords

['tud',
 'szóval',
 'ugyanez',
 'szoba',
 'felír',
 'bocsánat',
 'oké',
 'ühüm',
 'aha',
 'ja',
 'nehéz',
 'sajnos',
 'ír',
 'szerinte',
 'tényleg',
 'kicsi',
 'közben',
 'dolog',
 'annyi',
 'amúgy',
 'mond',
 'egyébként',
 'első',
 'hogyha',
 'lesz',
 'tök',
 'köszön',
 'köszön',
 'köszi',
 'ért',
 'ért',
 'ugye',
 'annyi',
 'dolog',
 'se',
 'úgyhogy',
 'ért',
 'vár',
 'tud',
 'kér',
 'ti',
 'fontos',
 'egyszerre',
 'kér',
 'egymás',
 'tud',
 'negatív',
 'pozitív',
 'az',
 'szép',
 'nevetés',
 'sajnos',
 'jó',
 'rossz',
 'ja',
 'mi',
 'igazából',
 'pont',
 'mond',
 'valid',
 'érez',
 'gondol',
 'fú',
 'valójában',
 'akar',
 'szeret',
 'annyi',
 'szóval',
 'hogyha',
 'ért',
 'közben',
 'megoszt',
 'ez',
 'hogyan',
 'milyen',
 'azonban',
 'vagyis',
 'ahol',
 'ide',
 'emilyen',
 'elég',
 'ön',
 'sok',
 'között',
 'hanem',
 'jól',
 'lesz',
 'nagyon',
 'az',
 'jól',
 'egyes',
 'lesz',
 'amely',
 'ez',
 'mi',
 'új',
 'van',
 'ezért',
 'szinte',
 'vele',
 'ellen',
 'ez',
 'legalább',
 'mi',


In [81]:
# Alias for the texts held by the current text_container.
text_instances = text_container.text_instances

In [96]:
# Lemmatize the container's texts: join token lemmas with spaces, then remove the space
# the join leaves before commas and full stops.
# Both replacements are literal and say so with regex=False. The previous ' \\.' pattern
# relied on the old pandas default regex=True; under pandas >= 2.0 it never matches.
lemmatized_instances = (
    text_container.text_instances
    .progress_apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))
    .str.replace(' ,', ',', regex=False)
    .str.replace(' .', '.', regex=False)
)

# Recompute the topic representations of the model at grid position 4 from the lemmatized
# texts, using a CountVectorizer that filters the lemmatized stopwords.
iterate_topic_estimator.topic_models[4].update_topics(
    lemmatized_instances,
    vectorizer_model=CountVectorizer(stop_words=interview_stopwords))

100%|██████████| 731/731 [03:16<00:00,  3.72it/s]


In [97]:
# Show the topic summary of the model at grid position 4 after the representation update.
iterate_topic_estimator.topic_models[4].get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5,-1_covid_unatkoztál_komplett_csaba,"[covid, unatkoztál, komplett, csaba, unatkozik...","[Köszi szépen. Még mondtad, először Csaba, hog..."
1,0,682,0_óra_neki_csinál_tanár,"[óra, neki, csinál, tanár, egyetem, szak, év, ...","[Hát, én talán kicsit tudok, mert én is azt ír..."
2,1,31,1_kommunikáció_vesz_füst_problépma,"[kommunikáció, vesz, füst, problépma, láng, tr...","[Igen, és hogy igazából ezáltal funkcióját ves..."
3,2,13,2_nyis_kiegyenlítetlen_visszáj_menza,"[nyis, kiegyenlítetlen, visszáj, menza, jaj, h...","[Hogy ugyanez a visszájára is tud fordulni?, H..."


In [64]:
# Display the texts held by the current text_container
# (a Series named 'text' in the recorded output).
text_container.text_instances

1       Ha a gondolatot befejezted, akkor újba már ne ...
8       Mármint így az Óbudai Egyetemhez képest nem vo...
23      Hát én sem mondanám, hogy volt, vagy nem monda...
37      Én meg igazából, velem többször megtörtént az,...
40      Hú, ez egy ilyen megfoghatatlan dolog, ami sze...
                              ...                        
2479                                Jaja, rohadjanak meg!
2491    Nekem még egy lenne, hogy "jobb órarend felosz...
2493    Hát én azt írtam, hogy "kevesebb óra, hogy töb...
2498    Mondjuk, hogy így. Nekem van egy ilyen, de ez ...
2499    Köszi szépen. Minden csoporton visszatérő prob...
Name: text, Length: 731, dtype: object