# In [35]:
from plotly import express as px
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from dash import dcc
import numpy as np
import pandas as pd
import cmasher as cmr




def generate_dash_component(archetype_or_group, fig):
    """
    Wrap a figure in a Dash ``dcc.Graph`` component.

    The component id is ``wordcloud_<archetype_or_group>``; the graph is
    given a fixed height of 1000, with the mode bar hidden and the
    autosizable/responsive config flags switched on.
    """
    component_id = f"wordcloud_{archetype_or_group}"
    graph_style = {"height": 1000}
    graph_config = {
        "displayModeBar": False,
        "autosizable": True,
        "responsive": True,
    }
    return dcc.Graph(
        id=component_id,
        figure=fig,
        style=graph_style,
        config=graph_config,
    )


def generate_wordcloud_fig(wordcloud_image):
    """
    Turn a wordcloud image array into a bare plotly figure.

    The image is rendered with ``px.imshow``; both axes are hidden, all
    margins are set to zero, hover is disabled, and the paper and plot
    backgrounds share the same light-grey colour.
    """
    background = "#F9F9FA"
    no_margin = {'t': 0, 'b': 0, 'l': 0, 'r': 0}

    image_fig = px.imshow(wordcloud_image)
    image_fig.update_layout(
        xaxis={'visible': False},
        yaxis={'visible': False},
        margin=no_margin,
        hovermode=False,
        paper_bgcolor=background,
        plot_bgcolor=background,
    )
    return image_fig


def generate_wordcloud_div(wordcloud_exclusions, input_df, word_col, archetype_or_group):
    """
    Build a wordcloud from a text column, show and save it, and return it
    as a Dash component.

    The words in ``wordcloud_exclusions`` are excluded in addition to the
    ``STOPWORDS`` set shipped with the wordcloud package. The resulting
    figure is displayed with ``fig.show()``, written to the relative path
    ``wordcloud_<archetype_or_group>.svg``, and then wrapped by
    ``generate_dash_component``.

    Args:
        wordcloud_exclusions: Iterable of extra words to leave out of the
            cloud (may be empty or None).
        input_df: DataFrame holding the text to analyse.
        word_col: Name of the column in ``input_df`` whose values are
            joined with spaces to form the wordcloud text. Missing values
            are skipped and other values are converted to ``str``.
        archetype_or_group: Label used in the output filename and in the
            Dash component id; converted with ``str()``.

    Returns:
        The Dash component returned by ``generate_dash_component``.
    """
    import os  # local import: only needed for the font fallback below

    # label is used in both the filename and the component id
    archetype_or_group = str(archetype_or_group)

    # Combine caller-supplied exclusions with the package stopwords. A set
    # union accepts any iterable of words, not only a list.
    excluded_words = set(wordcloud_exclusions or ()) | set(STOPWORDS)

    # Prefer the Baskerville font, but fall back to WordCloud's bundled
    # default (font_path=None) when the file is absent, e.g. on non-Windows
    # machines, instead of failing while rendering.
    font_path = 'C:/Windows/Fonts/BASKVILL.TTF'
    if not os.path.isfile(font_path):
        font_path = None

    # instantiate wordcloud
    wordcloud = WordCloud(
        stopwords=excluded_words,
        min_font_size=8,
        scale=5,
        background_color='#F9F9FA',
        collocations=True,
        # "A-Z", not "A-z": the latter range also matches [ \ ] ^ _ and `
        regexp=r"[a-zA-Z#&]+",
        max_words=30,
        min_word_length=4,
        font_path=font_path,
        collocation_threshold=0,
        colormap=cmr.get_sub_cmap('ocean', 0, 0.7),
    )

    # generate image; drop missing cells and coerce the rest to str so that
    # NaN or numeric entries do not break the join
    wordcloud_text = " ".join(input_df[word_col].dropna().astype(str))
    wordcloud_image = wordcloud.generate(wordcloud_text).to_array()

    fig = generate_wordcloud_fig(wordcloud_image)
    fig.show()
    fig.write_image(f"wordcloud_{archetype_or_group}.svg", scale=1, width=1800, height=800)
    return generate_dash_component(archetype_or_group, fig)


# Sample word/frequency table used to exercise generate_wordcloud_div.
data = {
    'word': [
        'python', 'data', 'analysis', 'visualization', 'machine', 'learning',
        'artificial', 'intelligence', 'programming', 'code', 'algorithm',
        'statistics', 'pandas', 'numpy', 'matplotlib', 'seaborn', 'jupyter',
        'notebook', 'dataset', 'model', 'training', 'prediction', 'feature',
        'regression', 'classification', 'clustering', 'neural', 'network',
        'deep', 'science', 'big', 'mining', 'warehousing', 'engineering',
        'software', 'development', 'framework', 'library', 'api', 'database',
        'sql', 'nosql', 'cloud', 'computing', 'aws', 'azure', 'google',
        'docker', 'kubernetes', 'git', 'version', 'control'
    ],
    'frequency': [
        120, 95, 87, 92, 78, 85, 65, 72, 110, 88, 76,
        69, 82, 79, 71, 63, 58, 67, 74, 89, 83, 77, 68,
        56, 62, 54, 61, 59, 73, 91, 96, 64, 52, 66, 75,
        81, 57, 70, 49, 53, 47, 45, 84, 86, 43, 48, 55,
        41, 39, 46, 44, 42
    ]
}

sample_data = pd.DataFrame(data)

# Do not bind the result to the name "WordCloud": that would replace the
# imported wordcloud.WordCloud class with a Dash component and break every
# later call to generate_wordcloud_div.
sample_wordcloud_component = generate_wordcloud_div([], sample_data, 'word', 'sample_archetype')