# Wikipedia

In [None]:
import gradio as gr
from transformers import pipeline
import pandas as pd
import altair as alt

# Token-classification (NER) pipeline shared by every request. No model is
# named, so the library's default checkpoint for the "ner" task is loaded.
# aggregation_strategy="simple" merges word pieces into whole entities; it is
# the documented replacement for the deprecated grouped_entities=True flag.
ner = pipeline("ner", aggregation_strategy="simple")

def analyze_text(text):
    """Run NER on *text* and chart how often each entity was detected.

    Args:
        text: Raw input string (supplied by the Gradio text box).

    Returns:
        An Altair chart with one circle per (word, entity group) pair. The
        circle's x position and size both encode how many times that pair
        was detected. The chart has no marks when nothing is detected.
    """
    # Blank input cannot contain entities, so skip the model call entirely.
    entities = ner(text) if text and text.strip() else []

    # Naming the columns keeps the frame well-formed when the model returns
    # nothing; a bare DataFrame([]) has no 'word' column and would raise
    # KeyError on lookup.
    detected = pd.DataFrame(entities, columns=['word', 'entity_group'])
    word_entity_df = detected.rename(columns={'word': 'Word', 'entity_group': 'Entity'})

    # One row per distinct (word, entity group) pair with its occurrence count.
    word_freq = word_entity_df.groupby(['Word', 'Entity']).size().reset_index(name='Frequency')

    # Fixed colour per entity group so the legend is stable across inputs.
    color_scale = alt.Scale(
        domain=['ORG', 'LOC', 'MISC', 'PER'],
        range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
    )

    # Encoding types are spelled out (:Q / :N) so the chart does not depend on
    # dtype inference, which has nothing to go on when the frame is empty.
    scatter = alt.Chart(word_freq).mark_circle(opacity=0.7, stroke='black', strokeWidth=0.5).encode(
        x=alt.X('Frequency:Q', scale=alt.Scale(padding=1), axis=alt.Axis(tickCount=5)),
        y=alt.Y('Word:N', sort='-x'),
        size=alt.Size('Frequency:Q', scale=alt.Scale(range=[200, 1500]), title='Frequency'),
        color=alt.Color('Entity:N', scale=color_scale, legend=alt.Legend(title='Entity')),
        tooltip=['Word:N', 'Frequency:Q', 'Entity:N']
    ).properties(
        title='Words and Entities',
        width=1000,
        height=800
    ).configure_view(
        strokeWidth=0,
        # Hex colours need the leading '#'; a bare 'f5f5f5' is not a valid colour.
        fill='#f5f5f5'
    ).configure_title(
        fontSize=25,
        anchor='start'
    ).configure_axis(
        labelColor='grey',
        labelFontSize=12,
        titleFontSize=15
    ).configure_legend(
        labelFontSize=12,
        symbolSize=150,
        symbolStrokeWidth=2,
        labelPadding=10
    ).configure_axisX(
        grid=False
    )

    return scatter

# Expose analyze_text through a minimal web UI: one text box in, one plot out.
demo = gr.Interface(fn=analyze_text, inputs="text", outputs="plot")

# Start serving the interface.
demo.launch()


In [None]:
import gradio as gr
from transformers import pipeline
import pandas as pd
import altair as alt
import re

# Token-classification (NER) pipeline shared by every request. No model is
# named, so the library's default checkpoint for the "ner" task is loaded.
# aggregation_strategy="simple" merges word pieces into whole entities; it is
# the documented replacement for the deprecated grouped_entities=True flag.
ner = pipeline("ner", aggregation_strategy="simple")

def analyze_text(text):
    """Run NER on *text*, tabulate entity frequencies and export a chart.

    German umlauts in the input are transliterated to ASCII digraphs before
    the text is passed to the NER pipeline, so the reported words contain the
    digraphs rather than the original characters.

    Args:
        text: Raw input string (supplied by the Gradio text box).

    Returns:
        A ``(table, path)`` tuple. ``table`` is a DataFrame with columns
        ``Word``, ``Entity`` and ``Frequency``, sorted by ``Frequency`` in
        descending order. ``path`` is the name of the HTML file the chart was
        written to (``scatter_plot.html`` in the working directory, overwritten
        on every call).
    """
    umlaute_dict = {'Ä': 'AE', 'Ö': 'OE', 'Ü': 'UE', 'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}
    # Apply every mapping in a single pass so all umlauts are transliterated,
    # not just the last one in the table.
    text_clean = (text or '').translate(str.maketrans(umlaute_dict))

    # Blank input cannot contain entities, so skip the model call entirely.
    entities = ner(text_clean) if text_clean.strip() else []

    # Naming the columns keeps the frame well-formed when the model returns
    # nothing; a bare DataFrame([]) has no 'word' column and would raise
    # KeyError on lookup.
    detected = pd.DataFrame(entities, columns=['word', 'entity_group'])
    word_entity_df = detected.rename(columns={'word': 'Word', 'entity_group': 'Entity'})

    # One row per distinct (word, entity group) pair with its occurrence count.
    word_freq = word_entity_df.groupby(['Word', 'Entity']).size().reset_index(name='Frequency')

    # Fixed colour per entity group so the legend is stable across inputs.
    color_scale = alt.Scale(
        domain=['ORG', 'LOC', 'MISC', 'PER'],
        range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'],
    )

    # Encoding types are spelled out (:Q / :N) so the chart does not depend on
    # dtype inference, which has nothing to go on when the frame is empty.
    scatter = alt.Chart(word_freq).mark_circle(opacity=0.7, stroke='black', strokeWidth=0.5).encode(
        x=alt.X('Frequency:Q', scale=alt.Scale(padding=1), axis=alt.Axis(tickCount=5)),
        y=alt.Y('Word:N', sort='-x'),
        size=alt.Size('Frequency:Q', scale=alt.Scale(range=[200, 1500]), title='Frequency'),
        color=alt.Color('Entity:N', scale=color_scale, legend=alt.Legend(title='Entity')),
        tooltip=['Word:N', 'Frequency:Q', 'Entity:N']
    ).properties(
        title='Words and Entities',
        width=600,
        height=1000
    ).configure_view(
        strokeWidth=0,
        # Hex colours need the leading '#'; a bare 'f5f5f5' is not a valid colour.
        fill='#f5f5f5'
    ).configure_title(
        fontSize=25,
        anchor='start'
    ).configure_axis(
        labelColor='grey',
        labelFontSize=12,
        titleFontSize=15
    ).configure_legend(
        labelFontSize=12,
        symbolSize=150,
        symbolStrokeWidth=2,
        labelPadding=10
    ).configure_axisX(
        grid=False
    )

    # The chart is handed to the UI as a downloadable standalone HTML file.
    scatter.save('scatter_plot.html')

    # Most frequent entities first so the table is easier to scan in the UI.
    sorted_word_freq = word_freq.sort_values(by='Frequency', ascending=False)

    return sorted_word_freq, "scatter_plot.html"



# Expose analyze_text through a web UI: one text box in; a frequency table
# and a downloadable chart file out.
demo = gr.Interface(
    fn=analyze_text,
    inputs="text",
    outputs=["dataframe", "file"],
    title="Named Entity Recognition",
)

# Start serving the interface.
demo.launch()

