# Wikipedia

In [1]:
import pandas as pd
import requests
import re
from transformers import pipeline
import gradio as gr


ner = pipeline("ner", grouped_entities=True)

# URL
url = "https://raw.githubusercontent.com/laurenzbrahner/BigDataScenarios/main/data/NER_text_Wikipedia_crawl.txt"

# Load the text
response = requests.get(url)
text = response.text

# Delete the brackets
pattern = r'\[.*?\]'
text = re.sub(pattern, '', text)

# split all sentences in a list
sentences_list = text.split(". ")


def ner_wikipedia(sentence, entities_to_display):
  # use transformer model
  data = ner(sentence)
  # to dataframe
  data = pd.DataFrame(data)
  # Filter entities if user selected one
  if entities_to_display:
    data = data[data['entity_group'].isin(entities_to_display)]

  word_list = data['word'].tolist()
  entity_list = data['entity_group'].tolist()

  word_entity_df = pd.DataFrame({'Word': word_list, 'Entity': entity_list})
  word_freq = word_entity_df.groupby(['Word', 'Entity']).size().reset_index(name='Frequency')
    
  # Colors for the Points in the Scatter Plot
  color_scale = alt.Scale(domain=['ORG', 'LOC', 'MISC', 'PER'], range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])

   # Define Scatter plot
  scatter = alt.Chart(word_freq).mark_circle(opacity=0.7, stroke='black', strokeWidth=0.5).encode(
        x=alt.X('Frequency', scale=alt.Scale(padding=1), axis=alt.Axis(tickCount=5)),
        y=alt.Y('Word', sort='-x'),
        size=alt.Size('Frequency', scale=alt.Scale(range=[200, 1500]), title='Frequency'),
        color=alt.Color('Entity', scale=color_scale, legend=alt.Legend(title='Entity')),
        tooltip=['Word', 'Frequency', 'Entity']
    ).properties(
        title='Words and Entities',
        width=500,
        height=600
    ).configure_view(
        strokeWidth=0,
        fill='f5f5f5'
    ).configure_title(
        fontSize=25,
        anchor='start'
    ).configure_axis(
        labelColor='grey',
        labelFontSize=12,
        titleFontSize=15
    ).configure_legend(
        labelFontSize=12,
        symbolSize=150,
        symbolStrokeWidth=2,
        labelPadding=10,
        padding=40
    ).configure_axisX(
        grid=False
    )

  scatter.save('scatter_plot.html')  

  sorted_word_freq = word_freq.sort_values(by='Frequency', ascending=False)

  return sorted_word_freq, scatter, "scatter_plot.html"



checkbox = gr.CheckboxGroup(["LOC", "MISC", "PER", "ORG"], 
                            label="Choose the entities you want to see",
                            info="If you select nothing, every entity will be selected")








dropdown = gr.Dropdown(sentences_list, label='Choose an article you would like to have summarized')


demo = gr.Interface(
    fn=ner_wikipedia,
    inputs=[dropdown, checkbox],
    outputs=["dataframe","plot", "file"],
    title="Named Entity Recognition"
)

demo.launch()





No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


