# **NER with sentence picker: demonstrated with a Wikipedia page**

### Required libraries


In [None]:
import re

import altair as alt
import gradio as gr
import pandas as pd
import requests
from transformers import pipeline

### initialize pipeline

In [None]:
# Token-classification pipeline. No model is named here, so transformers
# selects its default checkpoint for the "ner" task.
# aggregation_strategy="simple" groups word pieces into whole entities; it is
# the documented replacement for the deprecated grouped_entities=True flag.
ner = pipeline("ner", aggregation_strategy="simple")

### Read in the text file with the Wikipedia page used for the demo

In [None]:
# Raw GitHub URL of the plain-text file (NER_text_Wikipedia_crawl.txt) that
# serves as the demo input
url = "https://raw.githubusercontent.com/laurenzbrahner/BigDataScenarios/main/data/NER_text_Wikipedia_crawl.txt"


### Preprocess Loaded data

In [None]:
# Load the text. The timeout keeps the notebook from hanging on a dead
# connection, and raise_for_status() stops an HTTP error page from being
# treated as the article text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Delete the bracketed spans (presumably citation markers such as "[1]")
pattern = r'\[.*?\]'
text = re.sub(pattern, '', text)

# Split the text into sentences; strip surrounding whitespace and drop blank
# fragments so they do not show up as empty dropdown choices
sentences_list = [part.strip() for part in text.split(". ") if part.strip()]

### Main input processing function

In [None]:
def ner_wikipedia(sentence, entities_to_display):
  """Run NER on one sentence and summarise the recognised entities.

  Args:
    sentence: Text handed to the module-level ``ner`` pipeline. A falsy value
      (``None`` or ``""``) is treated as "no entities" instead of being sent
      to the model.
    entities_to_display: Entity group labels to keep (e.g. ``["LOC", "PER"]``).
      When empty or ``None`` no filtering is applied.

  Returns:
    A 3-tuple ``(sorted_word_freq, scatter, "scatter_plot.html")``: the
    Word/Entity/Frequency dataframe sorted by descending frequency, the
    Altair chart built from the counts, and the path of the HTML file the
    chart was saved to.
  """
  # Use the transformer model; skip it when no sentence was selected
  entities = ner(sentence) if sentence else []
  # Filter entities if the user selected at least one group
  if entities_to_display:
    entities = [e for e in entities if e['entity_group'] in entities_to_display]

  # Build the frame from explicit columns so that an empty result still has
  # the 'Word' and 'Entity' columns instead of raising a KeyError
  word_entity_df = pd.DataFrame({
      'Word': [e['word'] for e in entities],
      'Entity': [e['entity_group'] for e in entities],
  })
  word_freq = word_entity_df.groupby(['Word', 'Entity']).size().reset_index(name='Frequency')

  # Colors for the points in the scatter plot
  color_scale = alt.Scale(domain=['ORG', 'LOC', 'MISC', 'PER'], range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])

  # Define the scatter plot. Encoding types are spelled out (:Q / :N) so the
  # chart does not rely on type inference when the frame is empty.
  scatter = alt.Chart(word_freq).mark_circle(opacity=0.7, stroke='black', strokeWidth=0.5).encode(
        x=alt.X('Frequency:Q', scale=alt.Scale(padding=1), axis=alt.Axis(tickCount=5)),
        y=alt.Y('Word:N', sort='-x'),
        size=alt.Size('Frequency:Q', scale=alt.Scale(range=[200, 1500]), title='Frequency'),
        color=alt.Color('Entity:N', scale=color_scale, legend=alt.Legend(title='Entity')),
        tooltip=['Word:N', 'Frequency:Q', 'Entity:N']
    ).properties(
        title='Words and Entities',
        width=500,
        height=600
    ).configure_view(
        strokeWidth=0,
        # Hex colors need the leading '#'
        fill='#f5f5f5'
    ).configure_title(
        fontSize=25,
        anchor='start'
    ).configure_axis(
        labelColor='grey',
        labelFontSize=12,
        titleFontSize=15
    ).configure_legend(
        labelFontSize=12,
        symbolSize=150,
        symbolStrokeWidth=2,
        labelPadding=10,
        padding=40
    ).configure_axisX(
        grid=False
    )
  # Save the plot as html
  scatter.save('scatter_plot.html')

  # Sort the dataframe by frequency
  sorted_word_freq = word_freq.sort_values(by='Frequency', ascending=False)
  # Return the dataframe, the scatter plot and the html file
  return sorted_word_freq, scatter, "scatter_plot.html"

## **Definition of the Gradio UI**

### checkbox and dropdown 

In [None]:
# Define the CheckboxGroup used to restrict the output to selected entity groups
checkbox = gr.CheckboxGroup(["LOC", "MISC", "PER", "ORG"],
                            label="Choose the entities you want to see",
                            info="If you select nothing, every entity will be selected")

# Define the Dropdown. Its choices are the individual sentences, and the
# selected one is analysed for named entities, so the label names that task.
dropdown = gr.Dropdown(sentences_list, label='Choose a sentence to run named entity recognition on')

### Instantiate and launch the Interface

In [None]:
# Gradio interface wiring ner_wikipedia to the dropdown and checkbox inputs
# and to a dataframe, a plot and a file output (matching the function's
# three return values)
demo = gr.Interface(
    fn=ner_wikipedia,
    inputs=[dropdown, checkbox],
    outputs=["dataframe","plot", "file"],
    title="Named Entity Recognition"
)

# Start the app
demo.launch()

## **The entire code in one piece:**

In [None]:
import pandas as pd
import requests
import altair as  alt
import re
from transformers import pipeline
import gradio as gr


# Token-classification pipeline. No model is named here, so transformers
# selects its default checkpoint for the "ner" task.
# aggregation_strategy="simple" groups word pieces into whole entities; it is
# the documented replacement for the deprecated grouped_entities=True flag.
ner = pipeline("ner", aggregation_strategy="simple")

# Raw GitHub URL of the plain-text file (NER_text_Wikipedia_crawl.txt) that
# serves as the demo input
url = "https://raw.githubusercontent.com/laurenzbrahner/BigDataScenarios/main/data/NER_text_Wikipedia_crawl.txt"

# Load the text. The timeout keeps the script from hanging on a dead
# connection, and raise_for_status() stops an HTTP error page from being
# treated as the article text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Delete the bracketed spans (presumably citation markers such as "[1]")
pattern = r'\[.*?\]'
text = re.sub(pattern, '', text)

# Split the text into sentences; strip surrounding whitespace and drop blank
# fragments so they do not show up as empty dropdown choices
sentences_list = [part.strip() for part in text.split(". ") if part.strip()]


def ner_wikipedia(sentence, entities_to_display):
  """Run NER on one sentence and summarise the recognised entities.

  Args:
    sentence: Text handed to the module-level ``ner`` pipeline. A falsy value
      (``None`` or ``""``) is treated as "no entities" instead of being sent
      to the model.
    entities_to_display: Entity group labels to keep (e.g. ``["LOC", "PER"]``).
      When empty or ``None`` no filtering is applied.

  Returns:
    A 3-tuple ``(sorted_word_freq, scatter, "scatter_plot.html")``: the
    Word/Entity/Frequency dataframe sorted by descending frequency, the
    Altair chart built from the counts, and the path of the HTML file the
    chart was saved to.
  """
  # Use the transformer model; skip it when no sentence was selected
  entities = ner(sentence) if sentence else []
  # Filter entities if the user selected at least one group
  if entities_to_display:
    entities = [e for e in entities if e['entity_group'] in entities_to_display]

  # Build the frame from explicit columns so that an empty result still has
  # the 'Word' and 'Entity' columns instead of raising a KeyError
  word_entity_df = pd.DataFrame({
      'Word': [e['word'] for e in entities],
      'Entity': [e['entity_group'] for e in entities],
  })
  word_freq = word_entity_df.groupby(['Word', 'Entity']).size().reset_index(name='Frequency')

  # Colors for the points in the scatter plot
  color_scale = alt.Scale(domain=['ORG', 'LOC', 'MISC', 'PER'], range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])

  # Define the scatter plot. Encoding types are spelled out (:Q / :N) so the
  # chart does not rely on type inference when the frame is empty.
  scatter = alt.Chart(word_freq).mark_circle(opacity=0.7, stroke='black', strokeWidth=0.5).encode(
        x=alt.X('Frequency:Q', scale=alt.Scale(padding=1), axis=alt.Axis(tickCount=5)),
        y=alt.Y('Word:N', sort='-x'),
        size=alt.Size('Frequency:Q', scale=alt.Scale(range=[200, 1500]), title='Frequency'),
        color=alt.Color('Entity:N', scale=color_scale, legend=alt.Legend(title='Entity')),
        tooltip=['Word:N', 'Frequency:Q', 'Entity:N']
    ).properties(
        title='Words and Entities',
        width=500,
        height=600
    ).configure_view(
        strokeWidth=0,
        # Hex colors need the leading '#'
        fill='#f5f5f5'
    ).configure_title(
        fontSize=25,
        anchor='start'
    ).configure_axis(
        labelColor='grey',
        labelFontSize=12,
        titleFontSize=15
    ).configure_legend(
        labelFontSize=12,
        symbolSize=150,
        symbolStrokeWidth=2,
        labelPadding=10,
        padding=40
    ).configure_axisX(
        grid=False
    )

  # Save the plot as html
  scatter.save('scatter_plot.html')

  # Sort the dataframe by frequency
  sorted_word_freq = word_freq.sort_values(by='Frequency', ascending=False)

  # Return the dataframe, the scatter plot and the html file
  return sorted_word_freq, scatter, "scatter_plot.html"



# CheckboxGroup used to restrict the output to selected entity groups
checkbox = gr.CheckboxGroup(["LOC", "MISC", "PER", "ORG"],
                            label="Choose the entities you want to see",
                            info="If you select nothing, every entity will be selected")

# Dropdown whose choices are the individual sentences; the selected one is
# analysed for named entities, so the label names that task.
dropdown = gr.Dropdown(sentences_list, label='Choose a sentence to run named entity recognition on')


# Gradio interface wiring ner_wikipedia to the dropdown and checkbox inputs
# and to a dataframe, a plot and a file output (matching the function's
# three return values)
demo = gr.Interface(
    fn=ner_wikipedia,
    inputs=[dropdown, checkbox],
    outputs=["dataframe","plot", "file"],
    title="Named Entity Recognition"
)

# Start the app
demo.launch()



