# Using stanza for Named Entity Recognition


## Installation

Run the code cell below to install stanza:

In [3]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

## Import libraries

In [4]:
import stanza
import re
import os

## Creating the pipeline

Download the English language model and build the pipeline (we specify that it should only tokenize the text, separate multiword tokens and perform Named Entity Recognition):


In [5]:
# Download the language model:
stanza.download("en")

# Create the pipeline, specifying the language:
nlp = stanza.Pipeline(lang="en", processors='tokenize,mwt,ner')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


## Cloning to repository


In [6]:
#Clone to FASDH25-portofolio folder
!git clone https://github.com/kulsoom-za/FASDH25-portfolio2.git

Cloning into 'FASDH25-portfolio2'...
remote: Enumerating objects: 4409, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 4409 (delta 9), reused 10 (delta 5), pack-reused 4392 (from 2)[K
Receiving objects: 100% (4409/4409), 17.85 MiB | 16.06 MiB/s, done.
Resolving deltas: 100% (30/30), done.


## Filter only Jan 2024 articles

In [7]:
import os
import re

# Initialize Stanza pipeline
# stanza.download('en')
# nlp = stanza.Pipeline('en', processors='tokenize,ner')

def normalize_place_name(place):
    """Normalize place names using standardized naming conventions"""
    place = place.strip()

    # Remove common prefixes and suffixes
    place = re.sub(r'^the\s+', '', place, flags=re.IGNORECASE)
    place = re.sub(r'[\'’]s', '', place)

    # Standard naming conventions dictionary
    standard_names = {
        # Region normalizations
        'gaza': 'Gaza',  # Catches all Gaza variants

        # Country abbreviations
        'US': 'United States',
        'U.S.': 'United States',
        'USA': 'United States',
        'UK': 'United Kingdom',
        'UAE': 'United Arab Emirates',
        'Britain': 'United Kingdom',

        # Official names to common names
        'State of Israel': 'Israel',
        'Islamic Republic of Iran': 'Iran',
        'Republic of Yemen': 'Yemen',
        'State of Palestine': 'Palestine',

        # Common misspellings
        'Beruit': 'Beirut',
        'Dahiyeb': 'Dahiyeh',
        'Tel Israel': 'Tel Aviv',

        # Sub-region normalizations
        'WestBank': 'West Bank',
        'Gaza Strip': 'Gaza',
        'Gaza City': 'Gaza'
    }

    # Check for Gaza first (special case)
    if re.search(r'gaza', place.lower()):
        return standard_names['gaza']

    # Return standardized name if exists, otherwise original
    return standard_names.get(place, place)

# Initialize places dictionary
places = {}

folder = "/content/FASDH25-portfolio2/articles"

for filename in os.listdir(folder):
    if filename.startswith("2024-01-"):
        path = os.path.join(folder, filename)
        with open(path, encoding="utf-8") as file:
            text = file.read()
        doc = nlp(text)

        for sentence in doc.sentences:
            for ent in sentence.ents:
                if ent.type in ["GPE", "LOC"]:
                    normalized = normalize_place_name(ent.text)
                    if normalized in places:
                        places[normalized] += 1
                    else:
                        places[normalized] = 1

print(places)

{'West Bank': 164, 'Dura': 2, 'Hebron': 10, 'Tulkarem': 2, 'Gaza': 1830, 'Israel': 1632, 'Nablus': 5, 'Red Sea': 249, 'United States': 877, 'United Kingdom': 152, 'Yemen': 189, 'Iran': 210, 'Sanaa': 15, 'Saudi Arabia': 39, 'Aden': 3, 'Tel Aviv': 52, 'Palestine': 125, 'Africa': 29, 'Marib': 3, 'Middle East': 102, 'United Arab Emirates': 21, 'Turkey': 25, 'Jordan': 43, 'Qatar': 65, 'Charleston': 1, 'South Carolina': 4, 'Doha': 19, 'Hong Kong': 2, 'South Africa': 208, 'Lebanon': 178, 'Hague': 39, 'Pretoria': 8, 'Uganda': 12, 'China': 30, 'Russia': 43, 'Kuwait': 2, 'Ukraine': 47, 'Canada': 42, 'Montreal': 1, 'Milton, Ontario': 1, 'Jabalia': 11, 'Ottawa': 3, 'Egypt': 44, 'Rafah': 40, 'Toronto': 1, 'Calgary': 1, 'Afghanistan': 7, 'Austria': 3, 'Australia': 13, 'Finland': 3, 'Germany': 31, 'Italy': 10, 'Japan': 9, 'Netherlands': 14, 'Iceland': 1, 'Sweden': 3, 'Switzerland': 9, 'Romania': 4, 'Washington, DC': 4, 'Jerusalem': 26, 'Gretna': 2, 'Louisiana': 3, 'New Orleans': 5, '@MirandaCleland':

### Storing data in a tsv file


In [8]:
# Define the name and path of the output file
filename = "/content/FASDH25-portfolio2/ner_counts.tsv"

# Open the file in writing mode using UTF-8 encoding
with open(filename, mode="w", encoding="utf-8") as file:
    # Create the header line: column names separated by a tab
    header = "Place\tCount\n"
    file.write(header)

    # Loop through the places dictionary (cleaned place names and their counts)
    for place, count in places.items():
        # Create a row with the place and count separated by a tab
        row = f"{place}\t{count}\n"
        file.write(row)

The file will now be stored in our colab's session environment. You can see it by clicking the folder icon in the left-hand tool bar in colab. Double-click it to view it in colab. Right-click it and choose "Download" to download the file.

To access it in your script, use the path `/content/ner_counts.tsv`

In [9]:
with open("/content/FASDH25-portfolio2/ner_counts.tsv", encoding="utf-8") as file:
  print(file.read())

Place	Count
West Bank	164
Dura	2
Hebron	10
Tulkarem	2
Gaza	1830
Israel	1632
Nablus	5
Red Sea	249
United States	877
United Kingdom	152
Yemen	189
Iran	210
Sanaa	15
Saudi Arabia	39
Aden	3
Tel Aviv	52
Palestine	125
Africa	29
Marib	3
Middle East	102
United Arab Emirates	21
Turkey	25
Jordan	43
Qatar	65
Charleston	1
South Carolina	4
Doha	19
Hong Kong	2
South Africa	208
Lebanon	178
Hague	39
Pretoria	8
Uganda	12
China	30
Russia	43
Kuwait	2
Ukraine	47
Canada	42
Montreal	1
Milton, Ontario	1
Jabalia	11
Ottawa	3
Egypt	44
Rafah	40
Toronto	1
Calgary	1
Afghanistan	7
Austria	3
Australia	13
Finland	3
Germany	31
Italy	10
Japan	9
Netherlands	14
Iceland	1
Sweden	3
Switzerland	9
Romania	4
Washington, DC	4
Jerusalem	26
Gretna	2
Louisiana	3
New Orleans	5
@MirandaCleland	1
East Jerusalem	23
Scotland	2
Ireland	3
Norway	11
#October7	2
Syria	84
Jenin	19
Ramallah	24
Abwein	1
Qalqilya	2
Jericho	1
Bahrain	11
Denmark	3
New Zealand	2
South Korea	1
Spain	7
France	14
Belgium	16
Czech Republic	1
Hungary	1
Berlin	5
Paris	