## Tagging using Stanford NER

In [2]:
import os
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize


# Build the Stanford NER tagger, but only when the PATH_TO_CLASSIFIER
# environment variable points at a classifier model; otherwise `st` is
# left undefined.
if 'PATH_TO_CLASSIFIER' in os.environ:
    st = StanfordNERTagger(os.environ['PATH_TO_CLASSIFIER'])

In [5]:
# Sample sentence used by every tagger/extractor below; it contains a
# person name, an organization, a phone number and a location.
text = (
    'Hi, My name is Hassan Mehmood and I work in NASA. '
    'My office number is +18889107865. '
    'I live in California'
)

In [None]:
# Tagging names, organizations etc

# Split the sample sentence into word tokens for the tagger.
tokenized_text = word_tokenize(text)

# `st` is only created by the loading cell when PATH_TO_CLASSIFIER is set,
# so check for it instead of failing with a NameError on a fresh kernel.
if 'st' in globals():
    classified_text = st.tag(tokenized_text)
    print(classified_text)
else:
    print('Stanford NER tagger is not loaded: set PATH_TO_CLASSIFIER '
          'and re-run the loading cell.')

In [None]:
# Note: the Stanford tagger output above does not recognize the full name

## Tagging using SpaCy

In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [3]:
# https://spacy.io/models/en#en_core_web_sm

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint

# Load the en_core_web_sm pipeline from its installed package
# (see the model page linked at the top of this cell).
nlp = en_core_web_sm.load()

In [6]:
# Run the spaCy pipeline on the sample text, then show each detected
# entity as a (surface text, label) pair.
doc = nlp(text)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Hassan Mehmood', 'PERSON'), ('NASA', 'ORG'), ('California', 'GPE')]


## Extracting Emails, Phones & Addresses

In [7]:
from commonregex import CommonRegex
# Create the parser without any text; the cells below pass the text to
# each extractor method explicitly.
parser = CommonRegex()

In [8]:
# Extract e-mail addresses from the sample text (the output below is empty).
parser.emails(text)

[]

In [9]:
# Extract phone numbers from the sample text.
parser.phones(text)

['+18889107865']

In [10]:
# Extract street addresses from the sample text (the output below is empty).
parser.street_addresses(text)

[]

## Identifying different types of numbers, such as credit cards, PINs, and phone numbers

In [11]:
import re
# Regular expressions for common numeric identifiers.
# Patterns wrapped in ^ ... $ must match the whole string; the unanchored
# ones can be used with re.findall to pull matches out of longer text.

# SSN-style number: 3 digits, 2 digits, 4 digits separated by hyphens.
# Anchored variant: ^\d{3}-\d{2}-\d{4}$ --- ^ & $ represent start and end.
ssn_pattern = r'\d{3}-\d{2}-\d{4}'

# Postal code: five digits, optionally followed by a hyphen and four digits.
postal_code_pattern = r'(\d{5}(?:\-\d{4})?)'

# Phone number: optional leading "1", a three-digit area code with or
# without parentheses, then 3 + 4 digits, each group optionally separated
# by whitespace or a hyphen.
# The middle group used to be `[\0-9]{3}`; `\0` is the NUL escape, so that
# class covered every character from \x00 to '9' (spaces, punctuation, ...)
# instead of digits only.
phone_number_pattern = r'^(1\s?)?((\([0-9]{3}\))|[0-9]{3})[\s\-]?[0-9]{3}[\s\-]?[0-9]{4}$'

# PIN: exactly three digits and nothing else.
pin_pattern = r'^\d{3}$'

In [12]:
# Find every SSN-style number in a string that holds two of them.
re.compile(ssn_pattern).findall('333-22-4444 | 123-45-6789')

['333-22-4444', '123-45-6789']