# Check Sensitive Information (e.g. Personally Identifiable Information, PII)
The goal of this Jupyter notebook is to provide several functions that identify and extract "sensitive information" from an input text.

## Table of Contents
* [General Sensitive Information](#general-pii) (based on regular expression--RegEx)
  - Email address
  - Phone number
  - IPv4 address
  - URL http/https
  - Bitcoin wallet
  - Money amount
  - Cryptocurrency amount
  - Credit card number
  - Passport number
  - IBAN (bank) number
* [Specific Sensitive Information](#specific-pii) (based on a list of substrings)
* [Country Names](#countries)
* [Dutch Cities](#dutch-cities)
* [Dutch Names](#dutch-names)
* [Typos](#typos) 
* [Defining check_everything()](#check-everything)
* [Testing check_everything()](#testing-everything)
* [Flask Application](#flask)


## General Sensitive Information (based on regex)  <a class="anchor" id="general-pii"></a>

In [1]:
# Regular expressions used by identify_pii(), keyed by the label reported for each match.
# None of the patterns relies on capture groups: the whole match is the finding.
#
# Known limitations, both visible in the sample run further down in this notebook:
#   * "phone_number" can match a digit run inside a longer token (e.g. inside an IBAN);
#   * "passport" matches any 6-20 character run of capitals/digits/'<', so postcodes,
#     phone numbers and upper-case words are reported as well.
pii_regexes = {
    "bank_iban":r'\b[A-Z]{2}\s*\d{2}\s*(?:\w{4}\s*){2,7}\w{1,4}\b',
    "email_address": r"[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+[a-zA-Z0-9]", 
    "phone_number":r"(?:\+?\d{1,2}\s?\(?\d{3}\)?(?:[-.\s]?\d{3}){2})",
    # Four octets in 0-255; the look-arounds reject slices of longer dotted/digit runs
    # (e.g. "1.2.3.4.5") while still allowing a sentence-ending full stop after the address.
    "ipv4":r'(?<![\d.])(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)(?!\.?\d)',
    # "https?" already covers both schemes; the former second alternative also matched "htt://".
    "url_http":r'https?://\S+',
    # Word boundaries instead of ^...$ so an address is found anywhere in the text, and a
    # non-capturing group so re.findall yields strings rather than tuples.
    "bitcoin_wallet": r'\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b',
    "money_amount": r"(?:[\d.,]+)\s*(?:[$€£¥₣₽₩¢₹%\₿])|(?:[$€£¥₣₽₩¢₹%\₿])\s*(?:[\d.,]+)",
    "cryptocurrency_amount":r"(BTC\s*\d+(?:\.\d+)?|ETH\s*\d+(?:\.\d+)?|LTC\s*\d+(?:\.\d+)?|XRP\s*\d+(?:\.\d+)?)",
    "creditcard": r"\b(?:\d[ -]*?){13,16}\b",
    "passport": r'\b[A-Z0-9<]{6,20}\b'
}

In [2]:
import re

def identify_pii(text):
    """Scan ``text`` with every pattern in ``pii_regexes``.

    Args:
        text: The string to inspect.

    Returns:
        dict mapping a pattern's label to the list of full matched strings, in order
        of appearance. Labels whose pattern did not match are omitted.
    """
    pii_found = {}

    for pii_type, regex in pii_regexes.items():
        # group(0) is always the complete match. re.findall would instead return the
        # captured group(s) - a tuple when a pattern has several groups - which the
        # downstream str.replace() calls cannot handle.
        matches = [match.group(0) for match in re.finditer(regex, text)]
        if matches:
            pii_found[pii_type] = matches

    return pii_found

## Specific Sensitive Information (based on a list of substrings) <a class="anchor" id="specific-pii"></a>

In [3]:
import re

def words_with_substrings(text, substrings=None):
    """Return the whitespace-separated words of ``text`` that contain any substring.

    Matching is case-insensitive and literal: regex metacharacters in a substring
    (e.g. "c++") are matched as written.

    Args:
        text: The string to inspect.
        substrings: Iterable of strings to look for. ``None``, an empty iterable, and
            blank entries (such as the ``['']`` produced by ``''.split(';')``) select
            nothing.

    Returns:
        list of unique matching words with trailing full stops removed, in order of
        first appearance.
    """
    # Drop blank entries: an empty pattern would match every single word.
    needles = [s.strip() for s in (substrings or []) if isinstance(s, str) and s.strip()]
    if not needles:
        return []

    needle_pattern = re.compile('|'.join(map(re.escape, needles)), re.IGNORECASE)

    # A dict keeps insertion order, giving a deterministic, de-duplicated result.
    matching_words = {}
    for word in text.split():
        if needle_pattern.search(word):
            cleaned = word.rstrip('.')
            if cleaned:
                matching_words.setdefault(cleaned, None)
    return list(matching_words)


## Country Names <a class="anchor" id="countries"></a>

In [4]:
import re
import pandas as pd

from functools import lru_cache


@lru_cache(maxsize=1)
def _country_names_pattern():
    """Download the GeoNames country table once and compile its names into one regex.

    The result is cached for the lifetime of the kernel (a failed download is not
    cached, so the next call retries). Call ``_country_names_pattern.cache_clear()``
    to force a refresh. Returns ``None`` when the table yields no usable names.
    """
    countries_table = pd.read_html('https://www.geonames.org/countries/')[1]
    # Keep only non-empty strings: re.escape() raises on missing (non-string) cells.
    names = {
        name.strip()
        for name in countries_table['Country']
        if isinstance(name, str) and name.strip()
    }
    if not names:
        return None

    # Longest names first, so "Guinea-Bissau" is tried before its prefix "Guinea".
    longest_first = sorted(names, key=lambda name: (-len(name), name))
    all_names_pattern = r'\b(?:{})\b'.format('|'.join(map(re.escape, longest_first)))
    return re.compile(all_names_pattern, re.IGNORECASE)


def find_countries_in_text(text):
    """Return every country name found in ``text`` (case-insensitive, whole words).

    Args:
        text: The string to inspect.

    Returns:
        list of the matched substrings exactly as they appear in ``text``, in order of
        appearance; empty when nothing matches or no country names could be loaded.
    """
    pattern = _country_names_pattern()
    if pattern is None:
        return []
    return pattern.findall(text)

## Dutch names (100.000) <a class="anchor" id="dutch-names"></a>

In [5]:
# !pip install faker
from faker import Faker
import re

from functools import lru_cache


@lru_cache(maxsize=None)
def _dutch_first_names_pattern(sample_size, seed):
    """Sample Dutch first names from Faker once and compile them into one regex.

    Cached per ``(sample_size, seed)`` so the sampling and the regex compilation are
    not repeated on every lookup. Returns ``None`` when no names were drawn.
    """
    fake = Faker('nl_NL')
    # Seed this generator only (not Faker globally) so the sampled set is reproducible.
    fake.seed_instance(seed)

    # A set removes the many duplicates that repeated random draws produce.
    dutch_names = {fake.first_name() for _ in range(sample_size)}
    if not dutch_names:
        return None

    # Escape each name and try longer names first so a name is never cut short by a
    # shorter alternative that is its prefix.
    longest_first = sorted(dutch_names, key=lambda name: (-len(name), name))
    pattern = r'\b(' + '|'.join(map(re.escape, longest_first)) + r')\b'
    return re.compile(pattern, re.IGNORECASE)


def find_dutch_names_in_text(text, sample_size=100000, seed=0):
    """Return every Dutch first name found in ``text`` (case-insensitive, whole words).

    The candidate names are random draws of ``Faker('nl_NL').first_name()``.

    Args:
        text: The string to inspect.
        sample_size: Number of random draws used to build the candidate name set.
        seed: Seed for the Faker instance; a fixed value gives the same candidate set
            on every run.

    Returns:
        list of the matched substrings exactly as they appear in ``text``, in order of
        appearance.
    """
    pattern = _dutch_first_names_pattern(sample_size, seed)
    if pattern is None:
        return []
    return pattern.findall(text)

## Dutch cities <a class="anchor" id="dutch-cities"></a>

In [6]:
import re
import pandas as pd

from functools import lru_cache


@lru_cache(maxsize=None)
def _dutch_cities_pattern(csv_path):
    """Read the city list once per path and compile its names into one regex.

    The CSV is expected to be ';'-separated with a 'Title' column holding the names
    (source: https://datasets.cbs.nl/CSV/CBS/nl/85516NED). Returns ``None`` when the
    column yields no usable names.
    """
    df = pd.read_csv(csv_path, sep=';')
    # Keep only non-empty strings: re.escape() raises on missing (non-string) cells.
    cities = {
        city.strip()
        for city in df['Title']
        if isinstance(city, str) and city.strip()
    }
    if not cities:
        return None

    # Escape special characters and try longer names first, so "Bergen op Zoom" is
    # tried before its prefix "Bergen".
    longest_first = sorted(cities, key=lambda city: (-len(city), city))
    pattern = r'\b(?:' + '|'.join(map(re.escape, longest_first)) + r')\b'
    return re.compile(pattern, re.IGNORECASE)


def find_dutch_cities_in_text(text, csv_path='data/WoonplaatsenCodes.csv'):
    """Return every Dutch place name found in ``text`` (case-insensitive, whole words).

    Args:
        text: The string to inspect.
        csv_path: Location of the ';'-separated place-name CSV with a 'Title' column.

    Returns:
        list of the matched substrings exactly as they appear in ``text``, in order of
        appearance.
    """
    pattern = _dutch_cities_pattern(csv_path)
    if pattern is None:
        return []
    return pattern.findall(text)

## Dutch postcode

In [7]:
def find_dutch_postcodes(text):
    """Collect the postcode-shaped tokens in ``text``.

    A token qualifies when it is four digits, at most one whitespace character, and
    two capital letters, with a word boundary on either side. Matches are returned in
    order of appearance.
    """
    postcode_regex = re.compile(r"\b\d{4}\s?[A-Z]{2}\b")
    return [hit.group() for hit in postcode_regex.finditer(text)]

## Typos <a class="anchor" id="typos"></a>

In [8]:
# !pip install pyspellchecker
# from spellchecker import SpellChecker

# def find_typos(text):
#     spell = SpellChecker()
#     words = text.split()

#     misspelled = [word for word in words if not word[0].isdigit() and not spell.known([word.lower()])]
    
#     return misspelled

# MERGING EVERYTHING <a class="anchor" id="check-everything"></a>

In [9]:
def check_everything(text, substrings):
    """Run every detector of this notebook on ``text`` and merge the findings.

    The result starts with whatever identify_pii() reports and then always carries
    the keys 'sensitive_string', 'dutch_name', 'country', 'dutch_city' and
    'dutch_postcode', in that order, each holding that detector's list of matches.
    """
    return {
        **identify_pii(text),
        'sensitive_string': words_with_substrings(text, substrings),
        'dutch_name': find_dutch_names_in_text(text),
        'country': find_countries_in_text(text),
        'dutch_city': find_dutch_cities_in_text(text),
        'dutch_postcode': find_dutch_postcodes(text),
        # 'typos': find_typos(text),
    }

### Testing check_everything() <a class="anchor" id="testing-everything"></a>

In [10]:
# Smoke test: a sample text mixing several kinds of sensitive data, plus one custom
# substring. The bare call on the last line makes the cell display the result dict.
text = "My email is jairsantanna@gmail.com. My passport number is ABC12345XYZ. I live at the happystraat 5, Utrecht, 7514AB Netherlands. Kim Cardoso and Sjoerd Santanna are working with me at NorthWave. My IP address is 127.0.0.1 and 192.168.0.1. Could you please send me € 50? My IBAN is GB29NWBK60161331926819. My credit card is 1234-5678-9012-3456. My phone number is 0642330000 and +31682500000. PLEASE SEND ME ETH 1."
substrings = ['northwave']

check_everything (text,substrings)

{'bank_iban': ['GB29NWBK60161331926819'],
 'email_address': ['jairsantanna@gmail.com'],
 'phone_number': ['60161331926', '0642330000', '+31682500000'],
 'ipv4': ['127.0.0.1', '192.168.0.1'],
 'money_amount': ['€ 50'],
 'cryptocurrency_amount': ['ETH 1'],
 'creditcard': ['1234-5678-9012-3456'],
 'passport': ['ABC12345XYZ', '7514AB', '0642330000', '31682500000', 'PLEASE'],
 'sensitive_string': ['NorthWave'],
 'dutch_name': ['Kim', 'Sjoerd'],
 'country': ['Netherlands'],
 'dutch_city': ['Utrecht'],
 'dutch_postcode': ['7514AB']}

# Flask application <a class="anchor" id="flask"></a>

In [11]:
import csv
from flask import Flask, render_template, request

# Flask application object; the route below is registered on it via @app.route.
app = Flask(__name__)

def replace_highlighted_words(text, highlighted_words):
    """Return ``text`` with every detected word replaced by its upper-cased category.

    All replacements happen in a single pass over the original text, so a label that
    has just been inserted (e.g. "EMAIL_ADDRESS") is never rewritten by a later word.

    Args:
        text: The original string.
        highlighted_words: dict mapping a category name to the list of strings found
            for it. Empty strings and non-string entries are ignored.

    Returns:
        The anonymised string. A word listed under several categories gets the label
        of the first category that lists it; where candidate words overlap at the same
        position, the longest one is replaced.
    """
    label_by_word = {}
    for word_type, words in highlighted_words.items():
        for word in words:
            # str.replace('', ...) would insert the label between every character,
            # and a non-string match cannot be searched for at all.
            if isinstance(word, str) and word:
                label_by_word.setdefault(word, word_type.upper())

    if not label_by_word:
        return text

    longest_first = sorted(label_by_word, key=len, reverse=True)
    pattern = re.compile('|'.join(map(re.escape, longest_first)))
    return pattern.sub(lambda match: label_by_word[match.group(0)], text)


def _highlight_matches(text, highlighted_words):
    """Return ``text`` as HTML with every detected word wrapped in a coloured <span>.

    Words found under 'dutch_name', 'dutch_city', 'dutch_postcode' or 'country' get a
    magenta background, all other categories red. When a word is listed under several
    categories, the last category decides the colour.

    The text is processed in one pass: the user-supplied text (inside and outside the
    spans) is HTML-escaped, and the inserted markup itself is never searched again, so
    spans are neither nested nor corrupted by words such as "span" or "red".
    """
    import html  # local import: the import lines of this cell do not provide html

    magenta_types = ('dutch_name', 'dutch_city', 'dutch_postcode', 'country')

    colour_by_word = {}
    for word_type, words in highlighted_words.items():
        colour = 'magenta' if word_type in magenta_types else 'red'
        for word in words:
            # Empty or non-string matches cannot be located in the text.
            if isinstance(word, str) and word:
                colour_by_word[word] = colour

    if not colour_by_word:
        return html.escape(text)

    # Longest words first, so a word is never cut short by one of its prefixes.
    longest_first = sorted(colour_by_word, key=len, reverse=True)
    pattern = re.compile('|'.join(map(re.escape, longest_first)))

    parts = []
    cursor = 0
    for match in pattern.finditer(text):
        word = match.group(0)
        parts.append(html.escape(text[cursor:match.start()]))
        parts.append(
            f'<span style="background-color: {colour_by_word[word]};">{html.escape(word)}</span>'
        )
        cursor = match.end()
    parts.append(html.escape(text[cursor:]))
    return ''.join(parts)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the form (GET) or analyse the submitted text (POST).

    On POST the form fields 'text' and 'sensitive_words' (a ';'-separated list) are
    read, the text is analysed with check_everything(), and 'index.html' is rendered
    with the original text, an HTML-highlighted version, an anonymised version and the
    raw sensitive-words field.
    """
    if request.method != 'POST':
        return render_template('index.html')

    text = request.form['text']
    raw_sensitive_words = request.form['sensitive_words']
    # Drop blank entries: ''.split(';') yields [''], which must not become a pattern.
    sensitive_words = [part.strip() for part in raw_sensitive_words.split(';') if part.strip()]

    highlighted_words = check_everything(text, sensitive_words)
    modified_text = replace_highlighted_words(text, highlighted_words)
    # NOTE(review): highlighted_text carries <span> markup, so index.html presumably
    # renders it unescaped (e.g. with |safe) - confirm against the template.
    highlighted_text = _highlight_matches(text, highlighted_words)

    return render_template(
        'index.html',
        text=text,
        highlighted_text=highlighted_text,
        modified_text=modified_text,
        sensitive_words=raw_sensitive_words  # Pass sensitive words back to the template
    )


# Start Flask's built-in server on port 5010 when this code runs as the main module.
# The call keeps running until the server is interrupted (CTRL+C).
if __name__ == '__main__':
    app.run(port=5010)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5010
Press CTRL+C to quit
127.0.0.1 - - [14/Jul/2023 14:30:33] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2023 14:30:33] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [14/Jul/2023 14:30:46] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2023 14:30:46] "GET /static/copy-icon.png HTTP/1.1" 200 -
127.0.0.1 - - [14/Jul/2023 14:31:19] "GET / HTTP/1.1" 200 -


# TEST: 

# My email is jairsantanna@gmail.com. My passport number is ABC12345XYZ. I live at the happystraat 5, Utrecht, 7514AB Netherlands. Kim Cardoso and Sjoerd Santanna are working with me at NorthWave. My IP address is 127.0.0.1 and 192.168.0.1. Could you please send me € 50? My IBAN is GB29NWBK60161331926819. My credit card is 1234-5678-9012-3456. My phone number is 0642330000 and +31682500000. Could you please send me ETH 1?