#### This notebook is modified from <a href="https://www.kaggle.com/code/leonshangguan/modify-of-pii-detect-study">Modify of PII Detect Study</a>, <a href="https://www.kaggle.com/code/pjmathematician/pii-eda-presidio-baseline">PII EDA Presidio Baseline</a> and <a href="https://www.kaggle.com/code/yunsuxiaozi/pii-detect-study-notebook">PII detect study notebook</a>. 

# Modifications 

Firstly, big thanks to the users who provided the notebooks above. This notebook is merely adding some utility code to make a solid baseline that other users may iterate on, but all the heavy lifting was done by the above notebooks.

I encapsulated the analyzer in a class, and added code to run the analyzer on (potentially) both the training and test set. I also added validation code, so that we can analyze the performance of the analyzer on the training set.

I also added a global configuration for ease of testing, which allows the user to switch between training and inference mode. Additionally, I incorporated the external data that [@moth](https://www.kaggle.com/alejopaullier) kindly provided in his discussion post.

For now, the business logic roughly stays the same as the Modify of PII Detect Study notebook that I used beforehand.

## Resources

* Customizing the presidio analyzer: https://microsoft.github.io/presidio/samples/python/customizing_presidio_analyzer/

# Version History

- v16: Original baseline
- v17: Changed score thresholds for patterns from 0.5 -> 0.8
- v20: fixed bug in evaluation code

# Configuration

In [None]:
class CONFIG:
    """Notebook-wide settings, split into general and analyzer options."""

    # ---- General options ----

    # Global seed.
    seed = 42
    # Number of training samples to use for testing purposes;
    # None means the whole training dataset is used.
    samples_testing = None
    # True: add the external training dataset; False: original data only.
    use_external_train_data = True
    # True: also run the algorithm on the training set and validate it afterwards
    # (with 6.8k rows this takes almost 50 minutes).
    run_on_train_data = True

    # ---- Analyzer options ----

    # Score threshold for the custom patterns (address, e-mail, URL).
    address_pattern_score = email_pattern_score = url_pattern_score = 0.8

# Import Libraries

### Install presidio

In [None]:
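# Install presidio_analyzer from the local wheel directory given below rather than the package index (--no-index), upgrading to the newest available version (-U) with reduced output (-q).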
!pip install -U -q presidio_analyzer --no-index --find-links=file:///kaggle/input/presidio-wheels/presidio

### Import  necessary libraries

In [None]:
import json
import pandas as pd

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from tqdm import tqdm
from typing import List
import random
import pprint
import re
import gc
from ast import literal_eval

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import fbeta_score, classification_report, confusion_matrix

from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer

from dateutil import parser

In [None]:
# Seed Python's RNG from the shared configuration so the seed is defined in one place.
random.seed(CONFIG.seed)

# Define Metric (F1 Beta Score)

In [None]:
def score(y_true, y_pred):
    """Return the micro-averaged F-beta score (beta=5) of *y_pred* against *y_true*."""
    metric_options = {"average": "micro", "beta": 5}
    return fbeta_score(y_true, y_pred, **metric_options)

def macro_score(y_true, y_pred):
    """Return the macro-averaged F-beta score (beta=5) of *y_pred* against *y_true*."""
    metric_options = {"average": "macro", "beta": 5}
    return fbeta_score(y_true, y_pred, **metric_options)

# Import Datasets

## Import Original Data

In [None]:
# Load the training documents; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json") as train_file:
    train_df = json.load(train_file)
print(f"len(train_df):{len(train_df)}, train_df[0].keys(): {list(train_df[0].keys())}")
print("-"*50)

# Collect every distinct label that occurs in the training documents.
labels = set()
for train_document in train_df:
    labels.update(train_document['labels'])
print(f"labels: {labels}")

# Load the test documents the same way.
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json') as test_file:
    test_df = json.load(test_file)

## Load External Data (if needed)

In [None]:
if CONFIG.use_external_train_data:
    # These columns hold "stringified lists"; parse them back into Python lists while reading.
    df_train_external = pd.read_csv(
        '/kaggle/input/pii-external-dataset/pii_dataset.csv',
        converters=dict.fromkeys(('tokens', 'labels', 'trailing_whitespace'), literal_eval),
    )
    # Name the text column the way the original data names it.
    df_train_external = df_train_external.rename(columns={'text': 'full_text'})
    # One dict per row, similar to how the original data is loaded.
    df_train_external = df_train_external.to_dict('records')
    train_df.extend(df_train_external)

## Sample Data (if needed)

In [None]:
# Number of training documents currently loaded (before any optional sampling).
len(train_df)

In [None]:
# Optionally shrink the training set to a random subset for quicker experiments.
if CONFIG.samples_testing is not None:
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of loaded documents.
    train_df = random.sample(train_df, min(CONFIG.samples_testing, len(train_df)))

In [None]:
# Number of training documents that will actually be used (after any optional sampling).
len(train_df)

## Helper Methods

In [None]:
def is_valid_date(text):
    """Return True when ``parser.parse`` accepts *text* as a date, False otherwise.

    Only the exceptions that signal "this is not a parsable date" are treated
    as a negative answer, so unrelated failures (e.g. KeyboardInterrupt) are
    no longer swallowed.
    """
    try:
        # The parsed value itself is irrelevant; only success or failure matters.
        parser.parse(text)
    except (ValueError, OverflowError, TypeError):
        # ValueError: unrecognized format, OverflowError: value out of range,
        # TypeError: input is not a string or character stream.
        return False
    return True
    
def tokens2index(row):
    """Locate every token of a document inside its full text.

    Tokens are searched left to right, each one starting at the end of the
    previous match, so repeated tokens map to successive occurrences.

    Args:
        row: mapping with a 'tokens' sequence of strings and the 'full_text'
            string they were taken from.

    Returns:
        A ``(start_ind, end_ind)`` pair of lists holding, per token, the start
        offset and the exclusive end offset in ``row['full_text']``.

    Raises:
        ValueError: if a token cannot be found at or after the previous match.
    """
    text = row['full_text']
    start_ind = []
    end_ind = []
    cursor = 0
    for tok in row['tokens']:
        # str.index with a start offset avoids copying the remaining text for
        # every token, which the slice-then-search form did.
        start = text.index(tok, cursor)
        end = start + len(tok)
        start_ind.append(start)
        end_ind.append(end)
        cursor = end
    return start_ind, end_ind

# binary search
def find_or_next_larger(arr, target):
    """Binary-search the ascending sequence *arr* for *target*.

    Returns the index of an element equal to *target* when one is hit;
    otherwise the index of the first element greater than *target*, which is
    ``len(arr)`` when no element is greater.
    """
    lo = 0
    hi = len(arr) - 1
    while lo <= hi:
        probe = (lo + hi) // 2
        candidate = arr[probe]
        if candidate > target:
            # Everything from probe onwards is too large.
            hi = probe - 1
        elif candidate < target:
            # Everything up to probe is too small.
            lo = probe + 1
        else:
            return probe
    return lo

def count_trailing_whitespaces(word):
    """Return how many whitespace characters sit at the end of *word*."""
    without_tail = word.rstrip()
    return len(word) - len(without_tail)

## Create Analyzer

For ease of code, we encapsulate the analyzer code in a class.

In [None]:
class MyAnalyzer:
    """Token-level PII tagger built around a Presidio ``AnalyzerEngine``.

    The engine uses Presidio's predefined recognizers plus three custom regex
    recognizers (street address, e-mail, URL). ``predict_tokens`` converts the
    character spans reported by the engine into per-token ``B-``/``I-`` labels.
    """

    # Presidio entity type -> label family (without the B-/I- prefix).
    # The keys double as the list of entity types requested from the analyzer,
    # in this order. PHONE_NUMBER, IP_ADDRESS and the built-in URL entity are
    # not requested.
    ENTITY_TO_LABEL = {
        "PERSON": "NAME_STUDENT",
        "URL_CUSTOM": "URL_PERSONAL",
        "EMAIL_ADDRESS": "EMAIL",
        "EMAIL_CUSTOM": "EMAIL",
        "ADDRESS_CUSTOM": "STREET_ADDRESS",
        "US_SSN": "ID_NUM",
        "US_ITIN": "ID_NUM",
        "US_PASSPORT": "ID_NUM",
        "US_BANK_NUMBER": "ID_NUM",
        "USERNAME": "USERNAME",
    }

    def __init__(self):
        """Create the NLP engine, register the custom recognizers and set up the URL black list."""
        ## Initialize the analyzer
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        }

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        # create address recognizer
        address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
        address_pattern = Pattern(name="address", regex=address_regex, score=CONFIG.address_pattern_score)
        address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM", patterns=[address_pattern])

        # create email recognizer
        # The top-level-domain class is [A-Za-z]; the former [A-Z|a-z] also
        # accepted a literal '|' character.
        email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        email_pattern = Pattern(name="email address", regex=email_regex, score=CONFIG.email_pattern_score)
        email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM", patterns=[email_pattern])

        # create url recognizer
        url_regex = r'https?://\S+|www\.\S+'
        url_pattern = Pattern(name="url", regex=url_regex, score=CONFIG.url_pattern_score)
        url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM", patterns=[url_pattern])

        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()
        registry.add_recognizer(address_recognizer)
        registry.add_recognizer(email_recognizer)
        registry.add_recognizer(url_recognizer)

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self.analyzer = AnalyzerEngine(
            nlp_engine=nlp_engine,
            supported_languages=["en"],
            registry=registry
        )

        ## Initialize the black list
        # A URL match containing any of these substrings is discarded.
        self.black_list = ["wikipedia", "coursera", ".pdf", ".PDF", "article",
                           ".png", ".gov", ".work", ".ai", ".firm", ".arts",
                           ".store", ".rec", ".biz", ".travel"]

    def _is_blacklisted(self, text):
        """Return True when *text* contains at least one black-listed substring."""
        return any(fragment in text for fragment in self.black_list)

    @staticmethod
    def _covered_tokens(first, token_ends, span_end):
        """Return *first* plus each directly following token that ends at or before *span_end*."""
        covered = [first]
        following = first + 1
        # Explicit bounds check instead of relying on a swallowed IndexError.
        while following < len(token_ends) and token_ends[following] <= span_end:
            covered.append(following)
            following += 1
        return covered

    def predict_tokens(self, df_: list) -> pd.DataFrame:
        """Predict the tokens that have PII in the given documents.

        Args:
            df_: list of document dicts providing 'document', 'full_text' and
                'tokens'. Each dict is updated in place with 'start' and 'end',
                the per-token character offsets computed by ``tokens2index``.

        Returns:
            DataFrame with the columns ``index``, ``document``, ``token`` and
            ``label``; one row per predicted token, with ``label`` prefixed by
            ``B-`` or ``I-``. The columns are present even when nothing was
            predicted.
        """
        # Find the start and end character offset of every token.
        for doc in tqdm(df_, desc="Processing tokens2index"):
            doc['start'], doc['end'] = tokens2index(doc)

        preds = []
        for d in tqdm(df_, total=len(df_), desc="Analyzing entities"):
            # Each result exposes entity_type, start and end (character offsets).
            results = self.analyzer.analyze(
                text=d['full_text'],
                entities=list(self.ENTITY_TO_LABEL),
                language='en',
            )
            n_tokens = len(d['start'])
            # (token, label) pairs already emitted for this document; two entity
            # types mapping to the same label can report the same span.
            emitted = set()
            # (entity_type, token) of the most recently labelled token; decides
            # whether the next token continues an entity (I-) or starts one (B-).
            previous = None

            for r in results:
                # Label the entity according to its detected type.
                label = self.ENTITY_TO_LABEL.get(r.entity_type)
                if label is None:
                    continue

                word = d['full_text'][r.start:r.end]
                # Drop URL matches that contain a black-listed substring.
                if r.entity_type == 'URL_CUSTOM' and self._is_blacklisted(word):
                    continue

                # Index of the token that starts at r.start, or of the next token after it.
                first = find_or_next_larger(d['start'], r.start)
                if first >= n_tokens:
                    # The entity begins after the last token start: no token to label.
                    continue

                # Ignore whitespace at the tail of the match when deciding
                # which tokens the entity still covers.
                span_end = r.end - count_trailing_whitespaces(word)

                # An entity may span several tokens.
                for p in self._covered_tokens(first, d['end'], span_end):
                    # Same entity type on the directly preceding token means
                    # we are still inside the entity.
                    inside = previous == (r.entity_type, p - 1)
                    label_f = ("I-" if inside else "B-") + label
                    previous = (r.entity_type, p)
                    if (p, label_f) in emitted:
                        continue
                    emitted.add((p, label_f))
                    preds.append({
                        "document": d['document'],
                        "token": p,
                        "label": label_f,
                    })

        # Explicit columns keep the result shape stable for an empty prediction list.
        preds_df = pd.DataFrame(preds, columns=["document", "token", "label"]).reset_index()
        return preds_df
        
    
        

# Predict Train Set

In [None]:
# Create one analyzer instance; it is reused for the train and the test predictions.
analyzer = MyAnalyzer()

In [None]:
# Only predict on the training set when validation is enabled in CONFIG.
if CONFIG.run_on_train_data:
    train_preds = analyzer.predict_tokens(train_df)

# Evaluate Performance on Training Set

## Generate Corresponding DataFrame for "True" Answers

In [None]:
if CONFIG.run_on_train_data:

    # (document, token index, label) for every token whose label is not 'O'.
    labelled_tokens = [
        (doc["document"], position, tag)
        for doc in train_df
        for position, (_, tag) in enumerate(zip(doc["tokens"], doc["labels"]))
        if tag != 'O'
    ]

    # Number the records consecutively; that running number is the row_id.
    train_act_records = [
        {'row_id': row_number, 'document': document, 'token': position, 'label': tag}
        for row_number, (document, position, tag) in enumerate(labelled_tokens)
    ]
    count = len(train_act_records)

    train_act = pd.DataFrame.from_records(train_act_records)

In [None]:
def _labels_by_document_and_token(frame):
    """Index a label frame as ``{document: {token: label}}``; the first row per token wins."""
    lookup = {}
    if frame.empty:
        # Also covers a frame that has no columns at all.
        return lookup
    rows = zip(frame["document"].tolist(), frame["token"].tolist(), frame["label"].tolist())
    for document, token, label in rows:
        lookup.setdefault(document, {}).setdefault(token, label)
    return lookup


def get_pred_act_lists_for_dfs(preds: pd.DataFrame, act: pd.DataFrame, documents=None):
    """Expand sparse predicted and actual labels into aligned per-token label lists.

    Every token of every document contributes one entry to both lists: the
    label recorded for that (document, token) pair, or 'O' when there is none.
    Looking labels up by token index means a duplicated token row can no
    longer stall the alignment and hide the remaining labels of a document.
    The input frames are not modified.

    Args:
        preds: frame with 'document', 'token' and 'label' columns holding the
            predicted labels.
        act: frame with the same columns holding the actual labels.
        documents: list of document dicts with 'document' and 'tokens';
            defaults to the module-level ``train_df``.

    Returns:
        ``(preds_list, act_list)``: two equally long lists of label strings,
        ordered by document and then by token index.
    """
    if documents is None:
        documents = train_df

    pred_lookup = _labels_by_document_and_token(preds)
    act_lookup = _labels_by_document_and_token(act)

    preds_list = []
    act_list = []
    for example in tqdm(documents, total=len(documents)):
        doc_preds = pred_lookup.get(example["document"], {})
        doc_act = act_lookup.get(example["document"], {})
        for token_idx in range(len(example["tokens"])):
            act_list.append(doc_act.get(token_idx, 'O'))
            preds_list.append(doc_preds.get(token_idx, 'O'))

    return preds_list, act_list
    

In [None]:
# Build the aligned per-token label lists used by the reports below.
if CONFIG.run_on_train_data:
    train_preds_list, train_act_list = get_pred_act_lists_for_dfs(train_preds, train_act)

## Classification Report on Training Set

In [None]:
if CONFIG.run_on_train_data:
    # classification_report takes (y_true, y_pred): ground truth first, predictions second.
    # Passing them the other way round swaps precision and recall in the report.
    print(classification_report(train_act_list, train_preds_list, digits=4))

## F Beta Score on Training Set

In [None]:
if CONFIG.run_on_train_data:
    # score/macro_score are declared as (y_true, y_pred): ground truth first.
    # The order matters because F-beta with beta=5 weights recall over precision.
    print("Micro F1 Beta Score:", score(train_act_list, train_preds_list))
    print("Macro F1 Beta Score:", macro_score(train_act_list, train_preds_list))

In [None]:
# Drop the training-set evaluation objects and trigger a garbage collection pass.
if CONFIG.run_on_train_data:
    del train_preds_list, train_act_list, train_preds, train_act
    gc.collect()

# Predict Test Set

In [None]:
# Run the analyzer on the test documents and show the first predictions.
test_preds = analyzer.predict_tokens(test_df)
test_preds.head()

# Submission

In [None]:
# Name the four prediction columns as row_id, document, token and label,
# then write the result to submission.csv without the DataFrame index.
submission = pd.DataFrame(test_preds)
submission.columns = ['row_id','document', 'token', 'label']
submission.to_csv('submission.csv', index = False)
submission.head()