<a href="https://colab.research.google.com/github/kristianJW54/ML-AI-Models-Projects/blob/main/XML_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
import re
import pandas as pd
import numpy as np
import nltk
from nltk import sent_tokenize

**Sample XML Data**

In [None]:
import xml.etree.ElementTree as ET

# Create the root element
user_form = ET.Element("userForm")

# Add sub-elements with user information
name = ET.SubElement(user_form, "name")
name.text = "Christopher Adams"

email = ET.SubElement(user_form, "email")
email.text = "chris.adams@example.com"

feedback = ET.SubElement(user_form, "feedback")
feedback.text = """
    Hello,

    I'm writing to share my experience after using your service. My name is Christopher Adams, and you can reach me at chris.adams@gmail.com.

    Recently, I was at your branch located at 789 Pine Street, Cityborough. I was impressed with the overall atmosphere and the helpful staff. The location is convenient for me, as I live nearby at 101 Maple Lane.

    During my visit, I used your online platform as well. It's user-friendly, and I appreciate the effort in keeping it simple. However, I encountered a small issue while navigating through the payment process. It would be great if you could look into this.

    Additionally, I'd like to suggest incorporating more payment options, as it would enhance the convenience for customers like me. Test phone number +49 621 60-6641516

    Thank you for your attention to these details. I look forward to seeing improvements and continuing to use your services. Testing new entity labels, Paracetamol 500mg.

    My patient Mr Barry Hunter is also being mentioned to see if we can detect patient information, he can be reached at barry.h@gmail.com.

    It doesn't matter how long the text in this area is, becuase hopefully the machine learning tool will process all of this and pull out the correct entities and information.

    Best regards,
    Christopher Adams
"""

# Create an ElementTree object
tree = ET.ElementTree(user_form)

# Save the XML to a file
tree.write("user_form.xml", encoding="utf-8", xml_declaration=True)

# Alternatively, you can get the XML string using tostring() method
xml_string = ET.tostring(user_form, encoding="utf-8")
# print(xml_string.decode())


**Processing Doc**

In [None]:
# Read the XML file using with open()
with open("user_form.xml", "r", encoding="utf-8") as file:
    xml_data = file.read()

print(xml_data)

<?xml version='1.0' encoding='utf-8'?>
<userForm><name>Christopher Adams</name><email>chris.adams@example.com</email><feedback>
    Hello,

    I'm writing to share my experience after using your service. My name is Christopher Adams, and you can reach me at chris.adams@gmail.com.

    Recently, I was at your branch located at 789 Pine Street, Cityborough. I was impressed with the overall atmosphere and the helpful staff. The location is convenient for me, as I live nearby at 101 Maple Lane.

    During my visit, I used your online platform as well. It's user-friendly, and I appreciate the effort in keeping it simple. However, I encountered a small issue while navigating through the payment process. It would be great if you could look into this.

    Additionally, I'd like to suggest incorporating more payment options, as it would enhance the convenience for customers like me. Test phone number +49 621 60-6641516

    Thank you for your attention to these details. I look forward to see

In [None]:
# Parse the XML data
root = ET.fromstring(xml_data)

# Extract information from the XML
name = root.find("name").text
email = root.find("email").text
feedback = root.find("feedback").text
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

feedback = re.sub(r"\n", "", feedback)

doc = nlp(feedback)

named_entities = [(ent.text, ent.label_) for ent in doc.ents]

entity = {
    "Name" : [],
    "Location" : [],
    "MEDICAL" : []
}

for ent in doc.ents:
  if ent.label_ == "PERSON":
      if ent.text not in entity.values():
          entity["Name"].append(ent.text.strip(" "))

  elif ent.label_ in ["FAC", "GPE", "LOC"]:
    if ent.text not in entity.values():
      entity["Location"].append(ent.text.strip(" "))
  elif ent.label_ in ["ORG", "WORK_OF_ART"]:
    if ent.text not in entity.values():
      entity["MEDICAL"].append(ent.text.strip(" "))


for e, v in entity.items():
  print(e, v)


Name ['Christopher Adams', 'Barry Hunter', 'Christopher Adams']
Location ['789 Pine Street', 'Cityborough', '101 Maple Lane']
MEDICAL ['Paracetamol 500']


**Extract PII**

- Emails
- Phone numbers etc

In [None]:
email_pattern = re.compile(r"[\w\.-]+@[\w\.-]+")

emails = re.findall(email_pattern, feedback)

# print(emails)

number_pattern = re.compile(r'(?:([+]\d{1,4})[-.\s]?)?(?:[(](\d{1,3})[)][-.\s]?)?(\d{1,4})[-.\s]?(\d{1,4})[-.\s]?(\d{1,9})')

numbers = re.findall(number_pattern, feedback)
fnumbers = [' '.join(filter(None, match)) for match in numbers]
# Filter out numbers with fewer than 5 digits
phone_numbers = [num for num in fnumbers if len(re.findall(r'\d', num)) >= 5]

# Filter out numbers with 2 or more digits that are not valid
other_numbers = [num for num in fnumbers if len(re.findall(r'\d', num)) >= 2 and num not in phone_numbers]

other = {
    "Emails" : emails,
    "Phone Numbers" : phone_numbers,
    "Other Numbers" : other_numbers
}

for e, v in other.items():
  print(e,v)

# print("Phone Numbers:", phone_numbers)
# print("Other Numbers:", other_numbers)

Emails ['chris.adams@gmail.com.', 'barry.h@gmail.com.']
Phone Numbers ['+49 621 60 6641516']
Other Numbers ['7 8 9', '1 0 1', '5 0 0']


**Load into DataFrame**

In [None]:
form = {**entity, **other}

data = pd.DataFrame.from_dict(form, orient='index').transpose()

data.head()


Unnamed: 0,Name,Location,MEDICAL,Emails,Phone Numbers,Other Numbers
0,Christopher Adams,789 Pine Street,Paracetamol 500,chris.adams@gmail.com.,+49 621 60 6641516,7 8 9
1,Barry Hunter,Cityborough,,barry.h@gmail.com.,,1 0 1
2,Christopher Adams,101 Maple Lane,,,,5 0 0


In [None]:
#Redact

def redact(text, data):
    for e, v in data.items():
        for i in v:
          pattern = i

        # Use regular expression to replace all occurrences of the pattern with 'Redacted'
          text = text.replace(pattern, 'Redacted')

    return text

# Redact the feedback text
redacted_feedback = redact(feedback, form)

# Split the redacted text into sentences
sentences = re.split(r'\.', redacted_feedback)

# Print each sentence on a new line
for sentence in sentences:
    print(sentence)

# Open the file in write mode
with open("redacted_text.txt", "w") as r:
    # Write each sentence to the file
    for sentence in sentences:
        r.write(sentence + '\n')


    Hello,    I'm writing to share my experience after using your service
 My name is Redacted, and you can reach me at Redacted    Recently, I was at your branch located at Redacted, Redacted
 I was impressed with the overall atmosphere and the helpful staff
 The location is convenient for me, as I live nearby at Redacted
    During my visit, I used your online platform as well
 It's user-friendly, and I appreciate the effort in keeping it simple
 However, I encountered a small issue while navigating through the payment process
 It would be great if you could look into this
    Additionally, I'd like to suggest incorporating more payment options, as it would enhance the convenience for customers like me
 Test phone number +49 621 60-6641516    Thank you for your attention to these details
 I look forward to seeing improvements and continuing to use your services
 Testing new entity labels, Redactedmg
    My patient Mr Redacted is also being mentioned to see if we can detect patient in