## Object modification detection.

In many cass, RPA solutions need to detect the modification of target object such like button, menu and popups. 

If a target object is located in the web site, it would be more frequently updated. 


In [1]:
#! pip install bs4

In [2]:
from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Image,
    Part,
    HarmBlockThreshold,
    HarmCategory,
)

multimodal_model = GenerativeModel("gemini-pro-vision")

In [3]:
import os

bucket_name = os.environ.get("BUCKET_UPLOAD_TEMP")


In [4]:
from google.cloud import storage

def upload_file_to_temp_bucket(file_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    blob = bucket.blob(file_name)
    blob.upload_from_filename(file_name)
    
    return blob.public_url.replace("https://storage.googleapis.com/", "gs://")

### Multi Modal Test

In [12]:
import bs4
import re

def read_text_from_resource(file_name):
  with open(file_name, "r") as f:
    return f.read()

def set_long_attribute_value_to_null(element):
  for attribute in element.attrs:
    attribute_value = element.get(attribute)
    if attribute_value is not None and attribute != "id":
      element[attribute] = ""

def remove_script_element_in_html_recursively(html):
  for child in html.children:
    if isinstance(child, bs4.element.Tag):
      if child.name == "script":
        child.decompose()
      else:
        remove_script_element_in_html_recursively(child)

def remove_long_attribute_value_in_html_recursively(html):
  set_long_attribute_value_to_null(html)
  for child in html.children:
    if isinstance(child, bs4.element.Tag):
      remove_long_attribute_value_in_html_recursively(child)

def get_html_resource(file_name):
  html = bs4.BeautifulSoup(read_text_from_resource(file_name), 'html.parser')
  remove_long_attribute_value_in_html_recursively(html)
  remove_script_element_in_html_recursively(html)
  return html


In [20]:
html_org = get_html_resource("resources/kor_body_source_org.html")
html_mod = get_html_resource("resources/kor_body_source_mod.html")

print(str(html_org))
print(str(html_mod))

<body> <div id="u_skip"> <a href=""><span>상단영역 바로가기</span></a> <a href=""><span>서비스 메뉴 바로가기</span></a> <a href=""><span>새소식 블록 바로가기</span></a> <a href=""><span>쇼핑 블록 바로가기</span></a> <a href=""><span>관심사 블록 바로가기</span></a> <a href=""><span>MY 영역 바로가기</span></a> <a href=""><span>위젯 보드 바로가기</span></a> <a href=""><span>보기 설정 바로가기</span></a> </div> <div id="wrap"> <div class="" id="header" role=""> <div class="" id="topSearchWrap"> <div class="" id="special-logo"></div> <div class="" id="timeboard-ex" style=""> </div> <div class="" id="search_area"> <div class="" id="special-input-logo-ex" style=""></div> <div class=""> <div class="" id="search"> <h1 class="" id="special-input-logo"><a class="" href=""><img alt="" height="" src="" width=""/></a><a class="" href=""><span class=""><svg fill="" viewbox="" xmlns=""><path d=""></path></svg></span><span class="">NAVER</span></a></h1> <form action="" id="sform" method="" name="" role=""> <fieldset> <legend class="">검색</legend> <input name="" type=

![original_image](resources/kor_body_img_org.png)

![modified_image](resources/kor_body_img_mod.png)


In [32]:
image_uri_org = upload_file_to_temp_bucket("resources/kor_body_img_org.png")
image_uri_mod = upload_file_to_temp_bucket("resources/kor_body_img_mod.png")



xpath = '//*[@id="account"]/div/div/a[3]'

prompt1 = """You are an RPA Agent. 
You are given the original screen image (png), the original HTML, the text of the target element (button), and the xpath that points to the element.

original button text : 회원가입

original xpath : {xpath}

original image:
""".format(html_text_org="", xpath=xpath)

prompt2 = """When you are given the changed screen image and the changed HTML, please show the target element in the changed HTML using xpath and button text.

modified html text : {html_text_mod}

modified image :
""".format(html_text_mod=str(html_mod))

contents = [prompt1, Part.from_uri(uri=image_uri_org, mime_type="image/png"), prompt2, Part.from_uri(uri=image_uri_mod,mime_type="image/png")]

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = multimodal_model.generate_content(contents, generation_config=generation_config, safety_settings=safety_config, stream=False)


In [33]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: " xpath : //*[@id=\"account\"]/div/div/a[3]\nbutton text : 네이버로 로그인"
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 3307
  candidates_token_count: 26
  total_token_count: 3333
}



### Text Only

In [29]:
model = GenerativeModel("gemini-pro")

prompt1 = """You are an RPA Agent. 
You are given the original HTML and button text and Xpath which indicates the target object(button).
When you are given the changed HTML, please show the target element in the changed HTML using xpath and button text.

original html text: {html_text_org}

original xpath : {xpath}

original button text : 회원가입

modified html text : {html_text_mod}

modified button text :

modified xpath :
""".format(html_text_org=str(html_org), xpath=xpath, html_text_mod=str(html_mod))

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = model.generate_content(prompt1, generation_config=generation_config, safety_settings=safety_config, stream=False)


In [30]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: "modified xpath : //*[@id=\"account\"]/div/div/a[4]"
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 8419
  candidates_token_count: 18
  total_token_count: 8437
}



In [31]:
print(html_mod.prettify())

<body>
 <div id="u_skip">
  <a href="">
   <span>
    상단영역 바로가기
   </span>
  </a>
  <a href="">
   <span>
    서비스 메뉴 바로가기
   </span>
  </a>
  <a href="">
   <span>
    새소식 블록 바로가기
   </span>
  </a>
  <a href="">
   <span>
    쇼핑 블록 바로가기
   </span>
  </a>
  <a href="">
   <span>
    관심사 블록 바로가기
   </span>
  </a>
  <a href="">
   <span>
    MY 영역 바로가기
   </span>
  </a>
  <a href="">
   <span>
    위젯 보드 바로가기
   </span>
  </a>
  <a href="">
   <span>
    보기 설정 바로가기
   </span>
  </a>
 </div>
 <div id="wrap">
  <div id="header" role="">
   <div class="" id="topSearchWrap">
    <div class="" id="special-logo">
    </div>
    <div class="" id="timeboard-ex" style="">
    </div>
    <div class="" id="search_area">
     <div class="" id="special-input-logo-ex" style="">
     </div>
     <div class="">
      <div class="" id="search">
       <h1 class="" id="special-input-logo">
        <a class="" href="">
         <img alt="" height="" src="" width=""/>
        </a>
        <a class="" href="">

### Test - Describe the difference

It's not easy to determine whether two screen snapshot images has different layout. We will test it with Gemini Pro.


In [5]:
image_uri_org = upload_file_to_temp_bucket("resources/kor_body_img_org.png")
image_uri_mod = upload_file_to_temp_bucket("resources/kor_body_img_mod.png")



prompt1 = """You are an RPA Agent. 
Two images are given to you. Determine whether the two images are the same layout or not and describe the reason.

original image:

"""
prompt2 = """

modified image :
"""

contents = [prompt1, Part.from_uri(uri=image_uri_org, mime_type="image/png"), prompt2, Part.from_uri(uri=image_uri_mod,mime_type="image/png")]

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = multimodal_model.generate_content(contents, generation_config=generation_config, safety_settings=safety_config, stream=False)


In [6]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: " The two images are of the same layout. Both images have the same navigation bar at the top, the same search bar in the middle, and the same footer at the bottom. The main content area is also the same in both images."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 557
  candidates_token_count: 48
  total_token_count: 605
}



It's very wierd. We need to improve prompt to detect the difference in the images.

In [7]:
image_uri_org = upload_file_to_temp_bucket("resources/kor_body_img_org.png")
image_uri_mod = upload_file_to_temp_bucket("resources/kor_body_img_mod.png")


prompt1 = """You are an RPA Agent. 
Two images are given to you. Check if the overall layout and buttons of the two images have been changed, focusing on the orange box. Describe in detail the differences you find.

original image:

"""
prompt2 = """

modified image :
"""

contents = [prompt1, Part.from_uri(uri=image_uri_org, mime_type="image/png"), prompt2, Part.from_uri(uri=image_uri_mod,mime_type="image/png")]

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = multimodal_model.generate_content(contents, generation_config=generation_config, safety_settings=safety_config, stream=False)

In [8]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: " The overall layout of the two images is the same. The orange box is located at the top of the screen and contains the NAVER logo. The buttons below the orange box are also the same in both images. However, there are some differences between the two images.\n\nIn the modified image, the orange box is slightly larger than in the original image. The NAVER logo is also slightly different in the two images. In the original image, the logo is white, while in the modified image, the logo is a gradient of orange and yellow.\n\nThe buttons below the orange box are also slightly different in the two images. In the original image, the buttons are black with white text, while in the modified image, the buttons are white with black text.\n\nOverall, the two images are very similar, but there are some minor differences."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLI

Only prompt modification couldn't handle it very well. 

We will add more images to detect original button and layout.

In [9]:
image_uri_org = upload_file_to_temp_bucket("resources/kor_body_img_org.png")
image_uri_mod = upload_file_to_temp_bucket("resources/kor_body_img_mod.png")
image_uri_object = upload_file_to_temp_bucket("resources/target_object.png")


prompt1 = """You are an RPA Agent. 
Given the original screenshot image and the target object (button) image to click, check if the same button exists on the modified screen and if the location is the same.

original screen shot:
"""
prompt2 = """
target object image :
"""
prompt3 = """
modified screen shot:
"""

contents = [prompt1, Part.from_uri(uri=image_uri_org, mime_type="image/png"), prompt2, Part.from_uri(uri=image_uri_object, mime_type="image/png"), prompt3, Part.from_uri(uri=image_uri_mod,mime_type="image/png")]

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = multimodal_model.generate_content(contents, generation_config=generation_config, safety_settings=safety_config, stream=False)

In [10]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: " No, the button does not exist on the modified screen."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 834
  candidates_token_count: 12
  total_token_count: 846
}



Good. And try to find alternative button(text) on the modified snapshot. 

In [13]:
#image_uri_org = upload_file_to_temp_bucket("resources/kor_body_img_org.png")
image_uri_mod = upload_file_to_temp_bucket("resources/kor_body_img_mod.png")
image_uri_object = upload_file_to_temp_bucket("resources/target_object.png")
html_mod = get_html_resource("resources/kor_body_source_mod.html")

prompt1 = """You are an RPA Agent. 
In the modified screenshot image and HTML, check for buttons that can be replaced with the given target object (button), and provide the corresponding XPath for this.

target object image :
"""
prompt2 = """
modified screen shot:
"""
prompt3 = """
modified html text:
{html_text_mod}
""".format(html_text_mod=str(html_mod))

contents = [prompt1, Part.from_uri(uri=image_uri_object, mime_type="image/png"), prompt2, Part.from_uri(uri=image_uri_mod,mime_type="image/png"), prompt3]

generation_config = GenerationConfig(
    temperature=0.1,
    max_output_tokens=2048,
)

safety_config = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
}

responses = multimodal_model.generate_content(contents, generation_config=generation_config, safety_settings=safety_config, stream=False)

In [14]:
print(responses)

candidates {
  content {
    role: "model"
    parts {
      text: " 네이버 메인 페이지 상단 우측에 위치한 회원가입 버튼의 XPath는 다음과 같습니다.\n\n```\n//*[@id=\"account\"]/div/div[3]/a[3]\n```"
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 3255
  candidates_token_count: 44
  total_token_count: 3299
}



Good. 


### Test Result.



### Test Result.

The results show that there is a clear difference between the two mmodles (text only, multi-modal). but it is not yet clear which method has a greater impact. 
However, it is important to note that the token size constraint is more restrctive for models that provide vision than for text-only models.