# ScanBot

The AI-driven bot that will read all your scanned documents and extract information into a database automatically!

## Preparation

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Imports

In [23]:
import os
import openai
import yaml
import pandas as pd
import base64
import re
import shutil
import time
from sklearn.metrics.pairwise import cosine_similarity

## Insert your API Key

In [3]:
openai.api_key = "sk-proj-iHrdMRQrydnSDtQUIzU6T3BlbkFJCoAngxToaT2u????????"

## Helper Functions

In [4]:
# Function to load prompts from files
def load_prompt(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


# Function to analyze the document and extract meta data
def analyze_document(file_path, prompt):
    # create image data url
    with open(file_path, "rb") as file:
        image_url = f"data:image/jpeg;base64,{base64.b64encode(file.read()).decode()}" # TODO! Hint: How can you send an image via an API?

    # TODO! Hint: Use ChatGPT playground or Docs
    try:
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                }
            ],
            temperature=1,
            max_tokens=750,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        # hard coded sleep time to avoid rate limit
        time.sleep(15)
        return analyze_document(file_path, prompt)


# Function to generate a filename based on meta data
def generate_filename(meta_data):
    # <doc type>_<date>_<title>.jpg
    # if any of the fields are missing, use "unknown"
    string = ""
    string += str(meta_data.get("document_type", "unknown")) + "_"
    string += str(meta_data.get("author", "unknown")) + "_"
    string += str(meta_data.get("date", "unknown")) + "_"
    string += str(meta_data.get("title", "unknown"))
    # Make sure that the filename is a valid filename
    string = re.sub(r"[^\w\s-]", "", string)
    return string + ".jpg"


# Function to save meta data to a YAML file
def save_meta_file(meta_data, meta_file_path):
    # Save json file as yaml
    with open(meta_file_path, "w", encoding="utf-8") as file:
        yaml.dump(meta_data, file, default_flow_style=False,
                  allow_unicode=True)


# Function to save meta data to CSV and Excel files
def save_meta_data_to_csv_excel(meta_data_list, csv_path, excel_path):
    # check if the csv file exists
    if not os.path.exists(csv_path):
        existing_df = pd.DataFrame()
    else:
        existing_df = pd.read_csv(csv_path)

    new_df = pd.DataFrame(meta_data_list)
    merged_df = pd.concat([existing_df, new_df]).drop_duplicates(subset='original_filepath')
    merged_df.to_csv(csv_path, index=False, encoding="utf-8")
    merged_df.to_excel(excel_path, index=False)


def parse_meta_data(response):
    # Get content between "```yaml" and "```" to extract meta data
    yaml_string = response.split("```yaml")[1].split("```")[0]
    meta_info = re.sub(r'(?<=: )([^"\n]*: [^"\n]*)', r'"\1"', yaml_string)
    meta_json = yaml.safe_load(meta_info)
    return meta_json


def save_meta_data(meta_json, file_path):
    # Generate new file name
    new_filename = generate_filename(meta_json)
    # Rename the document file
    renamed_file_location = os.path.join('database', 'renamed files',
                                         new_filename)
    shutil.copy2(file_path, renamed_file_location)
    # Create meta data file
    meta_file_path = os.path.join('database', 'meta files',
                                  new_filename.split('.')[0] + ".yaml")
    save_meta_file(meta_json, meta_file_path)
    # Add new file name to meta data
    meta_json["original_filepath"] = file_path
    meta_json["renamed_filename"] = new_filename
    return new_filename

def get_original_filenames(csv_path):
    # Extract all "original_filepath" from the meta_data csv file
    original_filepaths = []
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        original_filepaths = df["original_filepath"].tolist()
    return original_filepaths

## Main

**Running the next cell will delete previous extractions from the `database` directory**

In [5]:
# Delete everything inside the database folder in one line
shutil.rmtree('database', ignore_errors=True)

# First, make sure that database/meta files and database/renamed_files exist
os.makedirs('database/meta files')
os.makedirs('database/renamed files')

Let's checkout the prompt first:

In [6]:
prompt_path = "prompts/analyze_document.txt"
prompt = load_prompt(prompt_path)
print(prompt)

Please analyze the attached scan of a document and extract the relevant information. Create a metadata file in YAML format that includes only the following fields and nothing else:

- document_type: The type or category of the document (e.g., invoice, receipt, contract, letter).
- date: The date of the document, if available (e.g., invoice date, letter date, birthday).
- author: The person or organization that created or sent the document.
- recipient: The person or organization to whom the document is addressed.
- title: The main title or subject of the document.
- content: Extract the main content from the document as full text.
- names: All names of persons mentioned in the document.
- places: All locations or addresses mentioned in the document.
- organizations: All organizations or companies mentioned in the document.
- goods: All goods or products mentioned in the document.

The response must be strictly in YAML format and contain only the metadata fields specified above, without

Use ChatGPT-4o to collect information about all scanned documents; subsequently collect all data in a table

In [7]:
meta_data_list = []
document_dir = "documents"
csv_path = "database/meta_data.csv"
excel_path = "database/meta_data.xlsx"
# Extract all "original_filepath" from the meta_data csv file
original_filepaths = get_original_filenames(csv_path)

for filename in os.listdir(document_dir):
    extensions = (".jpg", ".jpeg", ".png")
    if (filename.endswith(extensions) and
        (os.path.join(document_dir, filename) not in original_filepaths)):
        file_path = os.path.join(document_dir, filename)
        print(f"Processing {filename}...")
        # Analyze the document to get meta data
        llm_response = analyze_document(file_path, prompt)
        # Parse the meta data from the response
        meta_json = parse_meta_data(llm_response)
        # Save the meta data and rename the document file
        new_file_name = save_meta_data(meta_json, file_path)
        meta_data_list.append(meta_json)
        print(f"Processed {filename}, renamed it to {new_file_name}")

# Save all meta data to CSV and Excel files
save_meta_data_to_csv_excel(meta_data_list, csv_path, excel_path)
print(f"Meta data saved to {csv_path} and {excel_path}")

Processing CV_1.jpg...


Processed CV_1.jpg, renamed it to Curriculum Vitae_Dmitriy Yurevich Khoroshev_None_Curriculum Vitae.jpg
Processing CV_2.jpg...
Processed CV_2.jpg, renamed it to Curriculum Vitae_Alexei Ivanovich Petrov_None_Curriculum Vitae.jpg
Meta data saved to database/meta_data.csv and database/meta_data.xlsx


## Analysis
Now that we extracted relevant information from the documents, we want to cross check that this person is not an sanctions list.

In [10]:
meta_data_list

[{'document_type': 'Curriculum Vitae',
  'date': None,
  'author': 'Dmitriy Yurevich Khoroshev',
  'recipient': None,
  'title': 'Curriculum Vitae',
  'content': 'Name: Dmitriy Yurevich Khoroshev Date of Birth: April 17, 1993 Place of Birth: Russian Federation Nationality: Russian  Email: khoroshev1@icloud.com, sitedev5@yandex.ru Passport: 2018278055 (Russia), 2006801524 (Russia) Tax ID No: 366110340670 (Russia)\nProfessional Experience\nSenior Software Engineer, TechSolutions Ltd. September 2019 - May 2024\n- Led a team of developers to design and implement software solutions. - Developed and maintained high-performance web applications. - Collaborated with cross-functional teams to define project requirements. - Ensured software quality through rigorous testing and code reviews.\nCybersecurity Consultant June 2015 - August 2019\n- Provided cybersecurity services to various clients. - Specialized in penetration testing and vulnerability assessment. - Developed security protocols and m

In [36]:
name_cv_1 = meta_data_list[0]['names'][0]
name_cv_2 = meta_data_list[1]['names'][0]
content_cv_1 = meta_data_list[0]['content']
content_cv_2 = meta_data_list[1]['content']
name_cv_1, name_cv_2

('Dmitriy Yurevich Khoroshev', 'Alexei Ivanovich Petrov')

Next, import the sanctions list

In [17]:
# Import the sanctions_list.csv file
sanctions_list = pd.read_csv("sanctions_list_short.csv", sep=";", delimiter=None, header=None)
# lower case entire dataframe
sanctions_list = sanctions_list.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
sanctions_list.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,"48603,""khoroshev, dmitry yuryevich"",""individua...",pob russian federation,nationality russia,citizen russia,email address khoroshev1@icloud.com,alt. email address sitedev5@yandex.ru,gender male,digital currency address - xbt bc1qvhnfknw852...,secondary sanctions risk: ukraine-/russia-rel...,passport 2018278055 (russia),alt. passport 2006801524 (russia),tax id no. 366110340670 (russia),"a.k.a. 'lockbitsupp'."""
1,"48604,""seliverstov, ivan vladimirovich"",""indiv...","pob magdeburg, germany",nationality russia,"gender male.""",,,,,,,,,
2,"48605,""gopstein, ben-zion"",""individual"",""west-...",nationality israel,gender male,"national id no. 024526394 (israel).""",,,,,,,,,
3,"48607,""militechtrade limited liability company...",tax id no. 9706027480 (russia),"registration number 1227700679216 (russia).""",,,,,,,,,,
4,"48849,""federal state budgetary institution 48t...",organization established date 07 apr 1928,target type government entity,tax id no. 5042129453 (russia),registration number 1136441000706 (russia),a.k.a. 'scientific research institute of medi...,a.k.a. 'the virology center',a.k.a. 'scientific research institute of micr...,a.k.a. 'scientific research institute of epid...,a.k.a. 'military technical scientific researc...,,,


In [18]:
# combine all columns to one column
sanctions_list["combined"] = sanctions_list.apply(lambda row: " ".join(row.dropna()), axis=1)

### The easy case: Direct matches

In this case, one of the attributes of the extracted data like name, email adress, passport number, etc. matches directly with an entry in the sanctions list.

Try to find a match using the name from the first CV and the sanction list

In [19]:
for name in name_cv_1.split(' '): 
    match = sanctions_list.apply(lambda row: row.astype(str).str.contains(name.lower()).any(), axis=1)
    if match.any():
        print(f"Match found for {name} in row {match[match].index[0]}")
        print(sanctions_list.loc[match[match].index[0]])

Match found for Khoroshev in row 0
0           48603,"khoroshev, dmitry yuryevich","individua...
1                                      pob russian federation
2                                          nationality russia
3                                              citizen russia
4                         email address khoroshev1@icloud.com
5                       alt. email address sitedev5@yandex.ru
6                                                 gender male
7            digital currency address - xbt bc1qvhnfknw852...
8            secondary sanctions risk: ukraine-/russia-rel...
9                                passport 2018278055 (russia)
10                          alt. passport 2006801524 (russia)
11                           tax id no. 366110340670 (russia)
12                                     a.k.a. 'lockbitsupp'."
combined    48603,"khoroshev, dmitry yuryevich","individua...
Name: 0, dtype: object


For the second CV there's no match found in the sanctions list. Good, right?

In [21]:
for name in name_cv_2.split(' '): 
    match = sanctions_list.apply(lambda row: row.astype(str).str.contains(name.lower()).any(), axis=1)
    if match.any():
        print(f"Match found for {name} in row {match[match].index[0]}")
        print(sanctions_list.loc[match[match].index[0]])

### The hard case: A person tries to fake its identity.
In this case full-text search is not sufficient as the hard facts about a person are alterered (Name, passport id, etc.). But we can hope that the sanctioned person or company was sloppy in covering all traces. In this case, we can try to vectorize both the collected CV data and the sanctions list and check the top matches.

The top matches can be disregarded entirely, if the similarity is below some threshold. 
Above some threshold, additional checks can be triggered.

https://platform.openai.com/docs/guides/embeddings/use-cases

In [69]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

def get_similarity(document_embedding, vector_base):
   similarities = vector_base.map(lambda x: cosine_similarity([x], [document_embedding])[0][0])
   return similarities

Get all embeddings from both the CVs and the sanctions database

In [67]:
sanctions_list['ada_embedding'] = sanctions_vector_base = sanctions_list.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

cv_1_embedding = get_embedding(content_cv_1)
cv_2_embedding = get_embedding(content_cv_2)

Calculate similarity between CV 1 and the sanctions list

In [70]:
get_similarity(cv_1_embedding, sanctions_vector_base)

0    0.727150
1    0.489509
2    0.237508
3    0.411932
4    0.390976
5    0.350561
Name: combined, dtype: float64

In [71]:
get_similarity(cv_2_embedding, sanctions_vector_base)

0    0.572706
1    0.481862
2    0.279866
3    0.422896
4    0.365666
5    0.351010
Name: combined, dtype: float64

We found earlier that the name of the applicant in CV 1 matches directly with an entry in the sanctions list (the first entry). This is also reflected by the cosine similarity, which is largest with respect to the correct match of the sanctions list. The similarity is mainly large because of matching names, email adresses and passport numbers.

Now let's check the second CV for which no direct match was found.

Despite no perfect matches between various attributes, there is a suspiciuously large similarity to the first entry in the sanctions list. Why could that be?

In order to figure that out, we can leverage a popular **explainable AI** method: *Counterfactual what-if analysis*

In such an analysis we specify some goal and try to modify the inputs such that the specified target is achieved. In this case, we want to change the CV content such that the cosine similarity isn't suspicious anymore.

In [35]:
meta_data_list[1]['content']

'Name: Dmitriy Yurevich Khoroshev Date of Birth: April 17, 1993 Place of Birth: Russian Federation Nationality: Russian  Email: khoroshev1@icloud.com, sitedev5@yandex.ru Passport: 2018278055 (Russia), 2006801524 (Russia) Tax ID No: 366110340670 (Russia)\nProfessional Experience\nSenior Software Engineer, TechSolutions Ltd. September 2019 - May 2024\n- Led a team of developers to design and implement software solutions. - Developed and maintained high-performance web applications. - Collaborated with cross-functional teams to define project requirements. - Ensured software quality through rigorous testing and code reviews.\nCybersecurity Consultant June 2015 - August 2019\n- Provided cybersecurity services to various clients. - Specialized in penetration testing and vulnerability assessment. - Developed security protocols and measures for organizations. - Trained staff on cybersecurity best practices.\n'

In [81]:
modified_cv_2 = 'Personal Information\n\nName: Alexei Ivanovich Petrov\nPlace of Birth: France, Paris\nNationality: French\nEmail: alexei.petrov@protonmail.com\n\nProfessional Experience\n\nSenior Software Engineer, TechSolutions Ltd.\n\nSeptember 2019 - May 2024\n\n- Led a team of developers to design and implement software solutions.\n- Developed and maintained high-performance web applications.\n- Collaborated with cross-functional teams to define project requirements.\n- Ensured software quality through rigorous testing and code reviews.\n\nCybersecurity Consultant\n\nJune 2015 - August 2019\n\n- Provided cybersecurity services to various clients.\n- Specialized in penetration testing and vulnerability assessment.\n- Developed security protocols and measures for organizations.\n- Trained staff on cybersecurity best practices.\n'


In [82]:
modified_cv_2_embedding = get_embedding(modified_cv_2)
get_similarity(modified_cv_2_embedding, sanctions_vector_base)

0    0.488156
1    0.414368
2    0.230206
3    0.350396
4    0.307408
5    0.304089
Name: combined, dtype: float64

As you can see, it is actually quite hard not to get a suspicious similarity score without changing almost all details of the CV. Note however, that a high similarity score is not proof that the person of the CV is the same as the person on the sanctions list. But it can serve as an early warning system triggering a more thorough analysis, by collection more information, etc.