In [1]:
# Emulate a browser login to this website:
# https://rmm.jaggaer.com/uowisconsin/erd-client/app/login/

# The only thing we know up front is that the site uses jQuery.

import base64
import json
import os
import re
from getpass import getpass
from urllib.parse import urlparse, urljoin, urlencode

import ipywidgets as widgets
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML

# Login page and the scheme://host prefix used to build absolute URLs later on.
url = 'https://rmm.jaggaer.com/uowisconsin/erd-client/app/login/'
uri = urlparse(url)
base_url = f'{uri.scheme}://{uri.netloc}'

# A single session is reused by every later request in the notebook.
s = requests.Session()

# Fetch the login page; fail fast on an HTTP error instead of parsing an error page.
r = s.get(url)
r.raise_for_status()

# Parse the login page so the form can be inspected.
soup = BeautifulSoup(r.text, 'html.parser')

In [2]:
# The login form is submitted with POST; emulate the fields it sends.
# Credentials are read from the environment (RMM_USERNAME / RMM_PASSWORD) and
# fall back to an interactive prompt, so they are never stored in the notebook.
username = os.environ.get('RMM_USERNAME') or input('Username: ')
password = os.environ.get('RMM_PASSWORD') or getpass('Password: ')
lang = 'en_US'
tz = 'US/Central'

form = {'u': username, 'p': password, 'lang': lang, 'tz': tz}

# Make a missing form an explicit error instead of an opaque TypeError on None.
login_form = soup.form
if login_form is None:
    raise RuntimeError('Login form not found on the login page; cannot submit credentials.')

# Submit the form data to the form's action URL (resolved against the site root).
r = s.post(urljoin(base_url, login_form['action']), data=form)
soup = BeautifulSoup(r.text, 'html.parser')

In [3]:
# The login is treated as failed when the response contains a div#loginmessages.
login_failed = soup.find('div', {'id': 'loginmessages'}) is not None
print('Login failed' if login_failed else 'Login success')
print("current url: ", r.url)

Login success
current url:  https://rmm.jaggaer.com/uowisconsin/erd-client/app/secure/home/


In [4]:
# Re-request the URL of the most recent response and parse it again.
current_page_url = r.url
r = s.get(current_page_url)
soup = BeautifulSoup(r.text, features='html.parser')

In [5]:
def _write_json(payload, path):
    """Serialize ``payload`` as indented JSON, write it to ``path`` and return the text."""
    serialized = json.dumps(payload, indent=4)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
    return serialized


# Collect every div.row together with a summary of its direct children.
rows = soup.find_all('div', {'class': 'row'})
data = []
# Children whose id starts with "tile-" (the menu tiles) are also kept separately.
filter_data = []

for row_div in rows:
    children = []
    for element in row_div.find_all(recursive=False):
        element_info = {
            'tag': element.name,
            'attributes': element.attrs,
            'text': element.get_text(strip=True)
        }
        print(element_info)
        element_id = element_info['attributes'].get('id')
        if element_id and element_id.startswith('tile-'):
            filter_data.append(element_info)
        children.append(element_info)
    data.append({
        'class': row_div.get('class'),
        'id': row_div.get('id'),
        'children': children
    })

# Save the full structure.
output_path = 'main_page.json'
json_data = _write_json(data, output_path)
print(f'Extracted data has been saved to {output_path}')

# Save only the tiles.
output_path = 'filtered_data.json'
json_data = _write_json(filter_data, output_path)
print(f'Filtered data has been saved to {output_path}')

{'tag': 'div', 'attributes': {'class': ['col-sm-11', 'no-gutters']}, 'text': 'Research Material Management'}
{'tag': 'div', 'attributes': {'id': 'quicknav', 'class': ['col-sm-13', 'no-gutters']}, 'text': 'Wickens ChemUWMSNUser PreferencesPreferred Search LocationsAlternate Container LocationsReceiving Search LocationsManage Sub-locationsLogoutVersion\xa024.1.0.423JAGGAER Privacy Policy'}
{'tag': 'div', 'attributes': {'class': ['col-sm-24']}, 'text': ''}
{'tag': 'div', 'attributes': {'id': 'toolbar', 'class': ['col-24'], 'style': 'margin-bottom: 1px;'}, 'text': ''}
{'tag': 'div', 'attributes': {'class': ['col-sm-24', 'col-md-14', 'col-lg-10']}, 'text': 'Container Search (use * to perform a wildcard search)Source SearchMaterial SearchMaterial CreateReceivingStructure SearchView My RequestsScan OperationsContainer Operations WorksheetAssign Users to Principal InvestigatorProjectsContainer Inventory ReconciliationSchedule ReportView Completed ReportsTo Do List'}
{'tag': 'div', 'attributes'

In [6]:
# We first focus on the Source Search tile (id "tile-SourceSearchMenuItem");
# its target path is stored in the tile's data-url attribute.
SOURCE_SEARCH_TILE_ID = 'tile-SourceSearchMenuItem'
sourcesearch_tile = next(
    (tile['attributes'] for tile in filter_data
     if tile['attributes'].get('id') == SOURCE_SEARCH_TILE_ID),
    None,
)
if sourcesearch_tile is None or 'data-url' not in sourcesearch_tile:
    # Explicit error instead of a bare IndexError/KeyError when the tile is absent.
    raise RuntimeError(
        f'Tile "{SOURCE_SEARCH_TILE_ID}" with a data-url attribute was not found on the home page.'
    )

# urljoin (as used for the form actions) copes with both absolute paths and full URLs.
sourcesearch_url = urljoin(base_url, sourcesearch_tile['data-url'])
print("sourcesearch_url: ", sourcesearch_url)
print("current title: ", soup.title.string)

# Navigate to the Source Search section and parse it.
r = s.get(sourcesearch_url, allow_redirects=True)
soup = BeautifulSoup(r.text, 'html.parser')

sourcesearch_url:  https://rmm.jaggaer.com/uowisconsin/erd-client/app/secure/sourcesearch/
current title:  JAGGAER - Research Material Management  Home Page


In [7]:
# Reference only: the <select> offered by the search page (mirrored by the dropdown widget).
"""<select id="advancedsearchtype" name="advancedsearchtype" class="form-control">
                    <option value="search-materialidentifiers">Material Identifier</option>
                    <option value="search-catalog">Catalog # / Mfr Part #</option>
                    <option value="search-term-mustinclude">Must Include  </option>
                    <option value="search-term-exact">Exact Phrase</option>
                    <option value="search-term-includeany">Include Any</option>
    </select>
"""

# example payload
"""
csrftoken: 88781371-9faf-4580-8afc-2b9a1af1cf40
searchterm: thianthrene
advancedsearchtype: search-materialidentifiers
advancedsearchterms: 
excludeterms: 
channels: InventoryLocationsCatalog
channels: StoreRoomsCatalog
"""

# Locate the search form on the page; fail with a clear message if it is missing.
searchform_structure = soup.find('form', {'id': 'searchform'})
if searchform_structure is None:
    raise RuntimeError('Form #searchform not found on the current page.')

# Extract the CSRF token that must accompany every POST.
csrf_input = searchform_structure.find('input', {'name': 'csrftoken'})
if csrf_input is None or csrf_input.get('value') is None:
    raise RuntimeError('No csrftoken input with a value found inside #searchform.')
csrftoken = csrf_input['value']

# Absolute URL the search form posts to.
form_action_url = urljoin(base_url, searchform_structure['action'])

def _form_widget_options():
    """Build the presentation keyword arguments shared by all search-form widgets.

    ``display`` is a Layout property rather than a widget trait, so it is
    supplied through ``layout=`` instead of as a direct widget keyword.
    A new Layout is created on every call.
    """
    return {
        'style': {'description_width': 'initial'},
        'layout': widgets.Layout(display='flex'),
    }


# Create input widgets for form data
search_term_input = widgets.Text(
    description='Search Term:',
    value='thianthrene',
    **_form_widget_options(),
)
advanced_search_type_dropdown = widgets.Dropdown(
    options=[
        ('Material Identifier', 'search-materialidentifiers'),
        ('Catalog # / Mfr Part #', 'search-catalog'),
        ('Must Include', 'search-term-mustinclude'),
        ('Exact Phrase', 'search-term-exact'),
        ('Include Any', 'search-term-includeany')
    ],
    description='Advanced Search Type:',
    value='search-materialidentifiers',
    **_form_widget_options(),
)
advanced_search_terms_input = widgets.Text(
    description='Advanced Search Terms:',
    value='',
    **_form_widget_options(),
)
exclude_terms_input = widgets.Text(
    description='Exclude Terms:',
    value='',
    **_form_widget_options(),
)
channel_dropdown = widgets.Dropdown(
    options=[
        ('Inventory Locations Catalog', 'InventoryLocationsCatalog'),
        ('Store Rooms Catalog', 'StoreRoomsCatalog')
    ],
    description='Channel:',
    **_form_widget_options(),
)

# Cache to store fetched data
cache = {}

In [8]:
# Function to handle form submission and show results
def on_form_submit(temp_store):
    """Submit the source-search form and display the first page of results.

    The query is read from the module-level input widgets, POSTed to
    ``form_action_url``, and page 1 of the results for the selected channel is
    then fetched with a GET. On a successful GET the raw results HTML is stored
    in ``temp_store`` and the parsed page is passed to ``display_result``.

    Args:
        temp_store: Mutable mapping that receives the results HTML under the key
            ``(searchterm, advancedsearchtype, selected_channel)``.

    Returns:
        The POST response if the POST did not return HTTP 200; otherwise the
        GET response for the results page (whatever its status code).
    """
    searchterm = search_term_input.value
    advancedsearchtype = advanced_search_type_dropdown.value
    selected_channel = channel_dropdown.value

    form_data = {
        'csrftoken': csrftoken,
        'searchterm': searchterm,
        'advancedsearchtype': advancedsearchtype,
        # Empty optional fields are passed as None rather than ''.
        'advancedsearchterms': advanced_search_terms_input.value or None,
        'excludeterms': exclude_terms_input.value or None,
        'channels': [selected_channel]
    }

    response = s.post(form_action_url, data=form_data)
    if response.status_code != 200:
        print(f'Failed to fetch results for search term "{searchterm}"')
        return response
    print(f'Fetched data for search term "{searchterm}", advanced search type "{advancedsearchtype}", channel "{selected_channel}"')

    # Proceed with the GET request to fetch and display results
    # https://rmm.jaggaer.com/uowisconsin/erd-client/app/secure/sourcesearch/results?p=1&channel=InventoryLocationsCatalog
    result_url = base_url + '/uowisconsin/erd-client/app/secure/sourcesearch/results?' + urlencode({'p': 1, 'channel': selected_channel})

    get_response = s.get(result_url)
    if get_response.status_code != 200:
        print(f'Failed to fetch results for {selected_channel}')
        return get_response

    new_soup = BeautifulSoup(get_response.text, 'html.parser')
    temp_store[(searchterm, advancedsearchtype, selected_channel)] = get_response.text

    # Detect the number of pages: the count of pager links minus two.
    # Clamp to 1 so a pager with only one or two links cannot yield a
    # zero or negative page count.
    num_pages = 1
    pager = new_soup.find('nav', {'id': 'pager'})
    if pager:
        page_links = pager.find_all('a', class_='page-link')
        if page_links:
            num_pages = max(1, len(page_links) - 2)

    # Print the current URL and content
    print(f'Current URL: {get_response.url}')

    # Display the number of pages
    print(f'Successfully fetched results for {selected_channel}. Total pages: {num_pages}')
    display_result(new_soup, num_pages)

    return get_response

# Function to display results
def display_result(soup, num_pages):
    """Render one parsed results page: structure images, a JSON dump and a table.

    Args:
        soup: Parsed results page.
        num_pages: Total page count reported by the caller; accepted for
            interface compatibility and not used here.

    Side effects:
        Overwrites ``search_results.json`` in the working directory and emits
        notebook output through the helper functions.
    """
    # Raw-string patterns: a non-raw '\.' is an invalid escape sequence that
    # newer Python versions warn about.
    script_tag = soup.find('script', string=re.compile(r'window\.structureMolFiles'))
    # .string can be None for a tag without a single text child; treat that as empty.
    script_content = (script_tag.string or '') if script_tag else ''

    # Each assignment looks like: window.structureMolFiles[<index>] = '<molfile>';
    assignment_pattern = re.compile(
        r'window\.structureMolFiles\[(\d+)\]\s*=\s*\'(.*?)\';', re.DOTALL
    )
    structure_mol_files = {
        # The page stores newlines as the two characters "\n"; restore real newlines.
        int(index): molfile.replace('\\n', '\n')
        for index, molfile in assignment_pattern.findall(script_content)
    }

    # Render ChemDraw structures
    render_chemdraw_structures(structure_mol_files)

    # Extract the search results and save them to a JSON file
    search_results = extract_search_results(soup)
    with open('search_results.json', 'w', encoding='utf-8') as json_file:
        json.dump(search_results, json_file, indent=4)

    print("Search results have been saved to search_results.json")

    # Display the results in a table
    display_results_table(search_results)

# Function to extract all search results
def _group_label(group_div, css_class):
    """Return the stripped <span> text inside the child div with ``css_class``, or '' if absent."""
    holder = group_div.find('div', class_=css_class)
    span = holder.find('span') if holder is not None else None
    return span.text.strip() if span is not None else ''


def extract_search_results(soup):
    """Collect every result row on a results page, tagged with its group's supplier and brand.

    Direct child divs of each ``div`` whose class attribute is ``col-24 p-0`` are
    walked in document order. A ``sourcesearch-group`` div sets the supplier and
    brand that apply to the ``grouped-sourcesearch-subrow`` divs that follow it.

    Args:
        soup: Parsed results page.

    Returns:
        A list with one ``extract_data`` result per sub-row. Rows that appear
        before any group header get ``None`` for supplier and brand; a group
        header lacking a supplier or brand element yields ``''`` for that field
        instead of raising.
    """
    search_results = []
    current_supplier = None
    current_brand = None

    for column in soup.find_all('div', class_='col-24 p-0'):
        for div in column.find_all('div', recursive=False):
            classes = div.get('class', [])
            if 'sourcesearch-group' in classes:
                # Always reset both values so a previous group's labels never leak forward.
                current_supplier = _group_label(div, 'sourcesearch-supplier')
                current_brand = _group_label(div, 'sourcesearch-brand')
            elif 'grouped-sourcesearch-subrow' in classes:
                search_results.append(extract_data(div, current_supplier, current_brand))

    return search_results

# Function to extract and structure data from a result entry
def _stripped_text(node):
    """Return ``node.text`` without surrounding whitespace, or '' when ``node`` is None."""
    return node.text.strip() if node is not None else ''


def extract_data(result, supplier, brand):
    """Turn one result row into a flat dict of display fields.

    Args:
        result: The row element; only its ``find`` method is used.
        supplier: Supplier name of the group the row belongs to (copied through).
        brand: Brand name of the group the row belongs to (copied through).

    Returns:
        A dict with the keys ``Supplier``, ``Brand``, ``Label Name``, ``Location``,
        ``Bar Code``, ``Supplier Catalog #``, ``Manufacturer/Brand Part #``,
        ``Created Date``, ``Package``, ``Purity`` and ``View Container URL``.
        Any field whose element is missing is the empty string.
    """
    def titled(title):
        # Most fields are divs identified by their title attribute.
        return _stripped_text(result.find('div', title=title))

    # The label is the bold div that carries any title attribute.
    label_name = _stripped_text(result.find('div', class_='text-bold', title=True))

    # The created date is the div whose class list is exactly ['col-12'].
    created_date = _stripped_text(
        result.find(lambda tag: tag.name == 'div' and tag.get('class') == ['col-12'])
    )

    # Look the link up once; urljoin handles both site-relative paths and full URLs,
    # and a link without href yields '' instead of raising KeyError.
    container_link = result.find('a', title="View Container")
    href = container_link.get('href') if container_link is not None else None
    view_container_href = urljoin(base_url, href) if href else ''

    return {
        'Supplier': supplier,
        'Brand': brand,
        'Label Name': label_name,
        'Location': titled("Location"),
        'Bar Code': titled("Bar Code"),
        'Supplier Catalog #': titled("Supplier Catalog #"),
        'Manufacturer/Brand Part #': titled("Manufacturer/Brand Part #"),
        'Created Date': created_date,
        'Package': titled("Package"),
        'Purity': titled("Purity"),
        'View Container URL': view_container_href
    }

# Function to render ChemDraw structures
def render_chemdraw_structures(structure_mol_files, chunk_size=25):
    """Request structure images for the given mol files and display them inline.

    The mol files are POSTed in chunks together with the index of the chunk's
    first entry (``lowerbound``) and the CSRF token. Each image returned for a
    chunk is embedded as a base64 SVG ``<img>`` whose id is
    ``structure-img-<position>``, where position counts across all chunks.

    Args:
        structure_mol_files: Mapping whose values are mol-file strings; only the
            values, in iteration order, are used.
        chunk_size: Maximum number of mol files per request (default 25).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    image_url = base_url + '/uowisconsin/erd-client/app/secure/structure/getchemdrawstructures'
    mol_files = list(structure_mol_files.values())

    # range() with a step visits 0, chunk_size, 2*chunk_size, ...; an empty list
    # produces no iterations, so nothing is requested.
    for lower_bound in range(0, len(mol_files), chunk_size):
        payload = {
            'molfiles[]': mol_files[lower_bound:lower_bound + chunk_size],
            'lowerbound': lower_bound,
            'csrftoken': csrftoken
        }
        response = s.post(image_url, data=payload)
        if response.status_code != 200:
            print("Error getting structure images for mol files")
            continue

        try:
            result = response.json()
        except ValueError:
            # A non-JSON body is reported like any other failed chunk.
            print("Error getting structure images for mol files")
            continue

        # The original check compares against the string 'false', not a boolean.
        if result.get('error') != 'false':
            print("Error getting structure images for mol files")
            continue

        for offset, image in enumerate(result.get('images', [])):
            img_id = f"structure-img-{lower_bound + offset}"
            encoded = base64.b64encode(image.encode("utf-8")).decode("utf-8")
            display(HTML(f'<img id="{img_id}" src="data:image/svg+xml;base64,{encoded}" width="200px" height="200px" class="bg-white">'))

# Function to display results in a tabular format
def display_results_table(data):
    """Show ``data`` (a list of dictionaries) as a DataFrame in the notebook output."""
    display(pd.DataFrame(data))