In [50]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import pandas as pd
import re

def get_soup(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup

def get_links(soup, base_url):
    links = []
    for link in soup.find_all('a', href=True):
        url = link['href']
        # Resolve relative links
        url = urllib.parse.urljoin(base_url, url)
        # Only add links that start with the base URL
        if url.startswith(base_url) and url not in links:
            links.append(url)
    return links

def get_text(soup):
    # Ignore non-text content by only extracting text within <p> tags
    text = '\n'.join([paragraph.get_text() for paragraph in soup.find_all('p')])
    return text

def get_tables(soup):
    tables = []
    for table in soup.find_all('table'):
        table_text = '\n'.join([str(row) for row in table.find_all('tr')])
        tables.append(table_text)
    return '\n'.join(tables)

def get_title(soup):
    title = soup.find('title').get_text()
    return title

def scrape_website(base_url):
    visited = set()
    to_visit = [(base_url, None)]  # (url, parent_url)
    data = []

    while to_visit:
        url, parent_url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)

        soup = get_soup(url)

        title = get_title(soup)
        text = get_text(soup)
        tables = get_tables(soup)

        data.append((title, text, tables))

        links = get_links(soup, base_url)
        to_visit.extend((link, url) for link in links if link not in visited)

    df = pd.DataFrame(data, columns=['title', 'text', 'tables'])

    return df

def remove_unwanted_text(df, unwanted_text):
    # Apply the removal to the 'text' column of the DataFrame
    df['text'] = df['text'].apply(lambda x: x.replace(unwanted_text, '', 1) if x.startswith(unwanted_text) else x)
    return df

base_url = "https://www.uscis.gov/working-in-the-united-states"
df = scrape_website(base_url)

unwanted_text = "Official websites use .gov\nA .gov website belongs to an official government organization in the United States.\nSecure .gov websites use HTTPS\nA lock ( A locked padlock  ) or https:// means you've safely connected to the .gov website. Share sensitive information only on official, secure websites.\n"
df = remove_unwanted_text(df, unwanted_text)


In [51]:
def preprocess_text(text):
    # Remove reference tags (e.g., [1], [2], etc.)
    text = re.sub(r'\[\d+\]', '', text)
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def split_into_sections(text, max_length=1600):
    # Split the text into paragraphs
    paragraphs = text.split('\n')
    sections = []
    section = ''
    for paragraph in paragraphs:
        # If adding the next paragraph doesn't exceed the maximum length,
        # add the paragraph to the current section
        if len(section) + len(paragraph) < max_length:
            section += paragraph
        else:
            # If it does, start a new section
            sections.append(section)
            section = paragraph
    # Don't forget to add the last section
    if section:
        sections.append(section)
    return sections

def post_process(df):
    # Discard less relevant-looking sections like "External Links" and "Footnotes"
    df = df[~df['title'].str.contains('External Links|Footnotes', case=False)]
    # Clean up the text
    df['text'] = df['text'].apply(preprocess_text)
    # Split each article into sections
    df['sections'] = df['text'].apply(split_into_sections)
    # Prepend titles to each section's text
    df['sections'] = df.apply(lambda row: [f"{row['title']}\n{section}" for section in row['sections']], axis=1)
    # Unroll the sections into separate rows
    df = df.explode('sections')
    df['text'] = df['sections']
    df.drop(columns=['sections'], inplace=True)
    return df

df = post_process(df)


In [53]:
df.to_csv("visa_embeddings.csv", index=False)

In [35]:
df.head(10)

Unnamed: 0,title,url,parent_url,text,tables
0,Working in the United States | USCIS,https://www.uscis.gov/working-in-the-united-st...,,Working in the United States | USCIS\n,
0,Working in the United States | USCIS,https://www.uscis.gov/working-in-the-united-st...,,Working in the United States | USCIS\nMany non...,
1,Petition Process Overview | USCIS,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Petition Process Overview | USCIS\nIf you woul...,
2,Report Labor Abuses | USCIS,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Report Labor Abuses | USCIS\n,
2,Report Labor Abuses | USCIS,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Report Labor Abuses | USCIS\nWe are committed ...,
3,Options for Nonimmigrant Workers Following Ter...,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Options for Nonimmigrant Workers Following Ter...,
3,Options for Nonimmigrant Workers Following Ter...,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Options for Nonimmigrant Workers Following Ter...,
4,Employment Authorization in Compelling Circums...,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Employment Authorization in Compelling Circums...,"<tr><th scope=""col"">To establish…</th>\n<th sc..."
4,Employment Authorization in Compelling Circums...,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Employment Authorization in Compelling Circums...,"<tr><th scope=""col"">To establish…</th>\n<th sc..."
5,Validation Instrument for Business Enterprises...,https://www.uscis.gov/working-in-the-united-st...,https://www.uscis.gov/working-in-the-united-st...,Validation Instrument for Business Enterprises...,


In [36]:
df.shape

(173, 5)

In [28]:
df.to_csv("visa_embeddings.csv")

In [38]:
df['title'].unique()

array(['Working in the United States | USCIS',
       'Petition Process Overview | USCIS', 'Report Labor Abuses | USCIS',
       'Options for Nonimmigrant Workers Following Termination of Employment | USCIS',
       'Employment Authorization in Compelling Circumstances | USCIS',
       'Validation Instrument for Business Enterprises (VIBE) Program | USCIS',
       'VIBE Questions and Answers | USCIS',
       'Employment Authorization | USCIS', 'Employer Information | USCIS',
       'DHS Support of the Enforcement of Labor and Employment Laws | USCIS',
       'Information for Employers and Employees | USCIS',
       'WB Temporary Business Visitor under Visa Waiver Program | USCIS',
       'GB Temporary Visitor to Guam | USCIS',
       'B-1 Temporary Business Visitor | USCIS',
       'Temporary Visitors for Business | USCIS',
       'Students and Employment | USCIS',
       'Changing to a Nonimmigrant F or M Student Status | USCIS',
       'Exchange Visitors | USCIS', 'Conrad 30 Waiver P

626