## Contact Us form / Formulare

### Approach:
1.  Search for certain keywords related to the contact us formulare. 
2. Check all links one by one which includes these keywords
3. Now for each link which contains this keyword, check if there is a send/submit button present

### Improvements:
1. Use word similarity from BERT with a threshold
2. Translate all the split words from href before the comparison
    

In [1]:
# Example usage
websites = [
    "https://www.kropp.de/",
    "https://beaufort.lu/",
    "http://www.pluzine.me/",
    "https://www.jonava.lt/",
    "http://www.islalocalcouncil.com/",
    "http://qrendilocalcouncil.org.mt/",
    "https://www.michaelnbach.at/BUeRGERSERVICE/Gemeindeservice/Formulare"
    # "http://pembroke.gov.mt/",
    # "https://hamrunspartansfc.com/",
    # "https://www.maltairport.com/"
]

keywords = ["Kontakt", "contact", "contact us", "question", "contactus", "formulare"]


In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote

def extract_links_with_keywords(url, keywords):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all <a> tags with href attribute
    links = soup.find_all("a")

    matched_links = []

    for link in links:
        href = link.get("href")
        if href:
            href = href.strip()
            decoded_href = unquote(href)  # Decode URL

            if any(keyword.lower() in decoded_href.lower() for keyword in keywords):
                # Check if the link starts with "http://" or "https://"
                if not decoded_href.startswith(("http://", "https://")):
                    # Append the link to the base URL of the webpage
                    decoded_href = urljoin(url, decoded_href)

                matched_links.append(decoded_href)

    return matched_links


In [3]:
import requests
from bs4 import BeautifulSoup

def has_contact_form(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.text, "html.parser")

    forms = soup.find_all("form")

    for form in forms:
        submit_button = form.find("input", type="submit")
        if not submit_button:
            submit_button = form.find("button", type="submit")

        if submit_button:
            return True

    return False


In [4]:
def get_all_websites_with_contactus(websites):
    all_matched_links = []
    for webpage_url in websites:
        matched_links = extract_links_with_keywords(webpage_url, keywords)
        all_matched_links.extend(matched_links)
    return all_matched_links

In [5]:

links =  get_all_websites_with_contactus(websites)

def has_formulare(links):
    for link in links:
        if has_contact_form(link):
            print(f"The link '{link}' contains a contact form.")
        else:
            print(f"The link '{link}' does not contain a contact form.")
has_formulare(links)

The link 'https://www.kropp.de/Kurzmenü/Kontakt/' contains a contact form.
The link 'https://beaufort.lu/contact/' contains a contact form.
The link 'https://beaufort.lu/contact/contactez-nous/' contains a contact form.
The link 'https://beaufort.lu/contact/annuaire/' contains a contact form.
The link 'https://beaufort.lu/contact/contactez-nous/' contains a contact form.
The link 'http://qrendilocalcouncil.org.mt/contactus' contains a contact form.
The link 'http://qrendilocalcouncil.org.mt/contactus.php' does not contain a contact form.
The link 'https://www.michaelnbach.at/BUeRGERSERVICE/Gemeindeservice/Formulare' does not contain a contact form.
The link 'https://www.michaelnbach.at/BUeRGERSERVICE/Gemeindeservice/Formulare' does not contain a contact form.
