CSV Sources:

Top 10k websites:
https://www.domcop.com/top-10-million-websites

Trusted Certificates from Apple:
https://support.apple.com/en-us/103100

In [14]:
import ssl
import socket
import pandas as pd
import datetime
import urllib.parse
from urllib.parse import urlparse, unquote, urljoin
import re
import tldextract
import whois
import requests
from bs4 import BeautifulSoup
from googlesearch import search



In [15]:
def removeHTTPS(url):
    """
    Strip a leading "https://" or "http://" from a url.

    Anything that does not start with one of those two prefixes is
    returned unchanged.
    """
    for scheme_prefix in ("https://", "http://"):
        if url.startswith(scheme_prefix):
            # Drop exactly the matched prefix and keep the remainder.
            return url[len(scheme_prefix):]
    return url

# Sample URLs for trying removeHTTPS by hand; the print calls are commented out.
url = "https://www.facebook.com"
url2 = "http://www.example.com"
#print(removeHTTPS(url))
#print(removeHTTPS(url2))

www.facebook.com
www.example.com


In [39]:
def get_certificate_info(url, timeout=10):
    """
    Retrieves the certification information from a given url

    Parameters:
        url (str): a full URL (ex: https://www.facebook.com/login) or a bare
            hostname (ex: www.facebook.com)
        timeout (float): seconds to wait for the TCP connection / TLS handshake

    Returns:
        dict: the peer certificate as returned by SSLSocket.getpeercert()

    Raises:
        ValueError: if no hostname can be extracted from url
        OSError: on connection or TLS failures (ssl.SSLError is a subclass)
    """
    # urlparse only recognises a host after "//", so bare hostnames get an
    # empty-scheme prefix. Parsing (instead of just stripping the scheme)
    # keeps any path, port or userinfo out of the hostname.
    has_scheme = re.match(r'[a-zA-Z][a-zA-Z0-9+.\-]*://', url) is not None
    hostname = urlparse(url if has_scheme else f"//{url}").hostname
    if not hostname:
        raise ValueError(f"Cannot determine a hostname from {url!r}")

    context = ssl.create_default_context()
    # The timeout keeps an unresponsive host from blocking the caller forever.
    with socket.create_connection((hostname, 443), timeout=timeout) as sock:
        with context.wrap_socket(sock, server_hostname=hostname) as ssock:
            return ssock.getpeercert()

def extract_certificate_issuer(url):
    """
    Return the first word of the issuer's commonName for the certificate
    served at url, or None when no non-empty commonName is present.
    """
    issuer_rdns = get_certificate_info(url).get('issuer', ())
    for rdn in issuer_rdns:
        # Only the first commonName entry inside each group is considered.
        first_common_name = next(
            (field[1] for field in rdn if field[0] == 'commonName'),
            None,
        )
        if first_common_name:
            return first_common_name.split(' ')[0]
    return None

def read_trusted_certificates(url):
    """
    If the issuer is in the list of trusted certificates given by Apple (List of available trusted root certificates
    in iOS 16, iPadOS 16, macOS 13, tvOS 16, and watchOS 9), then legitimate.
    https://support.apple.com/en-us/103100

    Parameters:
        url (str): the url whose certificate issuer is looked up

    Returns:
        int: 0 (legitimate) if the issuer name occurs in the first column of
        apple_trusted_certificates.csv, otherwise 1 (phishing)
    """
    issuer = extract_certificate_issuer(url)
    if not issuer:
        # No issuer commonName: nothing to match, so it cannot be trusted.
        return 1  # phishing

    dataset = pd.read_csv('apple_trusted_certificates.csv')
    certificate_names = dataset.iloc[:, 0]
    # regex=False: the issuer is a literal name, not a pattern.
    # na=False: blank rows in the CSV count as "no match".
    search_results = certificate_names.str.contains(issuer, regex=False, na=False)
    return 0 if search_results.any() else 1
    
def get_certificate_age(url):
    """
    Return the length of the certificate's validity window in years.

    The value is (notAfter - notBefore) in whole days divided by 365.
    None is returned when the certificate or either date is missing.
    """
    cert = get_certificate_info(removeHTTPS(url))
    if not cert:
        return None

    starts, ends = cert.get('notBefore', None), cert.get('notAfter', None)
    if not (starts and ends):
        return None

    stamp_format = "%b %d %H:%M:%S %Y %Z"
    valid_from = datetime.datetime.strptime(starts, stamp_format)
    valid_until = datetime.datetime.strptime(ends, stamp_format)
    return (valid_until - valid_from).days / 365  # span in years


# Example usage:
# The calls inside the triple-quoted string below are disabled; they are part
# of a plain string literal and are not executed.
url = 'https://www.facebook.com'
'''
print(get_certificate_info(url))
print(extract_certificate_issuer(url))
print(read_trusted_certificates(url))
print(get_certificate_age(url))
'''

{'subject': ((('countryName', 'US'),), (('stateOrProvinceName', 'California'),), (('localityName', 'Menlo Park'),), (('organizationName', 'Meta Platforms, Inc.'),), (('commonName', '*.facebook.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('organizationalUnitName', 'www.digicert.com'),), (('commonName', 'DigiCert SHA2 High Assurance Server CA'),)), 'version': 3, 'serialNumber': '0B6FAD4DB050685087922C21187AAFBE', 'notBefore': 'Feb  9 00:00:00 2024 GMT', 'notAfter': 'May  7 23:59:59 2024 GMT', 'subjectAltName': (('DNS', '*.facebook.com'), ('DNS', '*.facebook.net'), ('DNS', '*.fbcdn.net'), ('DNS', '*.fbsbx.com'), ('DNS', '*.m.facebook.com'), ('DNS', '*.messenger.com'), ('DNS', '*.xx.fbcdn.net'), ('DNS', '*.xy.fbcdn.net'), ('DNS', '*.xz.fbcdn.net'), ('DNS', 'facebook.com'), ('DNS', 'messenger.com')), 'OCSP': ('http://ocsp.digicert.com',), 'caIssuers': ('http://cacerts.digicert.com/DigiCertSHA2HighAssuranceServerCA.crt',), 'crlDistributionPoints':

In [17]:
def extract_domain(url):
    """
    ex: https://www.msnbc.com
    returns: msnbc
    """
    # Only the `domain` field of the tldextract result is kept.
    return str(tldextract.extract(url).domain)

# Example usage:
# Runs extract_domain once on a sample URL; the print is commented out.
url = "https://www.msnbc.com"
domain = extract_domain(url)
#print(domain)  # Output: msnbc


In [69]:
# Address Bar based Features

def having_IP_Address(url):
    """
    Checks whether the host part of a url is an IP address instead of a name

    Parameters:
        url (str): the url to inspect

    Returns:
        int: 1 if the host is an IPv4/IPv6 literal or a dotted numeric / hex
        form, otherwise 0
    """
    import ipaddress  # local import: only this function needs it

    try:
        # .hostname drops userinfo, port and IPv6 brackets, which would
        # otherwise hide an address such as 10.0.0.1:8080.
        host = urlparse(unquote(url)).hostname
    except ValueError:
        return 0  # malformed netloc (e.g. unbalanced IPv6 brackets)
    if not host:
        return 0

    try:
        ipaddress.ip_address(host)
        return 1  # valid IPv4 or IPv6 literal
    except ValueError:
        pass

    # Dotted-decimal form the original check accepted (no range validation).
    if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', host):
        return 1
    # Hex-encoded octets, e.g. 0x58.0xcc.0xca.0x62 (hostname is lower-cased).
    if re.match(r'^(?:0x[0-9a-f]{1,2}\.){3}0x[0-9a-f]{1,2}$', host):
        return 1
    return 0  # URL does not have an IP address

def URL_Length(url):
    """
    Flag long urls: returns 1 (phishing) when the url has 54 or more
    characters, otherwise 0 (not phishing).
    """
    return 1 if len(url) >= 54 else 0
    

def shortening_service(url):
    """
    Checks whether a url belongs to a known URL shortening service

    Parameters:
        url (str): the url to inspect

    Returns:
        int: 1 (phishing) if the registered domain is a known shortener,
        otherwise 0 (legitimate)
    """
    extracted = tldextract.extract(url)
    # tldextract splits "bit.ly" into domain="bit" and suffix="ly", so the
    # comparison has to use domain + suffix rather than the bare domain.
    registered_domain = f"{extracted.domain}.{extracted.suffix}".lower()

    # List of known URL shortening services
    shortening_services = {
        'bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co', 'buff.ly', 'tiny.cc',
    }

    return 1 if registered_domain in shortening_services else 0

def having_At_Symbol(url):
    """
    Returns 1 (phishing) when the url contains an '@' anywhere, else 0.
    """
    return int('@' in url)
    
def double_slashing_redirecting(url):
    """
    Returns 1 (phishing) when "//" still occurs after the leading
    http:// or https:// has been removed, otherwise 0 (legitimate).
    """
    # No "//" at all: nothing to inspect.
    if '//' not in url:
        return 0

    # Any "//" left once the scheme is stripped is a redirect-style slash pair.
    remainder = removeHTTPS(url)
    return 1 if "//" in remainder else 0
    
def prefix_suffix(url):
    """
    Checks whether the host name of a url contains a dash

    Parameters:
        url (str): the url to inspect, with or without a scheme

    Returns:
        int: 1 (phishing) if the host contains "-", otherwise 0 (legitimate)
    """
    host = urlparse(url).hostname
    if host is None:
        # Scheme-less input ("face-book.com/login"): urlparse needs a leading
        # "//" to treat the first component as the host.
        host = urlparse(f"//{url}").hostname or ""
    # Only the host is checked, so dashes in the path, userinfo or query
    # do not trigger the flag.
    return 1 if "-" in host else 0
    
def having_sub_domain(url):
    """
    Checks whether the host of a url has sub domains

    A leading "www." is ignored; a host with exactly one remaining dot is
    treated as having no sub domain.

    Parameters:
        url (str): the url to inspect, with or without a scheme

    Returns:
        int: 0 if the host has exactly one dot, otherwise 1
    """
    host = urlparse(url).hostname
    if host is None:
        # Scheme-less input: give urlparse the "//" it needs to find the host.
        host = urlparse(f"//{url}").hostname or ""
    # Strip only a leading "www." so hosts like "awww.example.com" keep
    # their real label.
    if host.startswith("www."):
        host = host[len("www."):]
    # NOTE(review): multi-part suffixes such as "co.uk" still count as a sub
    # domain here, as in the original dot-count rule.
    return 0 if host.count(".") == 1 else 1

def SSLfinal_state(url):
    """
    Classifies a url by the state of its TLS certificate

    Parameters:
        url (str): the url to inspect

    Returns:
        0 (legitimate) if the issuer is trusted and the certificate validity
        span is at least 1 year, 1 (phishing) if not or if the certificate
        fails verification, None if the url is not https or the host cannot
        be reached
    """
    # Require the full scheme so strings that merely begin with "https"
    # (e.g. "httpsecure.example.com") are not treated as https urls.
    if not url.startswith("https://"):
        return None  # Indeterminate

    try:
        cert = get_certificate_info(url)
        if not cert:
            return None  # Indeterminate
        issuer_trusted = read_trusted_certificates(url)
        certificate_age_years = get_certificate_age(url)
    except ssl.SSLCertVerificationError:
        # Self-signed, expired or mismatched certificate: not trusted.
        return 1  # Phishing
    except (ssl.SSLError, socket.timeout, socket.gaierror, ConnectionError):
        # Host unreachable or handshake failed: cannot decide.
        return None

    # Unable to determine the SSL certificate status
    if certificate_age_years is None:
        return None

    if issuer_trusted == 0 and certificate_age_years >= 1:
        return 0  # Legitimate
    return 1  # Phishing

def domain_registration_length(url):
    """
    Returns 1 when the span between the WHOIS creation and expiration dates
    is one year or less, otherwise 0. Any failure is reported and treated
    as 0 (legitimate).
    """
    def _first(value):
        # WHOIS fields may come back as a list; keep the first entry.
        return value[0] if isinstance(value, list) else value

    try:
        record = whois.whois(url)
        expires_on = _first(record.expiration_date)
        created_on = _first(record.creation_date)

        registered_years = (expires_on - created_on).days / 365
        return 1 if registered_years <= 1 else 0
    except Exception as e:
        print(f"Error fetching domain registration information: {e}")
        return 0  # Assuming legitimate if there's an error
'''
def check_favicon(url):
    try:
        # Fetch HTML content of the webpage
        response = requests.get(url)
        if response.status_code != 200:
            return 0  # Assuming legitimate if unable to fetch HTML
        
        html_content = response.text
        
        # Parse HTML content to find the favicon link
        soup = BeautifulSoup(html_content, 'html.parser')
        favicon_link = None
        favicon_list = []
        for link in soup.find_all('link', rel='icon'):
            favicon_link = link.get('href')
            favicon_list.append(favicon_link)            
            #break  # Only consider the first favicon link
        
        if not favicon_link:
            return 0  # Assuming legitimate if no favicon link found
        
        print(favicon_link)
        domain = extract_domain(url)
        
        not_domain_count = 0
        domain_count = 0
        # Compare the domains
        for domain in favicon_list:
            if  domain not in favicon_link:
                not_domain_count += 1
                #return 1
            else:
                domain_count += 1
                #return 0
        return [1 if (not_domain_count/len(favicon_list)) > 0.5 else 0 for _ in range(len(favicon_list))]
        
    except Exception as e:
        print(f"Error checking favicon: {e}")
        return 0  # Assuming legitimate if there's an error
'''

def check_favicon(url, timeout=10):
    """
    Checks whether the page's favicons are loaded from another host

    Parameters:
        url (str): the page to inspect
        timeout (float): seconds to wait for the HTTP response

    Returns:
        int: 1 if more than half of the favicon links point to a different
        host, otherwise 0. Fetch problems and pages without favicon links
        are treated as 0 (legitimate).
    """
    try:
        # Fetch HTML content of the webpage
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return 0  # Assuming legitimate if unable to fetch HTML

        # Parse HTML content to find the favicon links; skip tags with no href
        soup = BeautifulSoup(response.text, 'html.parser')
        favicon_links = [
            link.get('href') for link in soup.find_all('link', rel='icon')
            if link.get('href')
        ]
        if not favicon_links:
            return 0  # Assuming legitimate if no favicon link found

        page_host = urlparse(url).netloc.lower()

        def _is_external(href):
            # Resolve relative hrefs ("/favicon.ico") against the page first;
            # otherwise every relative icon looks foreign.
            icon_host = urlparse(urljoin(url, href)).netloc.lower()
            if not icon_host:
                return False  # e.g. data: URIs embedded in the page
            return not (icon_host == page_host or icon_host.endswith("." + page_host))

        not_domain_count = sum(1 for href in favicon_links if _is_external(href))
        return 1 if not_domain_count / len(favicon_links) > 0.5 else 0

    except Exception as e:
        print(f"Error checking favicon: {e}")
        return 0  # Assuming legitimate if there's an error


def check_non_standard_port(url):
    """
    Returns 1 when an http/https url names an explicit port other than the
    scheme's default (80 / 443), otherwise 0. Errors are reported and
    treated as 0 (legitimate).
    """
    default_ports = {'http': 80, 'https': 443}
    try:
        parts = urlparse(url)
        explicit_port = parts.port  # may raise ValueError for an invalid port
        expected_port = default_ports.get(parts.scheme)

        # A missing port means the default is in use; other schemes are never flagged.
        if explicit_port and expected_port is not None and explicit_port != expected_port:
            return 1
        return 0

    except Exception as e:
        print(f"Error checking non-standard port: {e}")
        return 0  # Assuming legitimate if there's an error

    
def check_https_token(url):
    """
    Returns 1 when the text "https" appears (case-insensitively) inside the
    network-location part of the url, otherwise 0. Errors are reported and
    treated as 0 (legitimate).
    """
    try:
        host_part = urlparse(url).netloc
        return int("https" in host_part.lower())

    except Exception as e:
        print(f"Error checking HTTPS token: {e}")
        return 0  # Assuming legitimate if there's an error

# Demo run: only the check_favicon call is live. The other calls sit inside
# bare triple-quoted strings and are therefore not executed.
url = "https://www.facebook.com"

'''
print(having_IP_Address(url))
print(URL_Length(url))
print(shortening_service(url))
print(having_At_Symbol(url))
print(double_slashing_redirecting(url))
print(prefix_suffix(url))
print(having_sub_domain(url))
print(SSLfinal_state(url))

print(domain_registration_length(url))
'''
print(check_favicon(url))
'''
print(check_non_standard_port(url))
print(check_https_token(url))
'''

1


'\nprint(check_non_standard_port(url))\nprint(check_https_token(url))\n'

In [19]:
def extract_domain_and_extension(url):
    """
    Join the domain and suffix that tldextract reports for a url.

    ex: https://www.msnbc.com
    returns: msnbc.com
    """
    parts = tldextract.extract(url)
    return ".".join([f"{parts.domain}", f"{parts.suffix}"])

def get_html_content(url, timeout=10):
    """
    Downloads the HTML of a page

    Parameters:
        url (str): the page to fetch
        timeout (float): seconds to wait for the HTTP response

    Returns:
        str: the response body decoded as UTF-8, or None if the request
        fails or the status code is not 200
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            # errors='replace' keeps pages with a few non-UTF-8 bytes usable
            # instead of discarding the whole document.
            return response.content.decode('utf-8', errors='replace')
        print(f"Failed to fetch HTML content from {url}. Status code: {response.status_code}")
        return None
    except Exception as e:
        print(f"An error occurred while fetching HTML content from {url}: {e}")
        return None
    
#print(get_html_content(url))


Main domain: msnbc.com
<!DOCTYPE html><html lang="en"><head><link href="https://nodeassets.nbcnews.com/_next/static/css/6646d35dd7542bc5.css" rel="preload" as="style"/><style>@font-face{font-family:iconfont;src:url(https://nodeassets.nbcnews.com/_next/static/assets/fonts/iconfont.d4b55648bc.woff2)format("woff2");font-display:swap;}</style><link href="https://nodeassets.nbcnews.com/_next/static/css/6646d35dd7542bc5.css" rel="stylesheet"/><link rel="shortcut icon" type="image/x-icon" href="https://nodeassets.nbcnews.com/cdnassets/projects/ramen/favicon/msnbc/all-other-sizes-PNG.ico/favicon.ico"/><link rel="icon" type="image/png" sizes="16x16" href="https://nodeassets.nbcnews.com/cdnassets/projects/ramen/favicon/msnbc/all-other-sizes-PNG.ico/favicon-16x16.png"/><link rel="icon" type="image/png" sizes="32x32" href="https://nodeassets.nbcnews.com/cdnassets/projects/ramen/favicon/msnbc/all-other-sizes-PNG.ico/favicon-32x32.png"/><link rel="icon" type="image/png" sizes="96x96" href="https://n

In [46]:
def get_anchor_tags(url, timeout=10):
    """
    Collects the href of every anchor tag on a page

    Parameters:
        url (str): the page to fetch
        timeout (float): seconds to wait for the HTTP response

    Returns:
        list[str]: href values of all <a> tags that have one; an empty list
        if the page does not return status 200
    """
    # Send a GET request to the webpage
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Print an error message if the request failed
        print("Failed to fetch webpage")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors without an href, so callers never see None.
    return [tag['href'] for tag in soup.find_all('a', href=True)]
    
def get_Meta_Script_Link_Tags(url):
    """
    Fetch a page and return three lists holding the string form of every
    <meta>, <script> and <link> tag, in that order. Three empty lists are
    returned when the page does not answer with status 200.
    """
    response = requests.get(url)

    if response.status_code != 200:
        # Print an error message if the request failed
        print("Failed to fetch webpage")
        return [], [], []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Same extraction for each tag name, unpacked into the three result lists.
    meta_list, script_list, link_list = (
        [str(tag) for tag in soup.find_all(tag_name)]
        for tag_name in ('meta', 'script', 'link')
    )
    return meta_list, script_list, link_list

# Sample URL for the tag helpers above; both demo calls are commented out.
url = 'https://www.facebook.com'
#print(get_anchor_tags(url))
#print(get_Meta_Script_Link_Tags(url))

['#', 'https://www.facebook.com/recover/initiate/?privacy_mutation_token=eyJ0eXBlIjowLCJjcmVhdGlvbl90aW1lIjoxNzE0NDQxMzE3LCJjYWxsc2l0ZV9pZCI6MzgxMjI5MDc5NTc1OTQ2fQ%3D%3D&ars=facebook_login', '#', '/pages/create/?ref_type=registration_form', 'https://es-la.facebook.com/', 'https://fr-fr.facebook.com/', 'https://zh-cn.facebook.com/', 'https://ar-ar.facebook.com/', 'https://pt-br.facebook.com/', 'https://it-it.facebook.com/', 'https://ko-kr.facebook.com/', 'https://de-de.facebook.com/', 'https://hi-in.facebook.com/', 'https://ja-jp.facebook.com/', '#', '/reg/', '/login/', 'https://messenger.com/', '/lite/', 'https://www.facebook.com/watch/', '/places/', '/games/', '/marketplace/', 'https://pay.facebook.com/', 'https://www.meta.com/', 'https://www.meta.com/quest/', 'https://www.meta.ai/', 'https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.instagram.com%2F&h=AT1vBkaaP1-DkZSIzKcGnVwAUzKGCj1rc6r7Nnevz_8umgQZxUTEJWYHJEciYMb1_WelklL0da7NndC0u2rK7kkcNW_wumZjcbPdSbgCa0k7Ld2sVTGoqhpVIZt14qwTWngCrxs

In [70]:
# Abnormal based features

def request_URL(url, timeout=10):
    """
    Estimates how much of a page's embedded content comes from other hosts

    Parameters:
        url (str): the page to inspect
        timeout (float): seconds to wait for the HTTP response

    Returns:
        0 (legitimate) if externally hosted src urls make up less than 22% of
        all collected src/href urls, 1 (phishing) otherwise, or the string
        "Error" if the page cannot be fetched or parsed
    """
    try:
        # Retrieve and parse the HTML content of the URL
        html_content = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html_content, 'html.parser')

        # src attributes are the embedded objects; href attributes only
        # contribute to the total.
        # NOTE(review): counting <a> hrefs in the denominator is kept from the
        # original - confirm that this is intended.
        src_urls = []
        href_urls = []
        for tag in soup.find_all(['img', 'video', 'audio', 'a']):
            src = tag.get('src')
            href = tag.get('href')
            if src:
                src_urls.append(src)
            if href:
                href_urls.append(href)

        page_host = urlparse(url).netloc

        def _is_external(link):
            # Resolve relative paths first: a bare "/img/logo.png" has an empty
            # netloc and would otherwise be counted as a foreign host.
            link_host = urlparse(urljoin(url, link)).netloc
            return link_host not in ('', page_host)

        external_count = sum(1 for link in src_urls if _is_external(link))

        # Calculate the ratio of external URLs to total URLs (as a percentage)
        total_request_urls = len(src_urls) + len(href_urls)
        if total_request_urls > 0:
            external_ratio = external_count / total_request_urls * 100
        else:
            external_ratio = 0

        # Legitimate if < 22%, phishing otherwise
        return 0 if external_ratio < 22 else 1

    except Exception as e:
        print("An error occurred:", e)
        return "Error"  # Error or unable to determine legitimacy


def check_anchor_percentage(url):
    """
    Measures how many anchor tags on a page point to a different domain

    Parameters:
        url (str): the page to inspect

    Returns:
        int: 0 if fewer than 31% of the counted anchors point to another
        domain (or none are counted), otherwise 1. Errors are reported and
        treated as 0 (legitimate).
    """
    try:
        anchors = get_anchor_tags(url)
        website_domain = extract_domain_and_extension(url)
        same_domain_count = 0
        different_domain_count = 0

        for anchor in anchors:
            # Skip missing, empty and fragment-only hrefs
            if not anchor or not anchor.strip() or anchor.strip().startswith('#'):
                continue

            # Resolve BEFORE looking at the netloc, so relative links
            # ("/login/") are counted as same-domain instead of dropped.
            parsed_anchor = urlparse(urljoin(url, anchor.strip()))

            # Ignore non-web targets such as javascript: and mailto:
            if parsed_anchor.scheme not in ('http', 'https') or not parsed_anchor.netloc:
                continue

            if website_domain in parsed_anchor.netloc:
                same_domain_count += 1
            else:
                different_domain_count += 1

        total_anchors = same_domain_count + different_domain_count
        if total_anchors == 0:
            return 0  # Assuming legitimate if no anchor tags are found

        percentage = different_domain_count / total_anchors
        return 0 if percentage < 0.31 else 1

    except Exception as e:
        print(f"Error checking anchor percentage: {e}")
        return 0  # Assuming legitimate if there's an error


def calculate_links_percentage(url, timeout=10):
    """
    Measures how many <meta>, <script> and <link> urls point to another domain

    Parameters:
        url (str): the page to inspect
        timeout (float): seconds to wait for the HTTP response

    Returns:
        0 (legitimate) if fewer than 17% of the collected urls point to a
        different domain, 1 otherwise, "No links found" if the page has no
        such urls, or "Error" if the page cannot be fetched
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.text, "html.parser")

        # Each tag type keeps its url in a different attribute:
        # <script src>, <link href>, and <meta content> when it holds a url.
        targets = [tag['src'] for tag in soup.find_all("script", src=True)]
        targets += [tag['href'] for tag in soup.find_all("link", href=True)]
        targets += [
            tag['content'] for tag in soup.find_all("meta", content=True)
            if tag['content'].startswith(('http://', 'https://', '//'))
        ]

        # Check if there are any links present before calculating percentage
        total_links_count = len(targets)
        if total_links_count == 0:
            return "No links found"

        page_host = urlparse(url).netloc
        same_domain_count = 0
        for target in targets:
            target_host = urlparse(target).netloc
            # Relative urls (empty netloc) belong to the page's own domain.
            if target_host == '' or page_host in target_host:
                same_domain_count += 1

        # The threshold applies to the share of EXTERNAL links; applying it to
        # the same-domain share would flag fully self-hosted pages.
        external_percentage = (total_links_count - same_domain_count) / total_links_count * 100
        return 0 if external_percentage < 17 else 1

    except Exception as e:
        print("An error occurred:", e)
        return "Error"


def check_sfh(url):
    """
    Detect suspicious form handlers in the HTML code and return 1 for phishing, 0 for legitimate.

    A form is suspicious when its action is missing, empty or "about:blank".
    If the page cannot be fetched the result is 0 (legitimate).
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0

    soup = BeautifulSoup(html_content, 'html.parser')
    for form in soup.find_all('form'):
        action = form.get('action')
        if not action or action.strip().lower() == "about:blank":
            return 1  # Phishing

    return 0  # Legitimate

def check_email_submission(url):
    """
    Checks whether a page submits information by e-mail

    Parameters:
        url (str): the page to inspect

    Returns:
        int: 1 if the page has a mailto: link or a script calling mail(),
        otherwise 0. If the page cannot be fetched the result is 0.
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0
    soup = BeautifulSoup(html_content, 'html.parser')

    # Check for mailto: links
    mailto_links = soup.find_all(href=re.compile(r'^mailto:', re.IGNORECASE))

    # Check for mail() function in script code. The \b keeps identifiers that
    # merely end in "mail" (sendEmail(...), gmail(...)) from matching.
    mail_function_found = any(
        re.search(r'\bmail\s*\(.*\)', script.get_text())
        for script in soup.find_all('script')
    )

    return 1 if mailto_links or mail_function_found else 0
    
def check_abnormal_url(url):
    """
    Checks whether the domain of a url has a WHOIS record

    Parameters:
        url (str): the url to inspect

    Returns:
        0 if a WHOIS record with a domain name is found, 1 if the url has no
        domain or the record is empty, "Unknown" if the lookup fails
    """
    if not extract_domain(url):
        return 1  # If hostname is not included in URL

    # WHOIS needs the registrable name ("facebook.com"), not the bare label
    # ("facebook"); this matches the other WHOIS-based checks in this file.
    registered_domain = extract_domain_and_extension(url)
    try:
        record = whois.whois(registered_domain)
        # A record whose fields are all empty is still a non-empty object, so
        # test for an actual domain name rather than the record's truthiness.
        if record and getattr(record, 'domain_name', None):
            return 0
        return 1
    except Exception as e:
        print("Error:", e)
        return "Unknown"

# Example usage:
# Only the url assignment is live; every feature call below is commented out.
#url = "https://www.msnbc.com"
url = "https://www.facebook.com"
#print('Request URL', request_URL(url))
#print('Anchor percentage:', check_anchor_percentage(url))
#print('Links percentage:', calculate_links_percentage(url))
#print('SFH:', check_sfh(url))
#print('Email submission:', check_email_submission(url))
#print('Abnormal url:', check_abnormal_url(url))

SFH: 0


In [67]:
# HTML and JavaScript Based Features

def check_website_forwarding(url):
    """
    Returns 1 when the request was redirected and the final address no
    longer contains the requested url, 0 when it was not, and -1 if the
    request itself fails.
    """
    try:
        response = requests.get(url)
        was_redirected = bool(response.history)
        if was_redirected and url not in response.url:
            return 1  # Forwarding detected
        return 0  # No forwarding
    except Exception:
        return -1  # Error occurred

def check_status_bar_customization(url):
    """
    Checks whether any script on the page mentions onmouseover

    Parameters:
        url (str): the page to inspect

    Returns:
        int: 1 if a <script> body contains 'onmouseover', otherwise 0.
        If the page cannot be fetched the result is 0.
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all('script'):
        if 'onmouseover' in tag.get_text():
            return 1
    return 0

def check_right_click_disabling(url):
    """
    Checks whether any script on the page tests for the right mouse button

    Parameters:
        url (str): the page to inspect

    Returns:
        int: 1 if a <script> body compares event.button with 2, otherwise 0.
        If the page cannot be fetched the result is 0.
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0
    soup = BeautifulSoup(html_content, 'html.parser')
    # Accept optional whitespace and both == and ===, so formatted code
    # ("event.button == 2") is caught as well as the minified form.
    right_click_test = re.compile(r'event\.button\s*={2,3}\s*2')
    for tag in soup.find_all('script'):
        if right_click_test.search(tag.get_text()):
            return 1
    return 0

import urllib.parse

def check_pop_up_window(url, timeout=10):
    """
    Looks for pop-up style content that asks for text input

    Parameters:
        url (str): the page to inspect
        timeout (float): seconds to wait for each linked page that is fetched

    Returns:
        int: 1 if a pop-up with text fields is indicated, otherwise 0.
        If the page cannot be fetched the result is 0.
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0
    soup = BeautifulSoup(html_content, 'html.parser')

    # <div class="popup"> elements that contain a text input
    for window in soup.find_all('div', {'class': 'popup'}):
        if window.find_all('input', {'type': 'text'}):
            return 1

    # Scripts that call window.open() and also mention input
    for tag in soup.find_all('script'):
        script_content = tag.get_text()
        if 'window.open(' in script_content and 'input' in script_content:
            return 1

    # Links opened in a new window whose target page has text inputs
    for anchor_tag in soup.find_all('a'):
        if anchor_tag.get('target') != '_blank':
            continue
        href = anchor_tag.get('href')
        if not href:
            continue  # <a target="_blank"> without an href
        # Resolve relative links against the page; requests handles the url
        # encoding itself, so the href is passed through unchanged.
        target_url = urljoin(url, href)
        if urlparse(target_url).scheme not in ('http', 'https'):
            continue  # javascript:, mailto:, ...
        try:
            response = requests.get(target_url, timeout=timeout)
        except requests.RequestException:
            continue  # one unreachable link should not abort the check
        if response.status_code == 200:
            target_soup = BeautifulSoup(response.text, 'html.parser')
            if target_soup.find_all('input', {'type': 'text'}):
                return 1

    # Styles that use :target and also mention input
    for style_tag in soup.find_all('style'):
        css_text = style_tag.get_text()
        if ':target' in css_text and 'input' in css_text:
            return 1

    # If no indication of phishing is found, consider it legitimate
    return 0



def check_iframe_redirection(url):
    """
    Checks whether the page embeds an iframe with frameborder set to 0

    Parameters:
        url (str): the page to inspect

    Returns:
        int: 1 if any <iframe> has frameborder="0", otherwise 0.
        If the page cannot be fetched the result is 0.
    """
    html_content = get_html_content(url)
    if html_content is None:
        # Fetch failed (already reported by get_html_content); nothing to parse.
        return 0
    soup = BeautifulSoup(html_content, 'html.parser')
    for iframe in soup.find_all('iframe'):
        if 'frameborder' in iframe.attrs and iframe['frameborder'].lower() == '0':
            # Phishing if iframe has frameborder attribute set to 0
            return 1
    # No iframes, or none with frameborder=0: legitimate
    return 0

# Example usage:
# Three sample URLs for the HTML/JavaScript checks; every call below is
# commented out.
url = 'https://www.msnbc.com'
url2 = "https://governance.pfasproject.com"
url3 = 'https://www.facebook.com'

#print(check_website_forwarding(url))
#print(check_status_bar_customization(url))
#print(check_right_click_disabling(url))
#print(url, check_pop_up_window(url))
#print(url2, check_pop_up_window(url2))
#print(url3, check_pop_up_window(url3))
#print(check_iframe_redirection(url))

https://www.facebook.com 0


In [32]:
# Load the ranked-domains CSV into `df`.
# NOTE(review): the original comment said "top 10k domains" but the file is
# named top10milliondomains.csv - confirm which list this is.
import pandas as pd
df = pd.read_csv('top10milliondomains.csv')

Index(['Rank', 'Domain', 'Open Page Rank'], dtype='object')


In [33]:
# Domain Based Features

def calculate_age_of_domain(url):
    """
    Calculates the age of a given domain using the whois library

    Parameters:
        url (str): the url whose domain and extension (ex: facebook.com) is looked up

    Returns:
        int: 0 if the domain was created at least 180 days ago, otherwise 1.
        A missing creation date or a failed lookup is reported as 1.
    """
    domainAndExtension = extract_domain_and_extension(url)
    try:
        domain_info = whois.whois(domainAndExtension)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):  # For some domains, creation_date may be a list
            creation_date = creation_date[0] if creation_date else None
        if creation_date is None:  # Check if creation_date is None
            return 1
        # Use the creation date's own timezone for "now": subtracting a
        # timezone-aware date from a naive one raises TypeError, which would
        # fall through to the phishing result below.
        today = datetime.datetime.now(creation_date.tzinfo)
        age = (today - creation_date).days
        return 0 if age >= 180 else 1  # 180 days is equivalent to 6 months
    except Exception as e:
        print("Error:", e)
        print("function calculate_age_of_domain")
        return 1


def check_dns_record(url):
    """
    Checks whether the whois record of a url's domain lists name servers

    Parameters:
        url (str): the url whose domain and extension (ex: facebook.com) is looked up

    Returns:
        int: 0 if the record has name servers, 1 if it has none or the
        lookup fails
    """
    lookup_target = extract_domain_and_extension(url)
    try:
        record = whois.whois(lookup_target)
        has_name_servers = record is not None and bool(record.name_servers)
        return 0 if has_name_servers else 1
    except Exception as e:
        print("Error:", e)
        return 1

def _load_page_ranks(csv_path='top10milliondomains.csv'):
    """Load the Domain -> Open Page Rank lookup once and reuse it on later calls."""
    cache = _load_page_ranks.__dict__.setdefault('cache', {})
    if csv_path not in cache:
        table = pd.read_csv(csv_path, usecols=['Domain', 'Open Page Rank'])
        # Keep the first row per domain, matching a "first exact match" lookup.
        cache[csv_path] = (
            table.drop_duplicates(subset='Domain')
            .set_index('Domain')['Open Page Rank']
        )
    return cache[csv_path]


def get_traffic_and_pageRank(url):
    """
    Determines the amount of website traffic based on how popular the URL is.
    Uses a CSV file that contains the top 10 million websites.

    If PageRank > 2: Legitimate

    Parameters:
        url (str): the url whose domain and extension (ex: msnbc.com) is looked up

    Returns:
        int: 0 if the domain is listed with an Open Page Rank above 2,
        otherwise 1 (including when the domain is not listed)
    """
    domain = extract_domain_and_extension(url)
    # The CSV is parsed only on the first call; re-reading it for every url
    # dominated the running time.
    page_ranks = _load_page_ranks()

    if domain not in page_ranks.index:
        return 1

    page_rank = page_ranks.loc[domain]
    # Print out the matching entry
    print("Domain:", domain)
    print("Page Rank:", page_rank)
    return 0 if page_rank > 2 else 1

def is_indexed_by_google(url):
    """
    Runs a "site:<url>" search and returns 0 when a returned result contains
    the url, otherwise 1.
    """
    hits = list(search(f"site:{url}", num_results=1))
    return 0 if any(url in hit for hit in hits) else 1

# Example usage:
# The calls inside the triple-quoted string below are disabled; they are part
# of a plain string literal and are not executed.
url = "https://www.msnbc.com"
'''
print(calculate_age_of_domain(url))
print(check_dns_record(url))
print(get_traffic_and_pageRank(url))
print(is_indexed_by_google(url))
'''

0
0
Domain: msnbc.com
Page Rank: 5.5
0
0


In [34]:
def links_pointing_to_page(url):
    """
    Counts the anchors on a page whose resolved target contains the page's
    own url. Returns 0 when there are two or more, otherwise 1. Any error
    is reported and treated as 1.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()  # Raise an exception for HTTP errors

        document = BeautifulSoup(page.text, "html.parser")

        # urljoin turns relative hrefs into absolute urls before the comparison.
        self_reference_count = sum(
            1
            for anchor in document.find_all("a", href=True)
            if url in urljoin(url, anchor['href'])
        )

        return 0 if self_reference_count >= 2 else 1

    except Exception as e:
        print("An error occurred:", e)
        return 1


# Example usage:
# Sample URL for links_pointing_to_page; the call itself is commented out.
url = 'https://www.msnbc.com'
#links_pointing_to_page(url)

0