In [4]:
import os
import time
from collections import deque
from urllib.parse import urlparse, urljoin

import google.generativeai as genai
import pytesseract
import requests
from bs4 import BeautifulSoup
from google.cloud import vision_v1 as vision
from PIL import Image


# Read the Gemini API key from the environment instead of embedding it in the
# source: a key stored in a notebook is exposed to everyone who can read the file.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this notebook."
    )
genai.configure(api_key=_gemini_api_key)


# Shared Google Cloud Vision client used by extract_text_from_image.
vision_client = vision.ImageAnnotatorClient()

# Path to Tesseract OCR (if using locally)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

### 1. Extract Text from Website ###
def fetch_website_text(url):
    """Download ``url`` and return the text content of the parsed page.

    The request uses a 5-second timeout and the body is parsed with
    BeautifulSoup's ``html.parser``.

    Returns:
        The page text, or the string ``"Error fetching URL: <exception>"``
        if anything raised while fetching or parsing (nothing is propagated).
    """
    try:
        page = requests.get(url, timeout=5)
        return BeautifulSoup(page.text, "html.parser").get_text()
    except Exception as err:
        return f"Error fetching URL: {err}"

### 2. Extract HTML Structure ###
def fetch_html_tags(url):
    """Download ``url`` and return the start of its parsed HTML markup.

    The page is fetched with a 5-second timeout, parsed with BeautifulSoup's
    ``html.parser`` and serialised back to a string.

    Returns:
        The first 5000 characters of the serialised markup, or the string
        ``"Error fetching HTML: <exception>"`` if anything raised.
    """
    try:
        page = requests.get(url, timeout=5)
        markup = str(BeautifulSoup(page.text, "html.parser"))
        # Keep only a prefix so the prompt sent to the LLM stays small.
        return markup[:5000]
    except Exception as err:
        return f"Error fetching HTML: {err}"

### 3. Extract Text from Screenshot (OCR) ###
def extract_text_from_image(image_path):
    """Return the text detected in the image file at ``image_path``.

    The file is read as bytes and passed to the module-level ``vision_client``
    for Google Cloud Vision text detection.

    Returns:
        The ``full_text_annotation.text`` of the Vision response, or the string
        ``"Error extracting text from image: <exception>"`` if reading the
        file or calling the API raised.
    """
    # A local alternative to the Vision API would be
    # pytesseract.image_to_string(Image.open(image_path)).
    try:
        with open(image_path, "rb") as fh:
            raw_bytes = fh.read()
        vision_image = vision.Image(content=raw_bytes)
        detection = vision_client.text_detection(image=vision_image)
        return detection.full_text_annotation.text
    except Exception as err:
        return f"Error extracting text from image: {err}"

### 4. Analyze Data with Gemini API ###
def analyze_with_gemini(url, website_text, html_tags, screenshot_text):
    """Ask the ``gemini-1.5-flash`` model whether a page looks like phishing.

    Args:
        url: Address of the page being assessed.
        website_text: Text extracted from the page; only the first 5000
            characters are placed in the prompt.
        html_tags: Page markup, inserted into the prompt as given.
        screenshot_text: OCR text from a screenshot; only the first 5000
            characters are placed in the prompt.

    Returns:
        The ``text`` attribute of the model's response.
    """
    gemini = genai.GenerativeModel("gemini-1.5-flash")

    # Trim the free-form inputs before interpolation to bound the prompt size.
    page_excerpt = website_text[:5000]
    ocr_excerpt = screenshot_text[:5000]

    prompt = f"""
    Analyze the following website details and determine if it is a phishing site.
    Provide a clear explanation for your decision.

    - **URL**: {url}
    - **Website Extracted Text**: {page_excerpt}
    - **HTML Structure (partial)**: {html_tags}
    - **Screenshot Extracted Text**: {ocr_excerpt}

    Respond with "Phishing" or "Legitimate" and provide an explanation.
    """

    return gemini.generate_content(prompt).text

### 5. Run Phishing Detection ###
def detect_phishing(url, screenshot_path):
    """Gather page text, page markup and screenshot OCR text, then classify.

    Args:
        url: Page to fetch and assess.
        screenshot_path: Path of a screenshot image to run OCR on.

    Returns:
        Whatever ``analyze_with_gemini`` returns for the collected inputs.
    """
    # Arguments are evaluated left to right: text fetch, markup fetch, then OCR.
    return analyze_with_gemini(
        url,
        fetch_website_text(url),
        fetch_html_tags(url),
        extract_text_from_image(screenshot_path),
    )

### 6. Web Crawler to Crawl Links from a Website ###
def fetch_page_links(url):
    """Return the ``href`` value of every ``<a href=...>`` tag on a page.

    The page is fetched with a 5-second timeout and parsed with
    BeautifulSoup's ``html.parser``. The hrefs are returned exactly as written
    in the markup, so they may be relative; callers resolve them themselves.

    Args:
        url: Address of the page to scan for links.

    Returns:
        A list of href strings. On any failure the error is printed and an
        empty list is returned, so the result is always safe to iterate.
        (Returning the error message itself made callers loop over its
        individual characters as if each one were a link.)
    """
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        return [a["href"] for a in soup.find_all("a", href=True)]
    except Exception as e:
        # Best effort: one unreachable page must not abort the whole crawl.
        print(f"Error fetching links from {url}: {e}")
        return []

# Function to Crawl and Detect Phishing on Multiple Pages
def crawl_and_detect_phishing(start_url, screenshot_path, max_depth=2):
    """Crawl breadth-first from ``start_url`` and run phishing detection per page.

    Each visited page is passed to ``detect_phishing`` and the verdict is
    printed. Links found on a page are resolved against that page's URL and
    queued one level deeper.

    Args:
        start_url: Page at depth 0 where the crawl begins.
        screenshot_path: Screenshot path forwarded to ``detect_phishing`` for
            every page.
        max_depth: Number of link levels to visit. Pages at depths
            ``0 .. max_depth - 1`` are processed; ``max_depth <= 0`` visits
            nothing. Note that every page on each level is visited, so large
            values can trigger many requests and model calls.

    Returns:
        None. Results are reported with ``print``.
    """
    # Every queue entry carries its own depth. A single counter bumped once per
    # loop iteration would measure "pages popped", not distance from the start.
    to_visit = deque([(start_url, 0)])
    # URLs already queued or processed, so each page is handled at most once.
    seen = {start_url}

    while to_visit:
        url, depth = to_visit.popleft()
        if depth >= max_depth:
            continue

        print(f"Visiting: {url} (Depth {depth})")

        # Run phishing detection on the current page
        result = detect_phishing(url, screenshot_path)
        print(f"Phishing Detection for {url}: {result}")

        # Only collect links when the next level is still within the limit.
        if depth + 1 < max_depth:
            links = fetch_page_links(url)
            if isinstance(links, str):
                # A string here is an error message, not a collection of
                # links; iterating it would enqueue one bogus URL per character.
                print(links)
                links = []

            for link in links:
                full_url = urljoin(url, link)  # Ensure the link is absolute
                # Skip mailto:, javascript:, tel:, ... which cannot be fetched.
                if urlparse(full_url).scheme not in ("http", "https"):
                    continue
                if full_url not in seen:
                    seen.add(full_url)
                    to_visit.append((full_url, depth + 1))

        time.sleep(1)  # Add a delay to avoid overloading the server


# Driver for this notebook cell: the page to analyse and the crawl invocation.
start_url = "https://coupangshope.shop/index/user/login.html"  # Starting URL
# NOTE(review): an empty path cannot be opened, so extract_text_from_image will
# return its error message instead of OCR text; point this at a real screenshot.
screenshot_path = ""  # Path to the screenshot

# Crawl with max_depth=3 (the function's default is 2); results are printed.
crawl_and_detect_phishing(start_url, screenshot_path, max_depth=3)


Visiting: https://coupangshope.shop/index/user/login.html (Depth 0)
Phishing Detection for https://coupangshope.shop/index/user/login.html: Phishing

Here's why the website is highly suspicious and likely a phishing site:

* **Suspicious URL:** The URL, `https://coupangshope.shop/index/user/login.html`, is very similar to the legitimate Coupang website but includes a misspelling ("coupangshope" instead of "coupang").  This is a common tactic used by phishers to trick users into believing the site is genuine.

* **Non-English Text and Mixed Language:** The website uses a mix of English and Chinese ("登錄" means "login" in Chinese). This inconsistency is a red flag, as legitimate websites generally stick to one consistent language.

* **Poor Website Design:** The HTML structure hints at a hastily put-together website. The use of generic Bootstrap styling and randomly included/commented-out code suggests a lack of professional design, which is common for phishing sites.

* **Generic Image:*

In [1]:
!pip install pytesseract



In [2]:
pip install google-cloud-vision

