In [4]:
import pdfplumber
import pandas as pd
import os

def clean_text(text):
    lines = text.split("\n")
    cleaned_lines = [line.strip() for line in lines if line.strip()]  # Remove empty lines and strip spaces
    return " ".join(cleaned_lines)  # Join lines into a single paragraph for better readability

# Path to the extracted PDF file
pdf_path = "/Users/kartikvedi/Desktop/Assignment/QA - Supporting Files/QA - 4 - PDF/_000011888-0.2.pdf"

# Dictionary to store structured text
structured_text = {}

# Extract text from pages 13 to 24
with pdfplumber.open(pdf_path) as pdf:
    total_pages = len(pdf.pages)  # Get actual total pages
    for page_num in range(13, min(25, total_pages + 1)):  # Ensure we don't exceed available pages
        page = pdf.pages[page_num - 1]
        structured_text[page_num] = page.extract_text()

# Apply text cleaning and structure it for Excel
cleaned_text_data = {"Page Number": [], "Cleaned Text": []}

for page_num, text in structured_text.items():
    cleaned_text_data["Page Number"].append(page_num)
    cleaned_text_data["Cleaned Text"].append(clean_text(text))

# Ensure the output directory exists
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Create a DataFrame with cleaned text and save to Excel
cleaned_text_df = pd.DataFrame(cleaned_text_data)
cleaned_text_excel_path = os.path.join(output_dir, "Cleaned_Extracted_Text.xlsx")
cleaned_text_df.to_excel(cleaned_text_excel_path, index=False)

print(f"Cleaned text saved to: {cleaned_text_excel_path}")


Cleaned text saved to: output/Cleaned_Extracted_Text.xlsx


In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
import pandas as pd
import time
import re

def clean_text(text):
    """ Remove unwanted HTML tags and extra spaces from extracted text """
    return re.sub(r'<[^>]+>', '', text).strip()

def scroll_down(driver, times=5):
    """ Scroll down multiple times to load all content """
    for _ in range(times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

def debug_page_source(driver, filename):
    """ Save and print page source for debugging """
    page_source = driver.page_source
    with open(filename, "w", encoding="utf-8") as file:
        file.write(page_source)
    print(f"Saved page source for debugging: {filename}")

def scrape_gray_point():
    url = "https://www.gray-point.com/properties/"
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))
    driver.get(url)
    time.sleep(5)
    
    debug_page_source(driver, "gray_point_debug.html")  # Save page source for debugging
    
    properties = []
    
    scroll_down(driver)  # Ensure all properties are loaded
    
    try:
        listings = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'PropertyContent')]"))
        )
        print(f"Found {len(listings)} listings on Gray Point.")
        
        for listing in listings:
            try:
                name = clean_text(listing.find_element(By.XPATH, ".//h1").text)
            except:
                name = "N/A"
            try:
                address = clean_text(listing.find_element(By.XPATH, ".//p").text)
            except:
                address = "N/A"
            try:
                features = clean_text(listing.find_element(By.XPATH, ".//ul").text)
            except:
                features = "N/A"
            try:
                price = clean_text(listing.find_element(By.XPATH, ".//p[contains(text(),'£')]").text)
            except:
                price = "N/A"
            
            print(f"Extracted: {name}, {address}, {price}")
            properties.append([name, address, features, price])
    except Exception as e:
        print(f"No properties found on Gray Point. Error: {e}")

    driver.quit()
    return properties

def scrape_rightmove():
    url = "https://www.rightmove.co.uk/commercial-property-to-let.html"
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))
    driver.get(url)
    time.sleep(5)
    
    debug_page_source(driver, "rightmove_debug.html")  # Save page source for debugging
    
    properties = []
    
    scroll_down(driver)  # Ensure all listings load
    
    try:
        # Wait for iframes to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "iframe"))
        )
        iframes = driver.find_elements(By.TAG_NAME, "iframe")
        
        for iframe in iframes:
            driver.switch_to.frame(iframe)
            print("Switched to an iframe.")
            time.sleep(3)
            
            try:
                listings = driver.find_elements(By.XPATH, "//div[contains(@class, 'propertyCard-wrapper')]")
                if listings:
                    break  # Stop switching if listings are found
            except:
                driver.switch_to.default_content()  # Reset if no listings found
        
        driver.switch_to.default_content()  # Ensure we are back in main frame
        
        listings = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'propertyCard-wrapper')]"))
        )
        print(f"Found {len(listings)} listings on Rightmove.")
        
        for listing in listings:
            try:
                name = clean_text(listing.find_element(By.XPATH, ".//h2").text)
            except:
                name = "N/A"
            try:
                address = clean_text(listing.find_element(By.XPATH, ".//div[contains(@class, 'propertyCard-address')]").text)
            except:
                address = "N/A"
            try:
                price = clean_text(listing.find_element(By.XPATH, ".//span[contains(@class, 'propertyCard-priceValue')]" ).text)
            except:
                price = "N/A"
            
            print(f"Extracted: {name}, {address}, {price}")
            properties.append([name, address, "N/A", price])  # Features unavailable
    except Exception as e:
        print(f"No properties found on Rightmove. Error: {e}")

    driver.quit()
    return properties

def save_to_excel(data, filename):
    df = pd.DataFrame(data, columns=["Name", "Address", "Features", "Price"])
    df.to_excel(filename, index=False)

if __name__ == "__main__":
    gray_point_data = scrape_gray_point()
    rightmove_data = scrape_rightmove()
    
    save_to_excel(gray_point_data, "GrayPoint_Properties.xlsx")
    save_to_excel(rightmove_data, "RightMove_Properties.xlsx")
    
    print("Scraping completed. Data saved to GrayPoint_Properties.xlsx and RightMove_Properties.xlsx")


Saved page source for debugging: gray_point_debug.html
Found 3 listings on Gray Point.
Extracted: Sovereign Gate, 18-20 Kew Road, Richmond TW9 2NA, OFFICE TO LET RICHMOND / SERVICED OFFICE DESKS, Desk rates from £485 – £500 per desk
Extracted: 179 High Street, Hampton Hill TW12 1NL, NOW LET TO ESTABLISHED SUPERMARKET CHAIN, N/A
Extracted: 28 York Street, Twickenham TW1 3LJ, ﻿, N/A
Saved page source for debugging: rightmove_debug.html
Switched to an iframe.
No properties found on Rightmove. Error: Message: Unable to locate frame for element: [object HTMLIFrameElement]
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:197:5
NoSuchFrameError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:592:5
switchToFrame@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:707:15
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:290:31


In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
import pandas as pd
import time
import traceback

def scrape_instagram_posts(profile_url, post_count=5):
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")  # Run in headless mode to reduce detection
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)
    driver.get(profile_url)
    time.sleep(5)
    
    posts = []
    
    try:
        # Wait until posts appear
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.XPATH, "//article//a"))
        )
        
        # Find all post links
        post_links = driver.find_elements(By.XPATH, "//article//a")[:post_count]
        
        for link in post_links:
            driver.execute_script("arguments[0].scrollIntoView();", link)  # Scroll to element
            time.sleep(2)
            driver.execute_script("arguments[0].click();", link)  # Use JavaScript click
            time.sleep(5)  # Give time for content to load
            
            try:
                caption = driver.find_element(By.XPATH, "//div[@data-testid='post-caption']").text
            except:
                try:
                    caption = driver.find_element(By.XPATH, "//meta[@property='og:description']").get_attribute("content")
                except:
                    caption = "N/A"
            
            try:
                likes = driver.find_element(By.XPATH, "//section[contains(@class, 'EDfFK ygqzn')]//span").text
            except:
                likes = "N/A"
            
            try:
                comments_section = driver.find_elements(By.XPATH, "//ul[contains(@class, 'Mr508')]/div/li/div/div")
                comments = [comment.text for comment in comments_section]
                comments = " | ".join(comments) if comments else "N/A"
            except:
                comments = "N/A"
            
            print(f"Extracted: {caption}, {likes}, {comments}")
            posts.append([profile_url, caption, likes, comments])
            
            driver.back()
            time.sleep(2)
        
    except Exception as e:
        print(f"Error scraping Instagram: {traceback.format_exc()}")  # Capture full error details
    
    driver.quit()
    return posts

# Save extracted data to Excel
def save_to_excel(data, filename):
    df = pd.DataFrame(data, columns=["Profile URL", "Caption", "Likes", "Comments"])
    df.to_excel(filename, index=False)

if __name__ == "__main__":
    profile_url = "https://www.instagram.com/kartikaaryan/"
    scraped_data = scrape_instagram_posts(profile_url, post_count=5)
    save_to_excel(scraped_data, "Instagram_Posts.xlsx")
    print("Scraping completed. Data saved to Instagram_Posts.xlsx")


ValueError: response body:
{"message":"API rate limit exceeded for 146.196.35.178. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)","documentation_url":"https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}

request url:
https://api.github.com/repos/mozilla/geckodriver/releases/latest
response headers:
{'Date': 'Mon, 24 Mar 2025 22:23:58 GMT', 'Server': 'Varnish', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-XSS-Protection': '1; mode=block', 'Content-Security-Policy': "default-src 'none'; style-src 'unsafe-inline'", 'Access-Control-Allow-Origin': '*', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset, X-RateLimit-Used, X-RateLimit-Resource, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, Deprecation, Sunset', 'Content-Type': 'application/json; charset=utf-8', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'X-GitHub-Media-Type': 'github.v3; format=json', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1742855878', 'X-RateLimit-Resource': 'core', 'X-RateLimit-Used': '60', 'Content-Length': '280', 'X-GitHub-Request-Id': 'F06C:2966E8:FD4D:1BB29:67E1DB7E'}


In [28]:
import pytesseract
from PIL import Image
import pandas as pd
import re
import os

# Updated path to Aadhaar images folder
aadhaar_folder_path = "/Users/kartikvedi/Desktop/Assignment/QA - Supporting Files/QA - 5 - Aadhar Cards"

# Path to save the cleaned Aadhaar details Excel file
output_excel_path = "/Users/kartikvedi/Desktop/Aadhaar_Details.xlsx"

def clean_text(text):
    """Removes extra spaces, new lines, and unwanted characters from OCR output."""
    return re.sub(r'[^a-zA-Z0-9\s,.-]', '', text).strip()

def extract_aadhaar_details(text):
    """Extract Name, DOB, Gender, Aadhaar Number, and Address from OCR text"""
    text = clean_text(text)
    name_match = re.search(r"Name\s*[:]?\s*([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)", text)
    dob_match = re.search(r"DOB[:\s-]+(\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4})", text)
    gender_match = re.search(r"\b(Male|Female|Other)\b", text, re.IGNORECASE)
    aadhaar_match = re.search(r"(\d{4}\s\d{4}\s\d{4})", text)
    address_match = re.search(r"Address[:]?\s*(.*)", text, re.DOTALL)

    return {
        "Name": name_match.group(1) if name_match else "N/A",
        "DOB": dob_match.group(1) if dob_match else "N/A",
        "Gender": gender_match.group(1).capitalize() if gender_match else "N/A",
        "Aadhaar Number": aadhaar_match.group(1) if aadhaar_match else "N/A",
        "Address": clean_text(address_match.group(1)) if address_match else "N/A",
    }

def process_aadhaar_images(folder_path):
    """Extract Aadhaar details from images in the specified folder"""
    if not os.path.exists(folder_path):
        print(f"Error: Aadhaar folder not found - {folder_path}")
        return []
    
    aadhaar_records = []
    
    for file in os.listdir(folder_path):
        image_path = os.path.join(folder_path, file)
        if not os.path.isfile(image_path):
            continue

        try:
            image = Image.open(image_path)
            extracted_text = pytesseract.image_to_string(image)
            aadhaar_data = extract_aadhaar_details(extracted_text)
            aadhaar_records.append(aadhaar_data)
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    return aadhaar_records

def save_to_excel(data, filename):
    """Save extracted data to an Excel file on Desktop"""
    df = pd.DataFrame(data)
    df.to_excel(output_excel_path, index=False)
    print(f"Aadhaar details saved to {output_excel_path}")

if __name__ == "__main__":
    extracted_data = process_aadhaar_images(aadhaar_folder_path)
    if extracted_data:
        save_to_excel(extracted_data, output_excel_path)
    else:
        print("No Aadhaar details extracted.")

Aadhaar details saved to /Users/kartikvedi/Desktop/Aadhaar_Details.xlsx
