In [3]:
!pip install googlesearch-python beautifulsoup4



In [4]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from googlesearch import search
from urllib.parse import urlparse

In [5]:


# Keywords for a B2B model
keywords = [
    "digital marketing agency India",
    "SEO service India",
    "web design agency India"
]

# Will store all related url's in set
urls = set()
for keyword in keywords:
    for result in search(keyword, num_results=15):  # Top 15 results per keyword
        urls.add(result)

# Convert to DataFrame and show sample
df = pd.DataFrame(urls, columns=["Website URL"])
print(df.head())

df.to_excel("agency_websites.xlsx", index=False)


                                         Website URL
0                  https://www.whiteriversmedia.com/
1                    https://www.mandywebdesign.com/
2                         https://www.brandloom.com/
3                         https://www.digidarts.com/
4  https://www.ezrankings.com/seo-company-india.html


In [7]:
df.shape

(45, 1)

In [45]:


def extract_company_info(url):
    try:
        def extract_domain_name(url):
                if urlparse(url).netloc.split('.')[0].capitalize()=='Www':
                    return urlparse(url).netloc.split('.')[1].capitalize()
                return urlparse(url).netloc.split('.')[0].capitalize()
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text()
        company=extract_domain_name(url)
        def extract_location(soup):
                location = "N/A"
            
                # Define more comprehensive keywords for detection
                keywords = ["india", "delhi", "mumbai", "bangalore", "pune", "kolkata", "hyderabad", 
                            "chennai", "noida", "gurgaon", "address", "location", "contact us"]
            
                # Collect all potential address-containing tags
                potential_tags = soup.find_all(["footer", "address", "p", "div", "span"], 
                                               string=lambda text: text and any(k in text.lower() for k in keywords))
            
                for tag in potential_tags:
                    text = tag.get_text(separator=" ", strip=True)
                    # Simple heuristic to check if it's an address-like text
                    if len(text) > 15 and any(word in text.lower() for word in keywords):
                        # Optional: Filter out noisy lines like phone numbers or email-only text
                        if not re.search(r"(blocked|sorry|copyright)", text.lower()):
                            location = text
                            break
            
                return location
        def extract_contact_person(text):
                # Common patterns around names
                patterns = [
                    r"(Founder|CEO|Co-Founder|Director|Head|Manager)[^a-zA-Z]+([A-Z][a-z]+ [A-Z][a-z]+)",  # e.g. Founder John Doe
                    r"([A-Z][a-z]+ [A-Z][a-z]+)[^\n]{0,50}(Founder|CEO|Director)",  # e.g. John Doe - CEO
                    r"Name[:\-]\s*([A-Z][a-z]+(?: [A-Z][a-z]+)?)",  # Name: John Doe
                ]
            
                for pattern in patterns:
                    match = re.search(pattern, text)
                    if match:
                        return match.group(0).strip()
            
                return "N/A"
        # Extract email addresses
        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
        email = emails[0] if emails else "N/A"

        # Extract phone numbers (Indian format or general format)
        phones = re.findall(r"(\+91[\-\s]?\d{10}|\d{3,5}[\-\s]?\d{6,8})", text)
        phone = phones[0] if phones else "N/A"

        # Try to get location/address from footer or text
        location=extract_location(soup)

        # Try to find a contact person like CEO, Founder, etc.
        
        contact_person=extract_contact_person(text)

        # Assume industry from context — since we're targeting B2B services
        industry = "Digital Marketing / Web Agency"

        return {
            "Company Name": company,
            "Contact Person": contact_person,
            "Industry": industry,
            "Website": url,
            "Email": email,
            "Phone": phone,
            "Location": location
        }

    except Exception as e:
        return {
            "Company Name": "N/A",
            "Contact Person": "N/A",
            "Industry": "N/A",
            "Website": url,
            "Email": "N/A",
            "Phone": "N/A",
            "Location": "N/A",
        }

# Extract info for each company
data = [extract_company_info(url) for url in urls]
df_companyInfo = pd.DataFrame(data)

# Show sample output



In [46]:
df_companyInfo.to_excel("agency_detailed_info.xlsx", index=False)

In [47]:
df_companyInfo


Unnamed: 0,Company Name,Contact Person,Industry,Website,Email,Phone,Location
0,Vocso,,Digital Marketing / Web Agency,https://www.vocso.com/,,,At Vertelo we’re helping to redefine transport...
1,Seoexpertscompanyindia,"CEO, Friction Studio",Digital Marketing / Web Agency,https://seoexpertscompanyindia.com/,info@seoexpertscompanyindia.com,,“Since onboarding the SEO Experts Company Indi...
2,Foduu,Manager – Znergy Cable,Digital Marketing / Web Agency,https://www.foduu.com/,info@foduu.com,,FODUU (Foundation Of Design Uprising Unit) is ...
3,Mumbaiwebdesign,Director\n\nGood Day,Digital Marketing / Web Agency,https://www.mumbaiwebdesign.in/,sales@mumbaiwebdesign.in,+91 9967857485,Mumbai Web Design
4,Brandloom,Director Ashley Stewart,Digital Marketing / Web Agency,https://www.brandloom.com/,care@brandloom.com,+91-7669647020,Take advantage of our expertise. Build your Br...
5,The7eagles,,Digital Marketing / Web Agency,https://the7eagles.com/india/seo-company/,,+919042275793,As the internet is available in every street a...
6,Webeesocial,"Manager, Teamwork Arts",Digital Marketing / Web Agency,https://webeesocial.com/,soumya.sharma@webeesocial.com,,WeBeeSocial is a full-scale Digital Marketing ...
7,Indiawebdesigns,,Digital Marketing / Web Agency,https://indiawebdesigns.in/,,,India's Most Trusted Web Design & Digital Mark...
8,Wefttechnologies,,Digital Marketing / Web Agency,https://wefttechnologies.com/digital-marketing...,,,
9,Idigitalise,Blog News Testimonials ABOUT About iDigital...,Digital Marketing / Web Agency,https://idigitalise.net/,ihelp@idigitalise.net,355694095558,Get Google's #1 ranking with iDigitalize's SEO...


In [54]:
# Step 1: Drop rows missing both Email and Phone
df_cleaned = df_companyInfo[~((df_companyInfo["Email"] == "N/A") & (df_companyInfo["Phone"] == "N/A"))]


# Step 2: Filter out rows where Industry is not relevant (safety check)
relevant_industries = ["digital marketing", "web agency", "web design", "seo", "development"]
df_cleaned = df_cleaned[df_cleaned["Industry"].str.lower().str.contains("|".join(relevant_industries), na=False)]

# Step 3: Drop rows where Website is missing or invalid (e.g., search links)
df_cleaned = df_cleaned[~df_cleaned["Website"].astype(str).str.contains("/search|linkedin|pulse", case=False, na=False)]

# Step 4: Strip whitespace and newlines from Contact Person and Location
df_cleaned["Contact Person"] = df_cleaned["Contact Person"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
df_cleaned["Location"] = df_cleaned["Location"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# Show final cleaned dataframe and how many rows remain
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.shape, df_cleaned.head()


((26, 7),
              Company Name           Contact Person  \
 0  Seoexpertscompanyindia     CEO, Friction Studio   
 1                   Foduu   Manager – Znergy Cable   
 2         Mumbaiwebdesign        Director Good Day   
 3               Brandloom  Director Ashley Stewart   
 4              The7eagles                      N/A   
 
                          Industry                                    Website  \
 0  Digital Marketing / Web Agency        https://seoexpertscompanyindia.com/   
 1  Digital Marketing / Web Agency                     https://www.foduu.com/   
 2  Digital Marketing / Web Agency            https://www.mumbaiwebdesign.in/   
 3  Digital Marketing / Web Agency                 https://www.brandloom.com/   
 4  Digital Marketing / Web Agency  https://the7eagles.com/india/seo-company/   
 
                              Email           Phone  \
 0  info@seoexpertscompanyindia.com             N/A   
 1                   info@foduu.com             N/A   
 2   

In [55]:
df_cleaned.head(10)

Unnamed: 0,Company Name,Contact Person,Industry,Website,Email,Phone,Location
0,Seoexpertscompanyindia,"CEO, Friction Studio",Digital Marketing / Web Agency,https://seoexpertscompanyindia.com/,info@seoexpertscompanyindia.com,,“Since onboarding the SEO Experts Company Indi...
1,Foduu,Manager – Znergy Cable,Digital Marketing / Web Agency,https://www.foduu.com/,info@foduu.com,,FODUU (Foundation Of Design Uprising Unit) is ...
2,Mumbaiwebdesign,Director Good Day,Digital Marketing / Web Agency,https://www.mumbaiwebdesign.in/,sales@mumbaiwebdesign.in,+91 9967857485,Mumbai Web Design
3,Brandloom,Director Ashley Stewart,Digital Marketing / Web Agency,https://www.brandloom.com/,care@brandloom.com,+91-7669647020,Take advantage of our expertise. Build your Br...
4,The7eagles,,Digital Marketing / Web Agency,https://the7eagles.com/india/seo-company/,,+919042275793,As the internet is available in every street a...
5,Webeesocial,"Manager, Teamwork Arts",Digital Marketing / Web Agency,https://webeesocial.com/,soumya.sharma@webeesocial.com,,WeBeeSocial is a full-scale Digital Marketing ...
6,Idigitalise,Blog News Testimonials ABOUT About iDigitalise...,Digital Marketing / Web Agency,https://idigitalise.net/,ihelp@idigitalise.net,355694095558,Get Google's #1 ranking with iDigitalize's SEO...
7,Ezrankings,,Digital Marketing / Web Agency,https://www.ezrankings.com/,contactus@ezrankings.com,+91-9560133711,Our experts are here to help you with advice a...
8,Noviindus,,Digital Marketing / Web Agency,https://noviindus.com/web-designing-company-in...,crm@noviindus.com,+91 9995843985,PREMIER WEB DESIGNING COMPANY IN INDIA
9,Rankingbyseo,,Digital Marketing / Web Agency,https://www.rankingbyseo.com/,sales@rankingbyseo.com,+919953532683,This is to recommend Ranking By SEO India as a...


In [56]:
df_cleaned.to_excel("cleaned_agency_list.xlsx", index=False)