In [None]:
from googlesearch import search
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import re

def get_domain_urls(query, keyword, num_results=10):
    """
    Fetch search-result URLs whose host name contains the root keyword.

    Parameters:
        query (str): Search query sent to Google.
        keyword (str): Root keyword to look for in each result's host name. It is
            matched literally (regex metacharacters are escaped), case-insensitively
            and as a whole word: "physiocare" matches "www.physiocare.com" and
            "my-physiocare.com" but not "physiocareclinic.com".
        num_results (int): Number of search results to fetch for the query.

    Returns:
        list: URLs, in search order, whose host name matches the keyword. If the
        search or the filtering fails, the error is printed and [] is returned.
    """
    # Local import keeps the function self-contained without touching the cell's imports
    from urllib.parse import urlparse

    try:
        # Perform Google search
        url_list = list(search(query, tld="com", num=num_results, stop=num_results, pause=2))

        # Escape the keyword so characters such as "." or "+" are matched literally
        keyword_re = re.compile(rf"\b{re.escape(str(keyword))}\b", re.IGNORECASE)

        filtered_urls = []
        for url in url_list:
            try:
                # Parse the URL properly: splitting on "//" picks up the wrong part
                # when the path or query string itself contains "//"
                host = urlparse(url).hostname or ""
            except ValueError:
                # Malformed URL (e.g. broken IPv6 literal): skip it, keep the rest
                continue
            if keyword_re.search(host):
                filtered_urls.append(url)
        return filtered_urls
    except Exception as e:
        print(f"An error occurred while searching for '{query}': {e}")
        return []

# Example usage
def main():
    """Run one example lookup, print the matching URLs and show them as a DataFrame."""
    keyword = "physiocare"      # root keyword that must appear in the domain name
    search_query = keyword      # extend with extra terms if needed, e.g. "physiocare services"
    num_results_per_query = 10

    # A pool is not required for a single query; it is kept so that the same
    # pattern can be reused when several queries are submitted at once.
    with ThreadPoolExecutor(max_workers=5) as pool:
        urls = pool.submit(
            get_domain_urls, search_query, keyword, num_results_per_query
        ).result()

    # Plain listing of the hits
    print("Filtered URLs with the keyword in the domain name:")
    for matched_url in urls:
        print(matched_url)

    # Same hits as a one-column table
    df = pd.DataFrame({'Domain URLs': urls})
    print("\nDataFrame:")
    print(df)

# Run the example only when this code is executed directly (not when it is imported)
if __name__ == "__main__":
    main()


Filtered URLs with the keyword in the domain name:

DataFrame:
Empty DataFrame
Columns: [Domain URLs]
Index: []


In [None]:
!pip install google --quiet

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from googlesearch import search
import random
from concurrent.futures import ThreadPoolExecutor
# Browser-like User-Agent header sent with the page requests made later in this notebook
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

# Loading Data
df = pd.read_excel('Trip_Advisor_Scraper.xlsx', sheet_name=0)  # You have to upload the sheet before running this code
# Keep rows from position 5500 onwards (presumably the earlier rows were handled in a
# previous run - confirm). .copy() makes this an independent frame, so adding the
# WEBSITE column later does not raise SettingWithCopyWarning on a slice.
df = df[5500:].copy()
# One search query per remaining row
query_list = df['SEARCH_TERM'].to_list()
print(len(query_list))

5833


In [None]:
# You have to use a proxy rotation service to get the 'proxy_10sec.txt' file
# (read here as one proxy address per line).


file_path = 'proxy_10sec.txt'
with open(file_path, 'r') as file:
    # Strip surrounding whitespace and skip blank lines, so that an empty string
    # can never be picked as a proxy address later on
    proxy_addresses = [line.strip() for line in file if line.strip()]
proxy_list = proxy_addresses

In [None]:
from concurrent.futures import ThreadPoolExecutor
from googlesearch import search  # Provided by the "google" package (pip install google); googlesearch-python exposes a different search() signature
import pandas as pd

# Search terms to look up; each term is also used as the 'Website' label of its results
website_names = ["carpetclean"]

# Filled below with one {'Website': term, 'URLs': [...]} record per search term
results = list()

def get_website_urls(query, num_results=50):
    """
    Run one Google search (co.in domain) and return the result URLs.

    Parameters:
        query (str): The search query.
        num_results (int): How many results to request and where to stop.

    Returns:
        list: The URLs yielded by the search. If the search raises, the error is
        printed and an empty list is returned instead.
    """
    found = []
    try:
        # search() is lazy; extending the list drives the actual requests
        found.extend(search(query, tld="co.in", num=num_results, stop=num_results, pause=4))
    except Exception as e:
        print(f"An error occurred while searching for '{query}': {e}")
        return []
    return found

# Run the searches on a thread pool, one task per search term
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each submitted task back to the term it was started for
    futures = {executor.submit(get_website_urls, name): name for name in website_names}

    # Walk the tasks in submission order; a failed task still yields a record (with no URLs)
    for future, query in futures.items():
        try:
            urls = future.result()
        except Exception as e:
            print(f"An error occurred for query '{query}': {e}")
            urls = []
        results.append({'Website': query, 'URLs': urls})

# Flatten the results into one row per (search term, URL) pair.
# Each URL is checked against the term it was found for; comparing against
# website_names[0] would filter every term's results by the first term only.
data = [
    {'Website': result['Website'], 'URL': url}
    for result in results
    for url in result['URLs']
    if result['Website'] in str(url)
]

# Stored under its own name so the TripAdvisor `df` loaded from the Excel sheet is not
# overwritten. Explicit columns keep the frame's shape stable when nothing matched.
website_urls_df = pd.DataFrame(data, columns=['Website', 'URL'])

# Display the DataFrame
print(website_urls_df)



       Website                                                URL
0  carpetclean          https://www.markscarpetcleaningomaha.com/
1  carpetclean               https://www.carpetcleaningomaha.com/
2  carpetclean         https://affordablecarpetcleaningomaha.com/
3  carpetclean  https://begreencarpetcleaning.com/service-area...
4  carpetclean                    https://a-1-carpetcleaning.com/
5  carpetclean                           https://carpetclean.com/
6  carpetclean                     https://carpetcleaner-usa.com/
7  carpetclean                   http://www.tlccarpetcleaning.us/
8  carpetclean  https://www.wecancarpetcleaningservicesomaha.c...
9  carpetclean  https://www.facebook.com/markscarpetcleaningom...


In [None]:
# Reset the shared list; it receives one URL (or None) per entry of query_list below
results = []
def get_tripadvisor_url(query, proxy):
    """
    Look up `query` on Google and pick the most relevant result URL.

    PLEASE MAKE AT MOST 500 REQUESTS PER EXECUTION SO AS NOT TO GET BLOCKED BY
    GOOGLE DUE TO TOO MANY REQUESTS.

    Parameters:
        query (str): Search query.
        proxy: Accepted for the caller's convenience; not used by this function.

    Returns:
        The first result URL containing 'Hotel_Review'; if there is none, the first
        result URL; None when the search gives no results or raises (the error is
        printed in that case).
    """
    try:
        hits = list(search(query, tld="co.in", num=10, stop=10, pause=4))[:10]
        if not hits:
            return None
        # Prefer a hotel review page, otherwise fall back to the top hit
        return next((hit for hit in hits if 'Hotel_Review' in hit), hits[0])
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
# Run the lookups on a thread pool: one task per search query, each handed a random proxy
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(get_tripadvisor_url, query, random.choice(proxy_list))
        for query in query_list
    ]

    # Collect in submission order so the results stay aligned with query_list
    results.extend(future.result() for future in futures)

# One URL (or None) per row of df
df['WEBSITE'] = results

In [None]:
import re
import random
import requests
from bs4 import BeautifulSoup

# A single proxy is picked at random here and reused for the requests made by scrape_sheet
selected_proxy = random.choice(proxy_list)
# Rating labels searched for in the scraped review text
ratings = ["Excellent", "Very Good", "Average", "Poor", "Terrible"]
# Group 1 is the label, group 2 the count directly following it, with an optional
# thousands separator (e.g. "Excellent1,234"). The regex part is a raw string so
# that "\d" is not treated as an (invalid) string escape sequence.
pattern = "(" + "|".join(ratings) + r")(\d{1,3}(?:,\d{3})?)"

def scrape_sheet(url, max_retries=3):
    """
    Fetch a TripAdvisor hotel page and extract its per-rating review counts.

    Parameters:
        url: Page URL. Values whose string form does not contain 'Hotel_Review'
            (including None) are not fetched at all.
        max_retries (int): Extra attempts made while the server keeps answering
            HTTP 403. Other failures (exceptions, other status codes) are reported
            and not retried.

    Returns:
        dict or None:
            - URL without 'Hotel_Review': a placeholder row with keys WEBSITE,
              TOTAL_REVIEWS, EXCELLENT, VERY_GOOD, AVERAGE, POOR and TERRIBLE, where
              every count holds 'No specified Hotel from this URL'.
            - Page fetched: WEBSITE, TOTAL_REVIEWS and one key per rating found
              (upper-cased label), counts as digit strings. If no rating is found,
              all five ratings are reported as '0'.
            - Page could not be fetched: None.
    """
    if 'Hotel_Review' not in str(url):
        placeholder = 'No specified Hotel from this URL'
        return {
            'WEBSITE': url,
            'TOTAL_REVIEWS': placeholder,
            'EXCELLENT': placeholder,
            'VERY_GOOD': placeholder,
            'AVERAGE': placeholder,
            'POOR': placeholder,
            'TERRIBLE': placeholder,
        }

    # Register the proxy for both schemes: with only an 'http' entry, https pages
    # would be requested directly and never go through the proxy.
    proxies = {'http': selected_proxy, 'https': selected_proxy}

    # One initial attempt plus a bounded number of retries on HTTP 403, so a page
    # that keeps answering 403 cannot keep this function busy forever.
    response = None
    for _attempt in range(1 + max(0, max_retries)):
        try:
            r = requests.get(url, headers=headers, proxies=proxies, timeout=15)
        except Exception as e:
            print(f"Error occurred while fetching URL: {url}. Error message: {e}")
            break
        if r.status_code == 200:
            response = r
            break
        if r.status_code != 403:
            print(r.status_code)
            break
        # 403: try again while attempts remain

    if response is None:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): 'DWitr' looks like a generated CSS class name - confirm it still
    # identifies the review summary section on current pages.
    review_section = soup.find('div', class_='DWitr')
    review_text = review_section.text if review_section is not None else ''

    matches = re.findall(pattern, review_text)

    # NOTE: keys are the upper-cased labels, so "Very Good" is stored as 'VERY GOOD'
    # (with a space), unlike the 'VERY_GOOD' key of the placeholder row above. This
    # is kept as is because the merge step of this notebook relies on it.
    counts = {}
    for match in matches:
        counts[match[0].upper()] = match[1].replace(",", "")

    if not matches:
        for label in ('EXCELLENT', 'VERY GOOD', 'AVERAGE', 'POOR', 'TERRIBLE'):
            counts[label] = '0'

    total_reviews = sum(int(value) for value in counts.values() if value.isdigit())

    # WEBSITE and TOTAL_REVIEWS first, then the individual ratings
    return {'WEBSITE': url, 'TOTAL_REVIEWS': str(total_reviews), **counts}


In [None]:
# Scrape every URL collected in `results`. scrape_sheet returns None when a page could
# not be fetched; those entries are skipped so that full_list holds dict rows only and
# can be turned into a DataFrame.
full_list = []
for i in results:
    row = scrape_sheet(i)
    if row is not None:
        full_list.append(row)

In [None]:
# One row per scraped URL. Exact duplicates are removed before merging: a WEBSITE value
# that occurs several times (including missing values, which pandas matches to each
# other) would otherwise be multiplied against every matching row of df.
df2 = pd.DataFrame(full_list).drop_duplicates()

merged_df = (
    pd.merge(df, df2, on="WEBSITE", how="inner")
    # Drop the rating columns that came from df (_x) and both VERY_GOOD variants;
    # the scraped 'VERY GOOD' column (with a space) is the one that is kept
    .drop(columns=['TOTAL_REVIEWS_x', 'EXCELLENT_x', 'VERY_GOOD_x', 'VERY_GOOD_y', 'AVERAGE_x', 'POOR_x', 'TERRIBLE_x'])
    # Give the scraped columns (_y) their plain names back
    .rename(columns={
        'TOTAL_REVIEWS_y': 'TOTAL_REVIEWS',
        'EXCELLENT_y': 'EXCELLENT',
        'AVERAGE_y': 'AVERAGE',
        'POOR_y': 'POOR',
        'TERRIBLE_y': 'TERRIBLE'
    })
)

df_no_duplicates = merged_df.drop_duplicates()
print(len(df_no_duplicates))


df_no_duplicates.to_csv('Rating_Data.csv', index = False)

333
