# Company Web Scraping through Google

Import all necessary packages for web scraping

In [1]:
import time
import csv
import os
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Graduate\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Conduct Web Scraping through Google search

In [6]:
#Load the DataFrame with company and description columns
df = pd.read_csv("C:/Users/Graduate/Desktop/Grad MSBA/Advanced Python/Final Project/company_info_final.csv")

# Load the NAICS_subcategories file
naics_subcategories = pd.read_csv("C:/Users/Graduate/Desktop/Grad MSBA/Advanced Python/Final Project/NAICS_subcategories.csv")

# Merge the two DataFrames on the first 3 digits of the NAICS code
df = pd.merge(df, naics_subcategories, left_on=df["NAICS on SoS site"].astype(str).str[:3], right_on=naics_subcategories["NAICS Code"].astype(str).str[:3], how="left")

# Define an empty dictionary to store the results
keyword_dict = {}

# Add the following columns to the DataFrame
df["keyword_count"] = 0
df["found_count"] = 0
df["Food"] = 0
df["Beverage"] = 0
df["Tobacco"] = 0
df["Mills"] = 0
df["Textile"] = 0
df["Apparel"] = 0
df["Leather"] = 0
df["Footwear"] = 0
df["Wood"] = 0
df["Paper"] = 0
df["Printing"] = 0
df["Petroleum"] = 0
df["Coal"] = 0
df["Chemicals"] = 0
df["Plastics"] = 0
df["Rubber"] = 0
df["Mineral"] = 0
df["Metal"] = 0
df["Machinery"] = 0
df["Electronic"] = 0
df["Electrical"] = 0
df["Transportation"] = 0
df["Furniture"] = 0
df["Medical"] = 0

# Add a "Manufacturing" column to the DataFrame and set it to False
df["Manufacturing"] = False

# Create a new instance of the Chrome driver
driver = webdriver.Chrome()

# Define a set of stopwords
stop_words = set(stopwords.words('english'))

# Add a "Website URL" column to the DataFrame
df["Website URL"] = ""

# Loop through all rows in the DataFrame
for index, row in df.iterrows():
    # Check if the NAICS Code is blank
    if pd.isnull(row["NAICS on SoS site"]):
        continue  # skip to next row
    
    company = row["Company"].strip()
    description = row["Description"]
    
    company_name = company.replace("&", "and")
        
    # Navigate to Google.com with the search query
    driver.get("https://www.google.com/search?q=" + company_name + " " + "manufacturing" + " " + "Rhode Island")
        
    # Wait for the search results to load
    driver.implicitly_wait(10)
        
    # Find the first search result link that is not an advertisement
    search_results = driver.find_elements('css selector', "div.tF2Cxc")
    for result in search_results:
        try:
            search_link = result.find_element('tag name', 'a')
            if 'http' in search_link.get_attribute('href') and 'google' not in search_link.get_attribute('href'):
                url = search_link.get_attribute('href')
                df.at[index, 'Website URL'] = url
                search_link.click()
                break
        except:
            pass
        
    # Wait for the resulting page to load
    driver.implicitly_wait(10)
        
    # Get the webpage content
    webpage = driver.page_source
        
    # Loop through each keyword for the current company
    if pd.isnull(description):
        continue  # skip to next row
    for word in description.split():        
        if word.lower() not in stop_words:
            keyword_dict.setdefault(word.lower(), []).append(1 if word.lower() in webpage.lower() else 0)
            # Increment the found count for the current row if the keyword is found in the webpage
            df.at[index, "found_count"] += 1 if word.lower() in webpage.lower() else 0
            # Increment the keyword count for the current row
            df.at[index, "keyword_count"] += 1
            # Set the value of the industry column to 1 if the word is found in the webpage
            if word.lower() == "food":
                df.at[index, "Food"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "beverage":
                df.at[index, "Beverage"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "tobacco":
                df.at[index, "Tobacco"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "mills":
                df.at[index, "Mills"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "textile":
                df.at[index, "Textile"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "apparel":
                df.at[index, "Apparel"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "leather":
                df.at[index, "Leather"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "footwear":
                df.at[index, "Footwear"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "wood":
                df.at[index, "Wood"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "paper":
                df.at[index, "Paper"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "printing":
                df.at[index, "Printing"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "petroleum":
                df.at[index, "Petroleum"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "coal":
                df.at[index, "Coal"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "chemicals":
                df.at[index, "Chemicals"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "plastics":
                df.at[index, "Plastics"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "rubber":
                df.at[index, "Rubber"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "mineral":
                df.at[index, "Mineral"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "metal":
                df.at[index, "Metal"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "machinery":
                df.at[index, "Machinery"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "electronic":
                df.at[index, "Electronic"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "electrical":
                df.at[index, "Electrical"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "transportation":
                df.at[index, "Transportation"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "furniture":
                df.at[index, "Furniture"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "medical":
                df.at[index, "Medical"] = 1 if word.lower() in webpage.lower() else 0
            elif word.lower() == "manufacturing":
                df.at[index, "Manufacturing"] = 1 if word.lower() in webpage.lower() else 0
        else:
            keyword_dict.setdefault(word.lower(), []).append(0)
    
    # Add the found count to the DataFrame
    df.at[index, "found_count"] = df.at[index, "found_count"]
    df.at[index, "Manufacturing"] = df.at[index, "Manufacturing"]
        
    # Calculate the percentage of keywords found and add to the DataFrame
    percentage = df.at[index, "found_count"] / df.at[index, "keyword_count"] * 100 if df.at[index, "keyword_count"] > 0 else 0
    df.at[index, "found_percentage"] = "{:.2f}%".format(percentage)
        
# Close the web driver
driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=112.0.5615.122)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x0022DCE3+50899]
	(No symbol) [0x001BE111]
	(No symbol) [0x000C5588]
	(No symbol) [0x000AD333]
	(No symbol) [0x0010F4DB]
	(No symbol) [0x0011DB33]
	(No symbol) [0x0010B6F6]
	(No symbol) [0x000E7708]
	(No symbol) [0x000E886D]
	GetHandleVerifier [0x00493EAE+2566302]
	GetHandleVerifier [0x004C92B1+2784417]
	GetHandleVerifier [0x004C327C+2759788]
	GetHandleVerifier [0x002C5740+672048]
	(No symbol) [0x001C8872]
	(No symbol) [0x001C41C8]
	(No symbol) [0x001C42AB]
	(No symbol) [0x001B71B7]
	BaseThreadInitThunk [0x74ED7D49+25]
	RtlInitializeExceptionChain [0x76F7B74B+107]
	RtlClearBits [0x76F7B6CF+191]
	(No symbol) [0x00000000]


Generate Excel file and place it in the "Web Scrape" folder

In [8]:
# Create the folder if it doesn't exist
if not os.path.exists("Web Scrape"):
    os.makedirs("Web Scrape")

# Save the updated DataFrame to a new CSV file in the Web Scrape folder
df.to_csv("Web Scrape/google_web_scrape.csv", index=False)