In [18]:
import requests
import json
import pandas as pd
import itertools
from bs4 import BeautifulSoup
import urllib.parse
import numpy as np

In [2]:
config = pd.read_csv('google.config')

In [3]:
# Example usage
api_key = config.key[0]
cx_key = config.cx[0]

In [4]:
def search_companies(query, api_key):
    
    # API endpoint
    url = 'https://places.googleapis.com/v1/places:searchText'
    
    # Data payload
    payload = {
      "textQuery": query
    }
    
    # Headers
    headers = {
      'Content-Type': 'application/json',
      'X-Goog-Api-Key': api_key,
      'X-Goog-FieldMask': 'places.displayName,places.formattedAddress,places.websiteUri'
    }
    
    # Make the POST request
    response = requests.post(url, data=json.dumps(payload), headers=headers)
    
    return response

In [5]:
types = ['Technology', 
         'Tech', 
         'AI', 
         'Artificial Intelligence',
         'Machine Learning',
         'Data Science',
         'Data Analytics',
         # 'Green',
         # 'Climate Change',
         # 'Global Warming',
         'Venture Backed',
         'Seed Funded',
         'Series A Funded',
         'Series B Funded',
         'Series C Funded',
         'Series D Funded'
        ]

where = ['Philadelphia, PA',
         'Greater Philadelphia Region',
         'Havertown, PA',
         'Mainline PA']

queries = [f'{type} Companies in {loc}' for type, loc in itertools.product(types, where)]
queries

['Technology Companies in Philadelphia, PA',
 'Technology Companies in Greater Philadelphia Region',
 'Technology Companies in Havertown, PA',
 'Technology Companies in Mainline PA',
 'Tech Companies in Philadelphia, PA',
 'Tech Companies in Greater Philadelphia Region',
 'Tech Companies in Havertown, PA',
 'Tech Companies in Mainline PA',
 'AI Companies in Philadelphia, PA',
 'AI Companies in Greater Philadelphia Region',
 'AI Companies in Havertown, PA',
 'AI Companies in Mainline PA',
 'Artificial Intelligence Companies in Philadelphia, PA',
 'Artificial Intelligence Companies in Greater Philadelphia Region',
 'Artificial Intelligence Companies in Havertown, PA',
 'Artificial Intelligence Companies in Mainline PA',
 'Machine Learning Companies in Philadelphia, PA',
 'Machine Learning Companies in Greater Philadelphia Region',
 'Machine Learning Companies in Havertown, PA',
 'Machine Learning Companies in Mainline PA',
 'Data Science Companies in Philadelphia, PA',
 'Data Science Com

In [6]:
all_dat = pd.DataFrame()

for query in queries:
    response = search_companies(query, api_key)

    df = pd.DataFrame(response.json().get('places', []))
    if df.shape[0] == 0:
        continue
    df['comp_name'] = df['displayName'].apply(lambda x: x['text'])
    df.drop('displayName', axis = 1, inplace = True)

    all_dat = pd.concat([all_dat, df], axis = 0, ignore_index = True)

In [7]:
all_dat.drop_duplicates(inplace = True)
all_dat

Unnamed: 0,formattedAddress,websiteUri,comp_name
0,"121 S Broad St floor 9, Philadelphia, PA 19107...",http://getguru.com/,"Guru Technologies, Inc."
1,"1515 Market St Suite 1200, Philadelphia, PA 19...",http://www.pumextech.com/,Pumex Technologies
2,"2400 Market St #200, Philadelphia, PA 19103, USA",http://arcweb.co/,Arcweb Technologies
3,"701 S 50th St Floor 2, Philadelphia, PA 19143,...",http://oceanringtech.com/contact/,Ocean Ring Technologies
4,"601 Walnut St 12th floor, Philadelphia, PA 191...",http://360.technology/,360 Technology LLC
...,...,...,...
712,"435 Devon Park Dr Building 800, Wayne, PA 1908...",http://www.semcap.com/,SemCap
717,"1100 E Hector St #210, Conshohocken, PA 19428,...",https://www.thrivestlink.com/,Thrivest Link Legal Funding™
724,"150 Monument Rd, Bala Cynwyd, PA 19004, USA",http://www.taaffeitecm.com/,Taaffeite Capital Management - TCM Global Inde...
766,"55 S Valley Rd Apt F3, Paoli, PA 19301, USA",https://bigfundings.com/,Big Fundings - Short term business loans


In [8]:
def find_linkedin_url(company_name):
    # Encode the company name for the URL
    query = urllib.parse.quote_plus(f"{company_name} LinkedIn")
    url = f"https://www.google.com/search?q={query}"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
    
    response = requests.get(url, headers=headers)
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Attempt to find the first LinkedIn URL in the search results
    for a in soup.find_all('a', href=True):
        link = a['href']
        if "linkedin.com/company" in link:
            # Extract the URL from Google's redirect URL if necessary
            start = link.find("http")  # Find the start of the actual URL
            if start != -1:
                end = link.find("&", start)  # Find the end of the actual URL
                if end != -1:
                    return link[start:end]
                return link[start:]
    
    return "LinkedIn URL not found"

In [9]:
li_prof = [find_linkedin_url(company_name) for company_name in list(all_dat['comp_name'])]

In [10]:
li_prof

['https://www.linkedin.com/company/guru-technologies',
 'https://www.linkedin.com/company/pumex-computing-llc',
 'https://www.linkedin.com/company/arcweb',
 'https://www.linkedin.com/company/ocean-ring-technologies',
 'https://www.linkedin.com/company/360technology-usa',
 'https://www.linkedin.com/company/cdw-sirius',
 'https://www.linkedin.com/company/poweredbybosstech',
 'https://www.linkedin.com/company/ibusiness-technologies',
 'https://www.linkedin.com/company/taltech-inc',
 'https://www.linkedin.com/company/tammaninc',
 'https://www.linkedin.com/company/officialgates',
 'https://www.linkedin.com/company/untold-hq',
 'https://www.linkedin.com/company/chirp-technologies-net',
 'https://www.linkedin.com/company/jtm',
 'https://www.linkedin.com/company/kashyak',
 'https://www.linkedin.com/company/paratech-inc.',
 'https://www.linkedin.com/company/netbridge',
 'https://uk.linkedin.com/company/trouvaille-global',
 'https://www.linkedin.com/company/envative',
 'https://www.linkedin.com/

In [11]:
all_dat['linkedin_profile'] = li_prof

In [12]:
all_dat

Unnamed: 0,formattedAddress,websiteUri,comp_name,linkedin_profile
0,"121 S Broad St floor 9, Philadelphia, PA 19107...",http://getguru.com/,"Guru Technologies, Inc.",https://www.linkedin.com/company/guru-technolo...
1,"1515 Market St Suite 1200, Philadelphia, PA 19...",http://www.pumextech.com/,Pumex Technologies,https://www.linkedin.com/company/pumex-computi...
2,"2400 Market St #200, Philadelphia, PA 19103, USA",http://arcweb.co/,Arcweb Technologies,https://www.linkedin.com/company/arcweb
3,"701 S 50th St Floor 2, Philadelphia, PA 19143,...",http://oceanringtech.com/contact/,Ocean Ring Technologies,https://www.linkedin.com/company/ocean-ring-te...
4,"601 Walnut St 12th floor, Philadelphia, PA 191...",http://360.technology/,360 Technology LLC,https://www.linkedin.com/company/360technology...
...,...,...,...,...
712,"435 Devon Park Dr Building 800, Wayne, PA 1908...",http://www.semcap.com/,SemCap,https://www.linkedin.com/company/seminalcapital
717,"1100 E Hector St #210, Conshohocken, PA 19428,...",https://www.thrivestlink.com/,Thrivest Link Legal Funding™,https://www.linkedin.com/company/thrivestlink
724,"150 Monument Rd, Bala Cynwyd, PA 19004, USA",http://www.taaffeitecm.com/,Taaffeite Capital Management - TCM Global Inde...,https://www.linkedin.com/company/taaffeite-cap...
766,"55 S Valley Rd Apt F3, Paoli, PA 19301, USA",https://bigfundings.com/,Big Fundings - Short term business loans,https://www.linkedin.com/company/bigfundings


In [13]:
def find_careers_page(website):
    try:
        response = requests.get(website, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            links = soup.find_all('a')
            for link in links:
                if link and link.text:
                    if 'career' in link.text.lower() or 'job' in link.text.lower() or 'work with us' in link.text.lower():
                        return link.get('href')
    except Exception as e:
        return 'no link found'
        
    return 'no link found'

In [14]:
careers_pages = [find_careers_page(site) for site in list(all_dat['websiteUri'])]
careers_pages

['/careers',
 'no link found',
 'no link found',
 'no link found',
 'https://360.technology/careers/',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'https://envative.com/career-opportunities',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'https://careers.staples.com/',
 'no link found',
 'no link found',
 'no link found',
 'careers/index.html',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'https://careers.encora.com/',
 'no link found',
 'no link found',
 'no link found',
 'no link found',
 'benefits.aspx',
 'https://www.cigniti.com/cigniti-careers/',
 '/careers',
 'no link found',
 'no link found',
 '/jobs',
 'no link found',
 '/careers',
 'no link found',
 'no link found',
 '/career.html',

In [15]:
all_dat['careers_page'] = careers_pages

In [16]:
all_dat

Unnamed: 0,formattedAddress,websiteUri,comp_name,linkedin_profile,careers_page
0,"121 S Broad St floor 9, Philadelphia, PA 19107...",http://getguru.com/,"Guru Technologies, Inc.",https://www.linkedin.com/company/guru-technolo...,/careers
1,"1515 Market St Suite 1200, Philadelphia, PA 19...",http://www.pumextech.com/,Pumex Technologies,https://www.linkedin.com/company/pumex-computi...,no link found
2,"2400 Market St #200, Philadelphia, PA 19103, USA",http://arcweb.co/,Arcweb Technologies,https://www.linkedin.com/company/arcweb,no link found
3,"701 S 50th St Floor 2, Philadelphia, PA 19143,...",http://oceanringtech.com/contact/,Ocean Ring Technologies,https://www.linkedin.com/company/ocean-ring-te...,no link found
4,"601 Walnut St 12th floor, Philadelphia, PA 191...",http://360.technology/,360 Technology LLC,https://www.linkedin.com/company/360technology...,https://360.technology/careers/
...,...,...,...,...,...
712,"435 Devon Park Dr Building 800, Wayne, PA 1908...",http://www.semcap.com/,SemCap,https://www.linkedin.com/company/seminalcapital,no link found
717,"1100 E Hector St #210, Conshohocken, PA 19428,...",https://www.thrivestlink.com/,Thrivest Link Legal Funding™,https://www.linkedin.com/company/thrivestlink,no link found
724,"150 Monument Rd, Bala Cynwyd, PA 19004, USA",http://www.taaffeitecm.com/,Taaffeite Capital Management - TCM Global Inde...,https://www.linkedin.com/company/taaffeite-cap...,no link found
766,"55 S Valley Rd Apt F3, Paoli, PA 19301, USA",https://bigfundings.com/,Big Fundings - Short term business loans,https://www.linkedin.com/company/bigfundings,no link found


In [21]:
all_dat['careers_link'] = pd.Series(np.where(all_dat['careers_page'].str.startswith('/'), all_dat['websiteUri'] + all_dat['careers_page'].str.lstrip('/'), all_dat['careers_page']))

In [22]:
all_dat

Unnamed: 0,formattedAddress,websiteUri,comp_name,linkedin_profile,careers_page,careers_link
0,"121 S Broad St floor 9, Philadelphia, PA 19107...",http://getguru.com/,"Guru Technologies, Inc.",https://www.linkedin.com/company/guru-technolo...,/careers,http://getguru.com/careers
1,"1515 Market St Suite 1200, Philadelphia, PA 19...",http://www.pumextech.com/,Pumex Technologies,https://www.linkedin.com/company/pumex-computi...,no link found,no link found
2,"2400 Market St #200, Philadelphia, PA 19103, USA",http://arcweb.co/,Arcweb Technologies,https://www.linkedin.com/company/arcweb,no link found,no link found
3,"701 S 50th St Floor 2, Philadelphia, PA 19143,...",http://oceanringtech.com/contact/,Ocean Ring Technologies,https://www.linkedin.com/company/ocean-ring-te...,no link found,no link found
4,"601 Walnut St 12th floor, Philadelphia, PA 191...",http://360.technology/,360 Technology LLC,https://www.linkedin.com/company/360technology...,https://360.technology/careers/,https://360.technology/careers/
...,...,...,...,...,...,...
712,"435 Devon Park Dr Building 800, Wayne, PA 1908...",http://www.semcap.com/,SemCap,https://www.linkedin.com/company/seminalcapital,no link found,
717,"1100 E Hector St #210, Conshohocken, PA 19428,...",https://www.thrivestlink.com/,Thrivest Link Legal Funding™,https://www.linkedin.com/company/thrivestlink,no link found,
724,"150 Monument Rd, Bala Cynwyd, PA 19004, USA",http://www.taaffeitecm.com/,Taaffeite Capital Management - TCM Global Inde...,https://www.linkedin.com/company/taaffeite-cap...,no link found,
766,"55 S Valley Rd Apt F3, Paoli, PA 19301, USA",https://bigfundings.com/,Big Fundings - Short term business loans,https://www.linkedin.com/company/bigfundings,no link found,
