diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py index c076357..235133f 100644 --- a/linkedin_scraper/company.py +++ b/linkedin_scraper/company.py @@ -11,29 +11,32 @@ AD_BANNER_CLASSNAME = ('ad-banner-container', '__ad') + def getchildren(elem): return elem.find_elements_by_xpath(".//*") + class CompanySummary(object): linkedin_url = None name = None followers = None - def __init__(self, linkedin_url = None, name = None, followers = None): + def __init__(self, linkedin_url=None, name=None, followers=None): self.linkedin_url = linkedin_url self.name = name self.followers = followers def __repr__(self): if self.followers == None: - return """ {name} """.format(name = self.name) + return """ {name} """.format(name=self.name) else: - return """ {name} {followers} """.format(name = self.name, followers = self.followers) + return """ {name} {followers} """.format(name=self.name, followers=self.followers) + class Company(Scraper): linkedin_url = None name = None - about_us =None + about_us = None website = None headquarters = None founded = None @@ -41,10 +44,13 @@ class Company(Scraper): company_type = None company_size = None specialties = None - showcase_pages =[] + showcase_pages = [] affiliated_companies = [] + education_statistics = [] - def __init__(self, linkedin_url = None, name = None, about_us =None, website = None, headquarters = None, founded = None, industry = None, company_type = None, company_size = None, specialties = None, showcase_pages =[], affiliated_companies = [], driver = None, scrape = True, get_employees = True, close_on_complete = True): + def __init__(self, linkedin_url=None, name=None, about_us=None, website=None, headquarters=None, founded=None, + industry=None, company_type=None, company_size=None, specialties=None, showcase_pages=[], + affiliated_companies=[], driver=None, scrape=True, get_employees=True, close_on_complete=True): self.linkedin_url = linkedin_url self.name = name self.about_us = about_us @@ -83,22 +89,37 @@ def __get_text_under_subtitle_by_class(self, driver, class_name): def scrape(self, get_employees=True, close_on_complete=True): if self.is_signed_in(): - self.scrape_logged_in(get_employees = get_employees, close_on_complete = close_on_complete) + self.scrape_logged_in(get_employees=get_employees, close_on_complete=close_on_complete) else: - self.scrape_not_logged_in(get_employees = get_employees, close_on_complete = close_on_complete) + self.scrape_not_logged_in(get_employees=get_employees, close_on_complete=close_on_complete) def __parse_employee__(self, employee_raw): try: return Person( - linkedin_url = employee_raw.find_element_by_tag_name("a").get_attribute("href"), - name = (employee_raw.text.split("\n") or [""])[0].strip(), - driver = self.driver, - get = False, - scrape = False - ) + linkedin_url=employee_raw.find_element_by_tag_name("a").get_attribute("href"), + name=(employee_raw.text.split("\n") or [""])[0].strip(), + driver=self.driver, + get=False, + scrape=False + ) except: return None + def get_education_statistics(self): + driver = self.driver + driver.get(os.path.join(self.linkedin_url, "people")) + _ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "insight-container"))) + containers = driver.find_elements_by_class_name("insight-container") + for container in containers: + parts = container.text.split('\n') + if parts[0] == 'Where they studied': + result = [] + for i in range(2, len(parts)): + sub_parts = parts[i].split() + result.append((' '.join(sub_parts[1:]), int(sub_parts[0]))) + return result + return None + def get_employees(self, wait_time=10): total = [] list_css = "org-people-profiles-module__profile-list" @@ -125,15 +146,15 @@ def get_employees(self, wait_time=10): total.append(self.__parse_employee__(res)) def is_loaded(previous_results): - loop = 0 - driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));") - results_li = results_list.find_elements_by_tag_name("li") - while len(results_li) == previous_results and loop <= 5: - time.sleep(1) + loop = 0 driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));") results_li = results_list.find_elements_by_tag_name("li") - loop += 1 - return loop <= 5 + while len(results_li) == previous_results and loop <= 5: + time.sleep(1) + driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));") + results_li = results_list.find_elements_by_tag_name("li") + loop += 1 + return loop <= 5 def get_data(previous_results): results_li = results_list.find_elements_by_tag_name("li") @@ -161,9 +182,7 @@ def get_data(previous_results): results_li_len = len(total) return total - - - def scrape_logged_in(self, get_employees = True, close_on_complete = True): + def scrape_logged_in(self, get_employees=True, close_on_complete=True): driver = self.driver driver.get(self.linkedin_url) @@ -177,17 +196,19 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): # Click About Tab or View All Link try: - self.__find_first_available_element__( - navigation.find_elements_by_xpath("//a[@data-control-name='page_member_main_nav_about_tab']"), - navigation.find_elements_by_xpath("//a[@data-control-name='org_about_module_see_all_view_link']"), - ).click() + self.__find_first_available_element__( + navigation.find_elements_by_xpath("//a[@data-control-name='page_member_main_nav_about_tab']"), + navigation.find_elements_by_xpath("//a[@data-control-name='org_about_module_see_all_view_link']"), + ).click() except: - driver.get(os.path.join(self.linkedin_url, "about")) + driver.get(os.path.join(self.linkedin_url, "about")) _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'section'))) time.sleep(3) - if 'Cookie Policy' in driver.find_elements_by_tag_name("section")[1].text or any(classname in driver.find_elements_by_tag_name("section")[1].get_attribute('class') for classname in AD_BANNER_CLASSNAME): + if 'Cookie Policy' in driver.find_elements_by_tag_name("section")[1].text or any( + classname in driver.find_elements_by_tag_name("section")[1].get_attribute('class') for classname in + AD_BANNER_CLASSNAME): section_id = 4 else: section_id = 3 @@ -203,23 +224,22 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): for i in range(num_attributes): txt = labels[i].text.strip() if txt == 'Website': - self.website = values[i+x_off].text.strip() + self.website = values[i + x_off].text.strip() elif txt == 'Industry': - self.industry = values[i+x_off].text.strip() + self.industry = values[i + x_off].text.strip() elif txt == 'Company size': - self.company_size = values[i+x_off].text.strip() + self.company_size = values[i + x_off].text.strip() if len(values) > len(labels): x_off = 1 elif txt == 'Type': - self.company_type = values[i+x_off].text.strip() + self.company_type = values[i + x_off].text.strip() elif txt == 'Founded': - self.founded = values[i+x_off].text.strip() + self.founded = values[i + x_off].text.strip() elif txt == 'Specialties': - self.specialties = "\n".join(values[i+x_off].text.strip().split(", ")) + self.specialties = "\n".join(values[i + x_off].text.strip().split(", ")) driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));") - try: _ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'company-list'))) showcase, affiliated = driver.find_elements_by_class_name("company-list") @@ -228,20 +248,21 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): # get showcase for showcase_company in showcase.find_elements_by_class_name("org-company-card"): companySummary = CompanySummary( - linkedin_url = showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"), - name = showcase_company.find_element_by_class_name("company-name-link").text.strip(), - followers = showcase_company.find_element_by_class_name("company-followers-count").text.strip() - ) + linkedin_url=showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"), + name=showcase_company.find_element_by_class_name("company-name-link").text.strip(), + followers=showcase_company.find_element_by_class_name("company-followers-count").text.strip() + ) self.showcase_pages.append(companySummary) # affiliated company for affiliated_company in showcase.find_elements_by_class_name("org-company-card"): companySummary = CompanySummary( - linkedin_url = affiliated_company.find_element_by_class_name("company-name-link").get_attribute("href"), - name = affiliated_company.find_element_by_class_name("company-name-link").text.strip(), - followers = affiliated_company.find_element_by_class_name("company-followers-count").text.strip() - ) + linkedin_url=affiliated_company.find_element_by_class_name("company-name-link").get_attribute( + "href"), + name=affiliated_company.find_element_by_class_name("company-name-link").text.strip(), + followers=affiliated_company.find_element_by_class_name("company-followers-count").text.strip() + ) self.affiliated_companies.append(companySummary) except: @@ -255,7 +276,7 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True): if close_on_complete: driver.close() - def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_employees = True): + def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10, get_employees=True): driver = self.driver retry_times = 0 while self.is_signed_in() and retry_times <= retry_limit: @@ -282,9 +303,9 @@ def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_e for showcase_company in showcase_pages.find_elements_by_tag_name("li"): name_elem = showcase_company.find_element_by_class_name("name") companySummary = CompanySummary( - linkedin_url = name_elem.find_element_by_tag_name("a").get_attribute("href"), - name = name_elem.text.strip(), - followers = showcase_company.text.strip().split("\n")[1] + linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"), + name=name_elem.text.strip(), + followers=showcase_company.text.strip().split("\n")[1] ) self.showcase_pages.append(companySummary) driver.find_element_by_class_name("dialog-close").click() @@ -294,13 +315,14 @@ def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_e # affiliated company try: affiliated_pages = driver.find_element_by_class_name("affiliated-companies") - for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")): + for i, affiliated_page in enumerate( + affiliated_pages.find_elements_by_class_name("affiliated-company-name")): if i % 3 == 0: affiliated_pages.find_element_by_class_name("carousel-control-next").click() companySummary = CompanySummary( - linkedin_url = affiliated_page.find_element_by_tag_name("a").get_attribute("href"), - name = affiliated_page.text.strip() + linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"), + name=affiliated_page.text.strip() ) self.affiliated_companies.append(companySummary) except: @@ -335,15 +357,15 @@ def __repr__(self): Affiliated Companies {affiliated_companies} """.format( - name = self.name, - about_us = self.about_us, - specialties = self.specialties, - website= self.website, - industry= self.industry, - company_type= self.company_type, - headquarters= self.headquarters, - company_size= self.company_size, - founded= self.founded, - showcase_pages = self.showcase_pages, - affiliated_companies = self.affiliated_companies - ) + name=self.name, + about_us=self.about_us, + specialties=self.specialties, + website=self.website, + industry=self.industry, + company_type=self.company_type, + headquarters=self.headquarters, + company_size=self.company_size, + founded=self.founded, + showcase_pages=self.showcase_pages, + affiliated_companies=self.affiliated_companies + )