Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added company's education statistics parsing. #86

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 86 additions & 64 deletions linkedin_scraper/company.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,46 @@

AD_BANNER_CLASSNAME = ('ad-banner-container', '__ad')


def getchildren(elem):
    """Return every descendant element of *elem* (XPath ``.//*``)."""
    descendant_xpath = ".//*"
    return elem.find_elements_by_xpath(descendant_xpath)


class CompanySummary(object):
    """Lightweight record for a company reference scraped from a listing.

    Holds the company's profile URL, display name, and (optionally) a
    follower-count string as shown on the page. All fields default to
    ``None`` so partially-scraped entries can still be represented.
    """

    linkedin_url = None
    name = None
    followers = None

    def __init__(self, linkedin_url=None, name=None, followers=None):
        self.linkedin_url = linkedin_url
        self.name = name
        self.followers = followers

    def __repr__(self):
        # Follower count is optional; omit it from the repr when absent.
        # Use ``is None`` (identity) rather than ``== None``.
        if self.followers is None:
            return """ {name} """.format(name=self.name)
        else:
            return """ {name} {followers} """.format(name=self.name, followers=self.followers)


class Company(Scraper):
linkedin_url = None
name = None
about_us =None
about_us = None
website = None
headquarters = None
founded = None
industry = None
company_type = None
company_size = None
specialties = None
showcase_pages =[]
showcase_pages = []
affiliated_companies = []
education_statistics = []

def __init__(self, linkedin_url = None, name = None, about_us =None, website = None, headquarters = None, founded = None, industry = None, company_type = None, company_size = None, specialties = None, showcase_pages =[], affiliated_companies = [], driver = None, scrape = True, get_employees = True, close_on_complete = True):
def __init__(self, linkedin_url=None, name=None, about_us=None, website=None, headquarters=None, founded=None,
industry=None, company_type=None, company_size=None, specialties=None, showcase_pages=[],
affiliated_companies=[], driver=None, scrape=True, get_employees=True, close_on_complete=True):
self.linkedin_url = linkedin_url
self.name = name
self.about_us = about_us
Expand Down Expand Up @@ -83,22 +89,37 @@ def __get_text_under_subtitle_by_class(self, driver, class_name):

def scrape(self, get_employees=True, close_on_complete=True):
    """Dispatch to the appropriate scraping flow for this company page.

    Chooses the logged-in flow when the driver session is signed in to
    LinkedIn, otherwise the public (logged-out) flow; both flags are
    forwarded unchanged.
    """
    if self.is_signed_in():
        self.scrape_logged_in(get_employees=get_employees,
                              close_on_complete=close_on_complete)
    else:
        self.scrape_not_logged_in(get_employees=get_employees,
                                  close_on_complete=close_on_complete)

def __parse_employee__(self, employee_raw):
    """Build a minimal ``Person`` from one employee list item element.

    Reads the profile link from the first ``<a>`` tag and the name from
    the first line of the element's text. Returns ``None`` when the
    expected markup is missing so callers can filter out unparseable
    entries.
    """
    try:
        return Person(
            linkedin_url=employee_raw.find_element_by_tag_name("a").get_attribute("href"),
            name=(employee_raw.text.split("\n") or [""])[0].strip(),
            driver=self.driver,
            get=False,
            scrape=False,
        )
    # Catch Exception, not a bare except: a bare except would also
    # swallow KeyboardInterrupt/SystemExit during a long scrape.
    except Exception:
        return None

def get_education_statistics(self):
    """Scrape the "Where they studied" insight from the company People tab.

    Navigates to ``<linkedin_url>/people``, waits for the insight
    containers to render, and parses the education panel.

    Returns:
        list[tuple[str, int]]: ``(school_name, employee_count)`` pairs,
        or ``None`` when no "Where they studied" panel is found.
    """
    driver = self.driver
    # NOTE: os.path.join is kept for consistency with the rest of the
    # file, but it is not URL-safe (wrong separator on Windows) —
    # urllib.parse.urljoin would be more correct here.
    driver.get(os.path.join(self.linkedin_url, "people"))
    _ = WebDriverWait(driver, 3).until(
        EC.presence_of_element_located((By.CLASS_NAME, "insight-container")))
    for container in driver.find_elements_by_class_name("insight-container"):
        parts = container.text.split('\n')
        # Guard against empty text to avoid an IndexError on parts[0].
        if not parts or parts[0] != 'Where they studied':
            continue
        result = []
        # parts[1] is the panel subtitle; entries start at index 2 and
        # look like "<count> <school name>".
        for entry in parts[2:]:
            sub_parts = entry.split()
            if len(sub_parts) < 2:
                continue  # malformed row — skip instead of crashing
            try:
                # Counts may be comma-grouped (e.g. "1,234").
                count = int(sub_parts[0].replace(',', ''))
            except ValueError:
                continue  # first token is not a number — skip row
            result.append((' '.join(sub_parts[1:]), count))
        return result
    return None

def get_employees(self, wait_time=10):
total = []
list_css = "org-people-profiles-module__profile-list"
Expand All @@ -125,15 +146,15 @@ def get_employees(self, wait_time=10):
total.append(self.__parse_employee__(res))

def is_loaded(previous_results):
loop = 0
driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));")
results_li = results_list.find_elements_by_tag_name("li")
while len(results_li) == previous_results and loop <= 5:
time.sleep(1)
loop = 0
driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));")
results_li = results_list.find_elements_by_tag_name("li")
loop += 1
return loop <= 5
while len(results_li) == previous_results and loop <= 5:
time.sleep(1)
driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight));")
results_li = results_list.find_elements_by_tag_name("li")
loop += 1
return loop <= 5

def get_data(previous_results):
results_li = results_list.find_elements_by_tag_name("li")
Expand Down Expand Up @@ -161,9 +182,7 @@ def get_data(previous_results):
results_li_len = len(total)
return total



def scrape_logged_in(self, get_employees = True, close_on_complete = True):
def scrape_logged_in(self, get_employees=True, close_on_complete=True):
driver = self.driver

driver.get(self.linkedin_url)
Expand All @@ -177,17 +196,19 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):

# Click About Tab or View All Link
try:
self.__find_first_available_element__(
navigation.find_elements_by_xpath("//a[@data-control-name='page_member_main_nav_about_tab']"),
navigation.find_elements_by_xpath("//a[@data-control-name='org_about_module_see_all_view_link']"),
).click()
self.__find_first_available_element__(
navigation.find_elements_by_xpath("//a[@data-control-name='page_member_main_nav_about_tab']"),
navigation.find_elements_by_xpath("//a[@data-control-name='org_about_module_see_all_view_link']"),
).click()
except:
driver.get(os.path.join(self.linkedin_url, "about"))
driver.get(os.path.join(self.linkedin_url, "about"))

_ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'section')))
time.sleep(3)

if 'Cookie Policy' in driver.find_elements_by_tag_name("section")[1].text or any(classname in driver.find_elements_by_tag_name("section")[1].get_attribute('class') for classname in AD_BANNER_CLASSNAME):
if 'Cookie Policy' in driver.find_elements_by_tag_name("section")[1].text or any(
classname in driver.find_elements_by_tag_name("section")[1].get_attribute('class') for classname in
AD_BANNER_CLASSNAME):
section_id = 4
else:
section_id = 3
Expand All @@ -203,23 +224,22 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
for i in range(num_attributes):
txt = labels[i].text.strip()
if txt == 'Website':
self.website = values[i+x_off].text.strip()
self.website = values[i + x_off].text.strip()
elif txt == 'Industry':
self.industry = values[i+x_off].text.strip()
self.industry = values[i + x_off].text.strip()
elif txt == 'Company size':
self.company_size = values[i+x_off].text.strip()
self.company_size = values[i + x_off].text.strip()
if len(values) > len(labels):
x_off = 1
elif txt == 'Type':
self.company_type = values[i+x_off].text.strip()
self.company_type = values[i + x_off].text.strip()
elif txt == 'Founded':
self.founded = values[i+x_off].text.strip()
self.founded = values[i + x_off].text.strip()
elif txt == 'Specialties':
self.specialties = "\n".join(values[i+x_off].text.strip().split(", "))
self.specialties = "\n".join(values[i + x_off].text.strip().split(", "))

driver.execute_script("window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));")


try:
_ = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, 'company-list')))
showcase, affiliated = driver.find_elements_by_class_name("company-list")
Expand All @@ -228,20 +248,21 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
# get showcase
for showcase_company in showcase.find_elements_by_class_name("org-company-card"):
companySummary = CompanySummary(
linkedin_url = showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"),
name = showcase_company.find_element_by_class_name("company-name-link").text.strip(),
followers = showcase_company.find_element_by_class_name("company-followers-count").text.strip()
)
linkedin_url=showcase_company.find_element_by_class_name("company-name-link").get_attribute("href"),
name=showcase_company.find_element_by_class_name("company-name-link").text.strip(),
followers=showcase_company.find_element_by_class_name("company-followers-count").text.strip()
)
self.showcase_pages.append(companySummary)

# affiliated company

for affiliated_company in showcase.find_elements_by_class_name("org-company-card"):
companySummary = CompanySummary(
linkedin_url = affiliated_company.find_element_by_class_name("company-name-link").get_attribute("href"),
name = affiliated_company.find_element_by_class_name("company-name-link").text.strip(),
followers = affiliated_company.find_element_by_class_name("company-followers-count").text.strip()
)
linkedin_url=affiliated_company.find_element_by_class_name("company-name-link").get_attribute(
"href"),
name=affiliated_company.find_element_by_class_name("company-name-link").text.strip(),
followers=affiliated_company.find_element_by_class_name("company-followers-count").text.strip()
)
self.affiliated_companies.append(companySummary)

except:
Expand All @@ -255,7 +276,7 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
if close_on_complete:
driver.close()

def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_employees = True):
def scrape_not_logged_in(self, close_on_complete=True, retry_limit=10, get_employees=True):
driver = self.driver
retry_times = 0
while self.is_signed_in() and retry_times <= retry_limit:
Expand All @@ -282,9 +303,9 @@ def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_e
for showcase_company in showcase_pages.find_elements_by_tag_name("li"):
name_elem = showcase_company.find_element_by_class_name("name")
companySummary = CompanySummary(
linkedin_url = name_elem.find_element_by_tag_name("a").get_attribute("href"),
name = name_elem.text.strip(),
followers = showcase_company.text.strip().split("\n")[1]
linkedin_url=name_elem.find_element_by_tag_name("a").get_attribute("href"),
name=name_elem.text.strip(),
followers=showcase_company.text.strip().split("\n")[1]
)
self.showcase_pages.append(companySummary)
driver.find_element_by_class_name("dialog-close").click()
Expand All @@ -294,13 +315,14 @@ def scrape_not_logged_in(self, close_on_complete = True, retry_limit = 10, get_e
# affiliated company
try:
affiliated_pages = driver.find_element_by_class_name("affiliated-companies")
for i, affiliated_page in enumerate(affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
for i, affiliated_page in enumerate(
affiliated_pages.find_elements_by_class_name("affiliated-company-name")):
if i % 3 == 0:
affiliated_pages.find_element_by_class_name("carousel-control-next").click()

companySummary = CompanySummary(
linkedin_url = affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
name = affiliated_page.text.strip()
linkedin_url=affiliated_page.find_element_by_tag_name("a").get_attribute("href"),
name=affiliated_page.text.strip()
)
self.affiliated_companies.append(companySummary)
except:
Expand Down Expand Up @@ -335,15 +357,15 @@ def __repr__(self):
Affiliated Companies
{affiliated_companies}
""".format(
name = self.name,
about_us = self.about_us,
specialties = self.specialties,
website= self.website,
industry= self.industry,
company_type= self.company_type,
headquarters= self.headquarters,
company_size= self.company_size,
founded= self.founded,
showcase_pages = self.showcase_pages,
affiliated_companies = self.affiliated_companies
)
name=self.name,
about_us=self.about_us,
specialties=self.specialties,
website=self.website,
industry=self.industry,
company_type=self.company_type,
headquarters=self.headquarters,
company_size=self.company_size,
founded=self.founded,
showcase_pages=self.showcase_pages,
affiliated_companies=self.affiliated_companies
)