[Usable] fixed person.py working as of 17 May #220

Open · wants to merge 8 commits into master
6 changes: 3 additions & 3 deletions linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):

         driver.get(self.linkedin_url)

-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))

         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")

-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()

         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount

         return json.dumps(_output).replace('\n', '')

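For reviewers checking the company.py change by hand, a minimal sketch of the new lookups, assuming an already-logged-in Selenium session; the company URL here is a placeholder, not part of this PR:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome()
    # assumes actions.login(driver, email, password) has already been run
    driver.get("https://www.linkedin.com/company/google/")  # placeholder URL

    # the wait now targets <div dir="ltr"> blocks instead of <span dir="ltr">
    WebDriverWait(driver, 3).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]'))
    )

    # the name now comes from the top-card class rather than the first ltr span
    name = driver.find_element(By.CLASS_NAME, "org-top-card-summary__title").text.strip()
    print(name)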
122 changes: 36 additions & 86 deletions linkedin_scraper/person.py
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@

 class Person(Scraper):

-    __TOP_CARD = "pv-top-card"
+    __TOP_CARD = "scaffold-layout__main"
     __WAIT_FOR_ELEMENT_TIMEOUT = 5

     def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity")
-            company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
+            company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue

             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@
                 company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                 work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                 location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""

             times = work_times.split("·")[0].strip() if work_times else ""
             duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None

             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""

-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
+            if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                   .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                   .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            else:
+                inner_positions = []
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
                 for description in descriptions:
                     res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                     position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
             institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")

             # company elem
# company elem
@@ -214,13 +230,17 @@
             outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
-            degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            if len(outer_positions) > 1:
+                degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            else:
+                degree = None

             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text

-                from_date = " ".join(times.split(" ")[:2])
-                to_date = " ".join(times.split(" ")[3:])
+                if times != "":
+                    from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
+                    to_date = times.split(" ")[-1]
             else:
                 from_date = None
                 to_date = None
@@ -240,10 +260,9 @@
             self.add_education(education)

     def get_name_and_location(self):
-        top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
-        self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
-        self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
-
+        top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+        self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text

     def get_about(self):
         try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
         self.get_educations()

         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()

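The reworked date handling in get_experiences can be checked without a browser. A standalone sketch of the same splitting logic, run on a made-up date string (real profile text varies by locale); get_educations differs only in locating the "-" token, so single-token years like "2015 - 2019" still split cleanly:

    def split_work_times(work_times):
        # "Jan 2020 - Mar 2022 · 2 yrs 3 mos" -> ("Jan 2020", "Mar 2022", "2 yrs 3 mos")
        times = work_times.split("·")[0].strip() if work_times else ""
        duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
        from_date = " ".join(times.split(" ")[:2]) if times else ""   # tokens before "-"
        to_date = " ".join(times.split(" ")[3:]) if times else ""     # tokens after "-"
        return from_date, to_date, duration

    print(split_work_times("Jan 2020 - Mar 2022 · 2 yrs 3 mos"))
    # ('Jan 2020', 'Mar 2022', '2 yrs 3 mos')

An empty work_times (the new else branch above) now yields empty dates and a None duration instead of raising.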
35 changes: 31 additions & 4 deletions samples/scrape_person.py
@@ -1,9 +1,36 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()

 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of linkedin urls: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url, driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('RESULTS:')
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        print(f'"{person.name}", '
+              f'"{person.location}", '
+              f'"{experience.position_title}", '
+              f'"{experience.institution_name}", '
+              f'"{experience.linkedin_url}", '
+              f'"{company.industry}", '
+              f'"{company.website}", '
+              f'"{company.company_size}", '
+              )
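A side note on the sample's output: the hand-built rows end with a trailing comma, so anything parsing them as CSV will see an empty final column. If that matters, Python's csv module handles quoting and separators; a sketch assuming the same results list of (person, company) tuples built above:

    import csv
    import sys

    # `results` is assumed to hold the (person, company) tuples from the sample loop
    writer = csv.writer(sys.stdout)
    writer.writerow(["name", "location", "exp_title", "exp_company", "exp_linkedin",
                     "company_industry", "company_website", "company_size"])
    for person, company in results:
        experience = person.experiences[0]
        writer.writerow([person.name, person.location, experience.position_title,
                         experience.institution_name, experience.linkedin_url,
                         company.industry, company.website, company.company_size])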