[Usable] fixed person.py working as of 17 May #220

Open · wants to merge 8 commits into master
6 changes: 3 additions & 3 deletions linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):

         driver.get(self.linkedin_url)

-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))

         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")

-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()

         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount

         return json.dumps(_output).replace('\n', '')

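For reviewers checking the company.py change by hand, a minimal sketch of the new lookups, assuming an already-logged-in Selenium session; the company URL here is a placeholder, not part of this PR:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome()
    # assumes actions.login(driver, email, password) has already been run
    driver.get("https://www.linkedin.com/company/google/")  # placeholder URL

    # the wait now targets <div dir="ltr"> blocks instead of <span dir="ltr">
    WebDriverWait(driver, 3).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]'))
    )

    # the name now comes from the top-card class rather than the first ltr span
    name = driver.find_element(By.CLASS_NAME, "org-top-card-summary__title").text.strip()
    print(name)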
122 changes: 36 additions & 86 deletions linkedin_scraper/person.py
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@

 class Person(Scraper):

-    __TOP_CARD = "pv-top-card"
+    __TOP_CARD = "scaffold-layout__main"
     __WAIT_FOR_ELEMENT_TIMEOUT = 5

     def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity")
-            company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
+            company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue

             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@
                 company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                 work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                 location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""

             times = work_times.split("·")[0].strip() if work_times else ""
             duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None

             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""

-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
+            if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                   .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                   .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            else:
+                inner_positions = []
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
                 for description in descriptions:
                     res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                     position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
             institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")

             # company elem
# company elem
@@ -214,13 +230,17 @@
             outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
-            degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            if len(outer_positions) > 1:
+                degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            else:
+                degree = None

             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text

-                from_date = " ".join(times.split(" ")[:2])
-                to_date = " ".join(times.split(" ")[3:])
+                if times != "":
+                    from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
+                    to_date = times.split(" ")[-1]
             else:
                 from_date = None
                 to_date = None
@@ -240,10 +260,9 @@
             self.add_education(education)

     def get_name_and_location(self):
-        top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
-        self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
-        self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
-
+        top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+        self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text

     def get_about(self):
         try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
         self.get_educations()

         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()

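The reworked date handling in get_experiences can be checked without a browser. A standalone sketch of the same splitting logic, run on a made-up date string (real profile text varies by locale); get_educations differs only in locating the "-" token, so single-token years like "2015 - 2019" still split cleanly:

    def split_work_times(work_times):
        # "Jan 2020 - Mar 2022 · 2 yrs 3 mos" -> ("Jan 2020", "Mar 2022", "2 yrs 3 mos")
        times = work_times.split("·")[0].strip() if work_times else ""
        duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
        from_date = " ".join(times.split(" ")[:2]) if times else ""   # tokens before "-"
        to_date = " ".join(times.split(" ")[3:]) if times else ""     # tokens after "-"
        return from_date, to_date, duration

    print(split_work_times("Jan 2020 - Mar 2022 · 2 yrs 3 mos"))
    # ('Jan 2020', 'Mar 2022', '2 yrs 3 mos')

An empty work_times (the new else branch above) now yields empty dates and a None duration instead of raising.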
35 changes: 31 additions & 4 deletions samples/scrape_person.py
@@ -1,9 +1,36 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()

 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of linkedin urls: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url, driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('RESULTS:')
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        print(f'"{person.name}", '
+              f'"{person.location}", '
+              f'"{experience.position_title}", '
+              f'"{experience.institution_name}", '
+              f'"{experience.linkedin_url}", '
+              f'"{company.industry}", '
+              f'"{company.website}", '
+              f'"{company.company_size}", '
+              )
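A side note on the sample's output: the hand-built rows end with a trailing comma, so anything parsing them as CSV will see an empty final column. If that matters, Python's csv module handles quoting and separators; a sketch assuming the same results list of (person, company) tuples built above:

    import csv
    import sys

    # `results` is assumed to hold the (person, company) tuples from the sample loop
    writer = csv.writer(sys.stdout)
    writer.writerow(["name", "location", "exp_title", "exp_company", "exp_linkedin",
                     "company_industry", "company_website", "company_size"])
    for person, company in results:
        experience = person.experiences[0]
        writer.writerow([person.name, person.location, experience.position_title,
                         experience.institution_name, experience.linkedin_url,
                         company.industry, company.website, company.company_size])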