# LinkedIn Profile Scraper Project
Returns profile information via a LinkedIn login, combining Selenium to load and parse web pages with an LLM to interpret the parsed content.

In [1]:
# env imports
import warnings, os
warnings.filterwarnings('ignore')
from dotenv import load_dotenv
load_dotenv()

# webscraping imports
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import json

# llm imports
from langchain_ollama import ChatOllama
from langchain_core.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate)
from langchain_core.output_parsers import StrOutputParser

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Ollama connection settings; both are passed to ChatOllama(...) further down
# URL of the Ollama server
base_url='http://localhost:11434'
# Name of the model to run
model='llama3.2'

### Testing Selenium connection and login credentials

In [9]:
# Start a Chrome session and open the LinkedIn login page
driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/login')
# Display the page title as a quick check of which page was loaded
driver.title

'LinkedIn Login, Sign in | LinkedIn'

### Log into the LinkedIn account and uncheck the 'Remember me' box

In [10]:
# Read credentials from the environment (load_dotenv() is called in the imports cell)
linkedin_user = os.getenv('LINKEDIN_USER')
linkedin_pass = os.getenv('LINKEDIN_PASS')
if not linkedin_user or not linkedin_pass:
    # Fail early with a clear message instead of handing None to send_keys()
    raise RuntimeError(
        "LINKEDIN_USER and LINKEDIN_PASS must be set (e.g. in a .env file)"
    )

# find fields and send creds from .env
email = driver.find_element(By.ID, 'username')
email.send_keys(linkedin_user)
password = driver.find_element(By.ID, 'password')
password.send_keys(linkedin_pass)

# Find and uncheck the remember me checkbox.
# This is best effort: a failure is reported and the login still proceeds.
try:
    # Wait for the checkbox to be present in the DOM
    wait = WebDriverWait(driver, 10)
    checkbox = wait.until(EC.presence_of_element_located((By.ID, 'rememberMeOptIn-checkbox')))

    # Check if it's selected and uncheck if needed
    if checkbox.is_selected():
        # Click through JavaScript rather than WebElement.click()
        driver.execute_script("arguments[0].click();", checkbox)
        # Verify the change
        if checkbox.is_selected():
            print("Warning: Checkbox may still be checked")
except Exception as e:
    print(f"Could not find or interact with remember me checkbox: {str(e)}")

# Submit the form that contains the password field
password.submit()

### Optional/For future project:
Scraping job listings would be more useful to me than scraping profiles. See if I can get into the Jobs page and get some listing html.

In [11]:
# Wait for the Jobs link to be clickable
try:
    # Wait up to 10 seconds for the condition below
    wait = WebDriverWait(driver, 10)
    # The selector matches any anchor whose href contains '/jobs/?'
    jobs_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[href*='/jobs/?']")))
    
    # Alternative selectors if the above doesn't work
    # jobs_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Jobs']/ancestor::a")))
    # jobs_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-test-app-aware-link] span[title='Jobs']")))
    
    jobs_button.click()
    print("Successfully clicked Jobs button")
except Exception as e:
    # Any failure (timeout, click error) is reported and the notebook continues
    print(f"Could not find or click Jobs button: {str(e)}")

Successfully clicked Jobs button


In [None]:
# Click the 'Show all' link on the Jobs page
try:
    # Wait for the link instead of looking it up immediately: the page is still
    # loading right after the previous click, so a bare find_element can run
    # before the element exists.
    wait = WebDriverWait(driver, 10)
    show_all_button = wait.until(EC.presence_of_element_located((
        By.XPATH,
        "//a[contains(@class, 'artdeco-button') and .//span[text()='Show all']]"
    )))
    # Click through JavaScript rather than WebElement.click()
    driver.execute_script("arguments[0].click();", show_all_button)
    print("Successfully clicked 'Show all' button")
except Exception as e:
    # Any failure (timeout, click error) is reported and the notebook continues
    print(f"Could not find or click 'Show all' button: {str(e)}")

Successfully clicked 'Show all' button using alternative method


In [16]:
# After navigating to a job listing page
# NOTE(review): clean_text is defined in a later cell of this notebook. On a fresh
# kernel run top-to-bottom, the call below raises NameError, which the except
# clause reports, and job_text is left undefined for the following cell.
try:
    # Wait up to 10 seconds for the job details container to be present
    wait = WebDriverWait(driver, 10)
    job_details_div = wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR, 
        "div.jobs-details__main-content--single-pane"
    )))
    
    # Extract the HTML content
    job_html = job_details_div.get_attribute('innerHTML')
    
    # Parse with BeautifulSoup
    job_soup = BeautifulSoup(job_html, 'lxml')

    # Extract text content
    job_text = clean_text(job_soup.get_text())
    
    print("Successfully extracted job details")
except Exception as e:
    print(f"Could not extract job details: {str(e)}")

Successfully extracted job details


In [19]:
# Preview the first 1000 characters of the cleaned job text
job_text[:1000]

'\nTikTok\nShare\nShow more options\nData Scientist, Product Analytics - USDS\xa0\nMountain View, CA · Reposted 2 weeks ago · Over 100 people clicked apply\n$114K/yr - $177.8K/yrMatches your job preferences, minimum pay preference is 120000.\nApply\nSave\nSave Data Scientist, Product Analytics - USDS\xa0  at TikTok\nData Scientist, Product Analytics - USDS\nTikTok · Mountain View, CA\nApply\nSave\nSave Data Scientist, Product Analytics - USDS\xa0  at TikTok\nShow more options\nHow your profile and resume fit this job\nGet AI-powered advice on this job and more exclusive features with Premium.\nRetry Premium for $0\nTailor my resume to this job\nAm I a good fit for this job?\nHow can I best position myself for this job?\nPeople you can reach out to\nApache Corporation logo\nCompany alumni from Apache Corporation and others in your network\nShow all\nAbout the job\nResponsibilitiesAbout the teamThe Data Science team of the Tech and Product department at TikTok USDS is responsible for bui

In [None]:
### LLM PROMPT ###
# Prompt asking the model to extract structured fields from a job listing.
# NOTE(review): no visible cell formats or sends this prompt, and `template` is
# reassigned by later cells; presumably the `{}` placeholder is meant to receive
# the job text via str.format() — confirm before relying on it.
template = """
Extract and return key job information from the LinkedIn job listing in a structured format.

### Job Listing Data:
{}

### Information to Extract:
- Job Title
- Company
- Location
- Employment Type
- Required Skills
- Job Description
- Requirements
- Benefits

### Extracted Data:
"""

## Scrape desired profile
Now that we've successfully logged in, we can scrape a desired page. Note: LinkedIn limits the number of scrapes per day, so be aware of this!

In [70]:
# Open the target profile and capture the HTML available right after navigation.
# page_source is reassigned in the next cell once the page has been scrolled.
url = 'https://www.linkedin.com/in/kevinjin7'
driver.get(url)
page_source = driver.page_source # html for scraped url

### A delay and scrolling are needed to load the full page; the content is not fully loaded immediately.

In [71]:
# Navigate to profile
url = 'https://www.linkedin.com/in/kevinjin7'
driver.get(url)

# Wait for initial page load and scroll to load dynamic content
wait = WebDriverWait(driver, 10)

SCROLL_STEPS = 10        # increments per pass over the page
MAX_SCROLL_PASSES = 20   # safety cap so a page that keeps growing cannot loop forever

# Wait for the profile sections to be present
try:
    # First wait for any artdeco-card to appear
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'artdeco-card')))

    # Scroll slowly through the page to trigger lazy loading
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(MAX_SCROLL_PASSES):
        # Steps run 1..SCROLL_STEPS so the last step lands on the very bottom
        # of the page (a 0..SCROLL_STEPS-1 range would stop at 90% of the height)
        for step in range(1, SCROLL_STEPS + 1):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);",
                last_height * step / SCROLL_STEPS,
            )
            time.sleep(0.5)

        # Wait for new content
        time.sleep(2)

        # Stop once a full pass no longer increases the page height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Scroll back to top
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(2)

    # Now get the fully loaded page source
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    # Find sections and print count for debugging
    sections = soup.find_all('section', {'class': 'artdeco-card'})
    print(f"Found {len(sections)} profile sections")

except Exception as e:
    # Any failure is reported; `sections` is then not (re)assigned by this cell
    print(f"Error loading profile content: {str(e)}")

Found 16 profile sections


## Preprocessing html to feed to LLM
LLM context window runs into issues with entire page of html. Preprocess via bs4 and clean up before sending to LLM for better results.

In [74]:
# grab text from sections: one string per section found in the parsed page
sections_text = [section.get_text() for section in sections]

In [15]:
import re

def clean_text(text):
    """Normalize whitespace in scraped text and collapse self-repeating lines.

    Runs of newlines and runs of tabs are squeezed to a single character, a tab
    followed by whitespace becomes one space, and whitespace directly after a
    newline is removed. Then every line whose first half equals its second half
    is reduced to one copy of that half (a scraping artifact, see sections[2]).

    Args:
        text: Raw text extracted from the page.

    Returns:
        The cleaned text, with lines joined by a newline.
    """
    # Order matters: each substitution runs on the result of the previous one
    substitutions = (
        (r'\n+', '\n'),
        (r'\t+', '\t'),
        (r'\t\s+', ' '),
        (r'\n\s+', '\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    def _collapse_if_doubled(line):
        # Keep a single half when the line is the same string written twice
        middle = len(line) // 2
        first_half, second_half = line[:middle], line[middle:]
        return first_half if first_half == second_half else line

    return '\n'.join(_collapse_if_doubled(line) for line in text.split('\n'))

In [77]:
# clean sections
# NOTE: sections_text is overwritten with its cleaned version, so re-running this
# cell applies clean_text again to text that has already been cleaned.
sections_text = [clean_text(section) for section in sections_text]

In [83]:
# Report how many sections there are, then display the first one
print("Sections: " + str(len(sections_text)))
sections_text[0]

Sections: 16


"\nKevin Jin\nData Science | Machine Learning Engineer\nApache Corporation\nUniversity of California, Los Angeles\nHouston, Texas, United States\nContact info\nhttps://kevinjin.crd.co/\n291 connections\nOpen to\nAdd profile section\nEnhance profile\nSend profile in a message\nSave to PDF\nSaved items\nActivity\nAbout this profile\nAbout this profile\nEnhance profile\nTell non-profits you're interested in getting involved with your time and skills\nGet started\n"

## Feed text to LLM
Now that the text has been (somewhat) cleaned, it is ready to be fed into the LLM for processing. Note that depending on the LLM, trying to parse the entire profile as a whole may be outside of the context size. In this case, we just process section by section, which solves this issue. However, other scraping projects may have large volumes of data, so it is something to keep in mind.

In [132]:
def ask_llm(llm, prompt, system=None):
    """Send one prompt to the LLM and return the parsed string reply.

    Args:
        llm: Chat model placed in the middle of the chain.
        prompt: Text parsed as a human-message prompt template. The chain is
            invoked with no variables, so it must not contain unfilled
            placeholders.
        system: Optional system message template; when omitted, a default
            LinkedIn-profile-parsing instruction is used.

    Returns:
        The result of running the chain, after StrOutputParser.
    """
    system_message = system
    if system_message is None:
        # Fall back to the default system instruction
        system_message = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who answer LinkedIn profile parsing related 
                                                    user question based on the provided profile text data.""")

    human_message = HumanMessagePromptTemplate.from_template(prompt)
    chat_template = ChatPromptTemplate([system_message, human_message])

    # Prompt -> model -> plain string; no template variables are supplied
    return (chat_template | llm | StrOutputParser()).invoke({})

In [133]:
# Example inputs: the first cleaned section and the label of what to extract from it
context = sections_text[0]
key = 'Name and Headline'

# Extraction prompt with two positional str.format() placeholders:
# the section text first, then the name of the information to extract
template = """
Extract and return the requested information from the LinkedIn profile data in a concise, point-by-point format (up to 5 points). Avoid preambles or any additional context.

### LinkedIn Profile Data:
{}

### Information to Extract:
Extract '{}' in bullet points, limiting the output to 5 points. Provide only the necessary details.
Remember, It is LinkedIn profile data.

### Extracted Data:"""

# NOTE: ask_llm() parses the resulting string as a prompt template, so any literal
# braces in the scraped context would be read as template variables.
prompt = template.format(context, key) # context and key will be formatted into {} placeholders in prompt
prompt

"\nExtract and return the requested information from the LinkedIn profile data in a concise, point-by-point format (up to 5 points). Avoid preambles or any additional context.\n\n### LinkedIn Profile Data:\n\nKevin Jin\nData Science | Machine Learning Engineer\nApache Corporation\nUniversity of California, Los Angeles\nHouston, Texas, United States\nContact info\nhttps://kevinjin.crd.co/\n291 connections\nOpen to\nAdd profile section\nEnhance profile\nSend profile in a message\nSave to PDF\nSaved items\nActivity\nAbout this profile\nAbout this profile\nEnhance profile\nTell non-profits you're interested in getting involved with your time and skills\nGet started\n\n\n### Information to Extract:\nExtract 'Name and Headline' in bullet points, limiting the output to 5 points. Provide only the necessary details.\nRemember, It is LinkedIn profile data.\n\n### Extracted Data:"

In [134]:
# Chat model backed by the Ollama server configured in the imports cell
llm = ChatOllama(base_url=base_url, model=model)

# Explicit system message; the text appears identical to the default that
# ask_llm() builds when no system message is passed
system = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who answer LinkedIn profile parsing related 
                                                    user question based on the provided profile text data.""")

# Run the example prompt for the first section
response = ask_llm(llm, prompt, system) 

In [135]:
# Show the model's reply for the first section
print(response)

* Name: Kevin Jin
* Headline: Data Science | Machine Learning Engineer
* Current Company: Apache Corporation
* University: University of California, Los Angeles
* Location: Houston, Texas, United States


### Get section headers to give to LLM
Not all sections need to be parsed (some only contain links to other profiles and similar items). We can also use the header to tell the LLM what it is looking for. For example, we want to extract the 'about' information from the 'About' section.

In [136]:
# The first section is labeled by hand; each of the next ten sections is keyed
# by the first line of its stripped text.
section_keys = ['Name and Headline'] + [
    section_text.strip().split('\n')[0] for section_text in sections_text[1:11]
]

section_keys

['Name and Headline',
 "Tell non-profits you're interested in getting involved with your time and skills",
 'Analytics',
 'About',
 'Featured',
 'Activity',
 'Experience',
 'Education',
 'Projects',
 'Skills',
 'Interests']

### Generate extracted responses
For each key in section keys, the LLM will extract the relevant info from each matching section.

In [137]:
# Maps each section key to the LLM's extraction for the matching section text.
# If two sections share the same key, the later response overwrites the earlier one.
responses = {}

# zip stops at the shorter input, so only sections that have a key are processed
for k,context in zip(section_keys, sections_text):
    prompt = template.format(context, k)
    # No system message is passed, so ask_llm falls back to its default
    response = ask_llm(llm, prompt)
    responses[k] = response

In [138]:
# Display the collected per-section responses
responses

{'Name and Headline': '* Name: Kevin Jin\n* Title: \n* Current Company: Apache Corporation\n* Past Education: University of California, Los Angeles',
 "Tell non-profits you're interested in getting involved with your time and skills": "• Tell non-profits you're interested in getting involved with your time and skills\n• Get started",
 'Analytics': '* Private to you\n* 61 profile views\n* 0 post impressions\n* 32 search appearances\n* Show all analytics',
 'About': '• Born and raised in Houston, Texas\n• Attended UCLA in Los Angeles for Electrical Engineering\n• Worked in Hwaseong, South Korea after studies\n• Currently working at Apache Corporation\n• Loves to explore new places and try new restaurants',
 'Featured': "• Link\n• Personal Projects: My ML Story\n• Kevin's ML Portfolio\n• Featured \n• I’ve always been a big fan of learning and making connections through what I see and observe",
 'Activity': '* 292 followers\n* Create a post\n* You haven’t posted yet\n* Posts you share will

### Optional: Save to JSON

In [139]:
import json

# Write the per-section responses to a JSON file, indented by 4 spaces
with open('linkedin_profile_data.json', 'w') as f:
    json.dump(responses, f, indent=4)

## Adding another LLM layer to further customize the output

In [140]:
# Second-pass prompt: asks the model to reshape the per-section responses into a
# fixed schema. `{context}` is filled with the serialized `responses` dict.
template = """You are provided with LinkedIn profile data in JSON format.
            Parse the data according to the specified schema, correct any spelling errors,
            and condense the information if possible.

### LinkedIn Profile JSON Data:
{context}

### Schema You need to follow:
You need to extract
Name:
Headline:
About:
Experience:
Education:
Skills:
Projects:
Summary:

Do not return preambles or any other information.
### Parsed Data:"""

# Serialize as real JSON: the prompt promises JSON, whereas formatting the dict
# directly would insert its Python repr (single-quoted keys and strings).
# ensure_ascii=False keeps characters such as bullets and curly quotes readable.
responses_json = json.dumps(responses, indent=2, ensure_ascii=False)

# ask_llm() parses the prompt as a template, so literal braces must be doubled
prompt2 = template.format(context=responses_json).replace("{", "{{").replace("}", "}}") # fixing json formatting
response_2 = ask_llm(llm, prompt=prompt2)

In [141]:
# Show the model's schema-formatted reply
print(response_2)

{
  "Name": "Kevin Jin",
  "Headline": "",
  "About": "Born and raised in Houston, Texas\nAttended UCLA in Los Angeles for Electrical Engineering\nWorked in Hwaseong, South Korea after studies\nCurrently working at Apache Corporation\nLoves to explore new places and try new restaurants",
  "Experience":
  [
    "Data Analyst ",
    "Apache Corporation (Nov 2023 - Present)",
    "Machine Learning Engineering Fellow ",
    "UC San Diego Extended Studies (Apr 2023 - Oct 2023)",
    "Senior Instructor ",
    "CHUNGDAHM Learning (Mar 2021 - Feb 2023)",
    "Online Tutor ",
    "Sparks Academy (Jul 2020 - Feb 2021)"
  ],
  "Education":
  [
    "UCLA",
    "Electrical and Computer Engineering, Computer Science (2015 - 2019)",
    "UC San Diego Extended Studies ",
    "Machine Learning Engineering Bootcamp (May 2023 - Sep 2023)",
    "UCSD Extended Studies ML"
  ],
  "Skills": 
  [
    "LangChain",
    "Data Analyst",
    "Palantir",
    "Apache Corporation"
  ],
  "Projects": 
  [
    "Disney

# NOTE: 
This output may not be that refined. This is because we're using Llama 3.2 (the `llama3.2` model served by Ollama), which is not a very big model. Using a bigger model will generate better results, but for the sake of this demo, we kept it simple. There is a lot of room to improve on this, from better field key extraction to having the LLM better understand and parse fields. It will be interesting to potentially use this in a future project (and it was especially nice to learn Selenium as a side quest).