In [125]:
import numpy as np
import pandas as pd

from requests import get
from bs4 import BeautifulSoup
import os
import re

### For this project, I want to pull in LinkedIn data scientist job descriptions and do some natural language processing analysis

In [140]:
# Define the search URL and request headers, then fetch the first results page.
link = 'https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States'
headers = {'User-Agent': 'data science project'}
# A timeout stops the cell from hanging indefinitely if the server never answers.
response = get(link, headers=headers, timeout=30)
# Fail here on an HTTP 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()

In [141]:
# Sanity-check the request by printing the first 400 characters of the decoded response body.
print(response.text[:400])

<!DOCTYPE html>

    
    
    
    
    

    
    <html lang="en">
      <head>
        <meta name="pageKey" content="d_jobs_guest_search">
          <meta name="linkedin:pageTag" content="urlType=jserp_custom;emptyResult=false">
        <meta name="locale" content="en_US">
        <meta id="config" data-app-version="2.0.939" data-call-tree-id="AAXaqHGBlHbVcHnDI7uBVg==" data-multiproduct-name="j


In [142]:
# Parse the raw response bytes into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [143]:
# Display the text of the page's <title> tag as a quick check that the right page was fetched.
soup.title.string

'13,000+ Data Scientist jobs in Atlanta, Georgia, United States (153 new)'

In [144]:
# Display (as the cell result, not via print) the first 400 characters of the re-indented HTML.
soup.prettify()[:400]

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta content="d_jobs_guest_search" name="pageKey"/>\n  <meta content="urlType=jserp_custom;emptyResult=false" name="linkedin:pageTag"/>\n  <meta content="en_US" name="locale"/>\n  <meta data-app-version="2.0.939" data-browser-id="67fcb5e8-c8ca-45cd-8a89-e5e2edc6f1c1" data-call-tree-id="AAXaqHGBlHbVcHnDI7uBVg==" data-enable-core-web-tracking="" data-enable-p'

In [145]:
# Collect every <a> tag carrying the 'base-card__full-link' class
# (one per job card, judging by the output below) and preview the first three.
listings = soup.find_all('a', class_='base-card__full-link')
listings[:3]

[<a class="base-card__full-link" data-tracking-client-ingraph="" data-tracking-control-name="public_jobs_jserp-result_search-card" data-tracking-will-navigate="" href="https://www.linkedin.com/jobs/view/data-scientist-decision-analytics-at-the-home-depot-2975499059?refId=7xJrn84%2FhF5oygquxTnkZA%3D%3D&amp;trackingId=MzjXcVvOk0BqvdUdJz7WTg%3D%3D&amp;position=1&amp;pageNum=0&amp;trk=public_jobs_jserp-result_search-card">
 <span class="screen-reader-text">
             
         
         Data Scientist, Decision Analytics
       
       
           </span>
 </a>,
 <a class="base-card__full-link" data-tracking-client-ingraph="" data-tracking-control-name="public_jobs_jserp-result_search-card" data-tracking-will-navigate="" href="https://www.linkedin.com/jobs/view/junior-data-scientist-at-the-coca-cola-company-2973139481?refId=7xJrn84%2FhF5oygquxTnkZA%3D%3D&amp;trackingId=iPMVYXT70dOh%2BmGL65VQHA%3D%3D&amp;position=2&amp;pageNum=0&amp;trk=public_jobs_jserp-result_search-card">
 <span class

In [146]:
# Count the listing anchors found on this single results page.
len(listings)

25

In [147]:
# Take the first listing anchor as a sample for working out the parsing steps, and display it.
listing = listings[0]
listing

<a class="base-card__full-link" data-tracking-client-ingraph="" data-tracking-control-name="public_jobs_jserp-result_search-card" data-tracking-will-navigate="" href="https://www.linkedin.com/jobs/view/data-scientist-decision-analytics-at-the-home-depot-2975499059?refId=7xJrn84%2FhF5oygquxTnkZA%3D%3D&amp;trackingId=MzjXcVvOk0BqvdUdJz7WTg%3D%3D&amp;position=1&amp;pageNum=0&amp;trk=public_jobs_jserp-result_search-card">
<span class="screen-reader-text">
            
        
        Data Scientist, Decision Analytics
      
      
          </span>
</a>

In [134]:
# The job role is the anchor's text content with the surrounding whitespace/newlines stripped.
role = listing.get_text().strip()

In [148]:
# The job's URL is the anchor's href attribute.
# NOTE: this rebinds `link`, which previously held the search URL defined near the top of the notebook.
link = listing['href']
link

'https://www.linkedin.com/jobs/view/data-scientist-decision-analytics-at-the-home-depot-2975499059?refId=7xJrn84%2FhF5oygquxTnkZA%3D%3D&trackingId=MzjXcVvOk0BqvdUdJz7WTg%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card'

In [149]:
# helper: turn one listing anchor into a dictionary holding its role and link
def parse_listing_info(listing):
    """Return {'role': <stripped anchor text>, 'link': <anchor href>} for one listing."""
    return {
        'role': listing.get_text().strip(),
        'link': listing['href'],
    }

In [150]:
# parse every listing anchor into a record and assemble the records into a DataFrame
# with one row per listing ('role' and 'link' columns)
listing_info = pd.DataFrame(list(map(parse_listing_info, listings)))
listing_info

Unnamed: 0,role,link
0,"Data Scientist, Decision Analytics",https://www.linkedin.com/jobs/view/data-scient...
1,Junior Data Scientist,https://www.linkedin.com/jobs/view/junior-data...
2,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
3,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
4,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
5,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
6,Data Scientist - Associate,https://www.linkedin.com/jobs/view/data-scient...
7,Associate Data Scientist,https://www.linkedin.com/jobs/view/associate-d...
8,"Associate Data Scientist, Decision Analytics",https://www.linkedin.com/jobs/view/associate-d...
9,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...


### Each page has 25 results, and from page to page the link changes only by appending a `&start=<offset>` query parameter (25, 50, ...), so I am going to loop through the first 10 pages to gather 250 results.

In [152]:
# preview the paginated search URLs (offsets 25, 50, ..., 225) before requesting them
for i in range(25, 250, 25):
    link = ''.join([
        'https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=',
        str(i),
    ])
    print(link)

https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=25
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=50
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=75
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=100
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=125
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States&start=150
https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20sci

In [164]:
# create a function to return soup object
def get_soup(link, timeout=30):
    """Download `link` and return its HTML parsed into a BeautifulSoup object.

    Parameters
    ----------
    link : str
        URL to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'html.parser' backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not answer within `timeout` seconds.
    """
    response = get(link, headers={'user-agent': 'data science project'}, timeout=timeout)
    # Surface HTTP errors here; otherwise an error page would be parsed and
    # simply yield zero listings with no hint of what went wrong.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

# extract the job title and URL from a single listing anchor
# NOTE: this re-defines parse_listing_info from an earlier cell with the same behaviour
def parse_listing_info(listing):
    """Return a dict with the listing's stripped text ('role') and its href ('link')."""
    job_title = listing.get_text().strip()
    job_url = listing['href']
    return dict(role=job_title, link=job_url)

# create a function to get the first page results and loop through the following pages to gather 250 results
def get_job_listings(search_link=None, n_pages=10, page_size=25):
    """Scrape several pages of LinkedIn search results into one DataFrame.

    Parameters
    ----------
    search_link : str, optional
        Search URL of the first results page. It must already contain a query
        string, because later pages are requested by appending '&start=<offset>'.
        Defaults to the Atlanta "data scientist" search used in this notebook.
    n_pages : int, optional
        Number of result pages to request, including the first (default 10).
    page_size : int, optional
        Offset step between consecutive pages (default 25).

    Returns
    -------
    pandas.DataFrame
        One row per listing anchor found, with columns 'role' and 'link'.
        The frame is empty (but still has both columns) if nothing was found.
    """
    if search_link is None:
        search_link = 'https://www.linkedin.com/jobs/search/?distance=25&geoId=106224388&keywords=data%20scientist&location=Atlanta%2C%20Georgia%2C%20United%20States'

    records = []
    for page_index in range(n_pages):
        if page_index == 0:
            page_link = search_link
        else:
            # Build every page URL from the unchanged base link. Appending to the
            # previous page's URL would accumulate offsets ('...25', '...2550', ...)
            # and omitting '&start=' would glue the number onto the location value.
            page_link = search_link + '&start=' + str(page_index * page_size)

        soup = get_soup(page_link)
        listings = soup.find_all('a', class_='base-card__full-link')
        # Collect plain records and build the DataFrame once at the end,
        # rather than re-copying a growing frame with pd.concat on every page.
        records.extend(parse_listing_info(listing) for listing in listings)

        if page_index > 0:
            # progress marker, printed once per page after the first
            print('iteration')

    return pd.DataFrame(records, columns=['role', 'link'])

In [165]:
# Run the scraper (this issues live HTTP requests) and display the resulting DataFrame.
datascience_jobs = get_job_listings()
datascience_jobs

iteration
iteration
iteration
iteration
iteration
iteration
iteration
iteration
iteration


Unnamed: 0,role,link
0,"Data Scientist, Decision Analytics",https://www.linkedin.com/jobs/view/data-scient...
1,Junior Data Scientist,https://www.linkedin.com/jobs/view/junior-data...
2,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
3,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
4,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
...,...,...
245,Data Scientist,https://www.linkedin.com/jobs/view/data-scient...
246,"Sr Data Scientist, Content Science",https://www.linkedin.com/jobs/view/sr-data-sci...
247,"Data Scientist, Revenue Acceleration, Google C...",https://www.linkedin.com/jobs/view/data-scient...
248,Data Scientist - 100% Virtual,https://www.linkedin.com/jobs/view/data-scient...
