## Project Name
> LinkedIn job scraping

### General info
* Scrapes the LinkedIn job-search page defined in `start_url`
* Writing results to pandas df
    * df is saved to _*.h5_ file for later use/comparison
    * example: _linkedin-Wednesday08July202007-11-28-01.h5_
* At the end, the most recent previous scrape is compared with the current one and the difference is displayed, so the user gets a concise summary of the new jobs


### Technologies
* Python, 
* BeautifulSoup, 
* pandas, 
* re

In [1]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

import pandas as pd
import numpy  as np
import re
import datetime
import os,glob

In [2]:
# Search-results URL to scrape; its query string carries the search keyword (biotech) and the place (Berlin)
start_url='https://de.linkedin.com/jobs/search?keywords=biotech&location=Berlin&trk=public_jobs_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0'

In [3]:
# preparing list of existing (previously created by scraping) pandas linkedin .h5 files
search_dir = "/home/lubuntu/output/" #directory where *.h5 files are stored

def list_per_time(search_dir=search_dir):
    """Return the saved ``linkedin*.h5`` files in *search_dir*, oldest first.

    Parameters
    ----------
    search_dir : str
        Directory to look in. A trailing path separator is optional.

    Returns
    -------
    list of str
        Paths of the regular files matching ``linkedin*.h5``, sorted by
        modification time in ascending order (most recent last).

    The first ten paths are also printed as a quick overview.
    """
    # os.path.join instead of "+" so that a directory given without a
    # trailing separator still yields a valid pattern.
    pattern = os.path.join(search_dir, "linkedin*.h5")
    files_sorted = sorted(
        filter(os.path.isfile, glob.glob(pattern)),
        key=os.path.getmtime,
    )
    print(files_sorted[:10])
    return files_sorted

In [4]:
# get source code of the page
def get_url(url):
    """Download *url* and return the raw response body.

    Parameters
    ----------
    url : str or urllib.request.Request
        Address (or prepared request) to open.

    Returns
    -------
    bytes
        The undecoded body of the response.
    """
    # The file imports ``urlopen`` by name; the bare ``urllib`` package name
    # is not bound, so ``urllib.request.urlopen`` would raise NameError.
    # The ``with`` block closes the response once it has been read.
    with urlopen(url) as response:
        return response.read()

In [5]:
# fetch a page and parse it into a BeautifulSoup tree

def beautify(url):
    """Fetch *url* and return its content parsed with ``html.parser``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document tree of the downloaded page.
    """
    # NOTE(review): the browser-like User-Agent is presumably needed because
    # the site rejects urllib's default one -- confirm.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    # Parse inside the ``with`` block so the HTTP response is closed as soon
    # as the markup has been consumed instead of being left open.
    with urlopen(req) as page:
        return BeautifulSoup(page, "html.parser")

In [6]:
# Fetch and parse the search-results page; later cells read the job counts and job links from this tree
jobs = beautify(start_url)

In [7]:
# getting general information about overall number of jobs and new jobs

spans = jobs.find_all('span', {'class' : 'results-context-header__job-count'})
# Keep only the digits so that decorations around the number (separators,
# a trailing "+", whitespace) do not break the int() conversion.
jobs_nr = int(re.sub(r'\D', '', spans[0].get_text()))
spans_new = jobs.find_all('span', {'class' : 'results-context-header__new-jobs'})
jobs_new = [span.get_text() for span in spans_new]
# Take the first whole run of digits from the "new jobs" text. Indexing a
# single character (text[1]) would truncate any count above 9.
new_match = re.search(r'\d+', jobs_new[0]) if jobs_new else None
if new_match is None:
    print('Jobs summary all/no new jobs--->',jobs_nr)
else:
    print('Jobs summary all/new  --->',jobs_nr,'/', int(new_match.group()))

Jobs summary all/new  ---> 99 / 9


In [8]:
# scraping details from the webpage of a particular job listed in the previous summary
def get_job_record(url):
    """Scrape a single job page and return its fields as a tuple.

    Parameters
    ----------
    url : str
        Address of one job posting page; it is fetched through ``beautify``.

    Returns
    -------
    tuple
        ``(job_name, company_name, place_name, posted_ago, job_descr,
        timestamp)``.

        * ``posted_ago`` is ``''`` when neither posted-time span is present.
        * ``job_descr`` is ``'XXX '`` followed by the text of every ``<ul>``
          in the description container, each followed by ``' XXX '``. When
          the container has no ``<ul>``, it is ``' XXX '`` followed by the
          container's full text (or by nothing if the container is missing).
        * ``timestamp`` is ``datetime.datetime.now()`` taken during the call.

    Raises
    ------
    IndexError
        If the title, company or location element is missing from the page.
    """
    page = beautify(url)
    job_name = page.select('h1.topcard__title')[0].text.strip()
    company_name = page.find_all('span', {'class':'topcard__flavor'})[0].get_text()
    place_name = page.find_all('span', {'class':'job-result-card__location'})[0].get_text()

    # When a job is new its posted-time span carries an additional "--new"
    # class, so fall back to that variant if the regular one is absent.
    posted_tags = (
        page.find_all('span', {'topcard__flavor--metadata posted-time-ago__text'})
        or page.find_all('span', {'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})
    )
    # Neither variant found: report an empty string instead of IndexError.
    posted_ago = posted_tags[0].get_text() if posted_tags else ''

    timestamp = datetime.datetime.now()

    # extra separator for the future to be flexible when manipulating job descriptions
    xx = ' XXX '
    # Look the description container up once and tolerate its absence.
    description = page.find('div', {'class' : 'show-more-less-html__markup'})
    bullet_lists = description.find_all('ul') if description is not None else []
    if bullet_lists:
        job_descr = 'XXX ' + ''.join(ul.text + xx for ul in bullet_lists)
    else:
        # No bullet lists: fall back to the whole description text.
        job_descr = xx + (description.text if description is not None else '')

    return job_name, company_name, place_name, posted_ago, job_descr, timestamp
    

In [9]:
# anchors whose href starts with the job-view URL prefix, i.e. links to the pages of the selected jobs
j_links = jobs.findAll(
    'a',
    attrs={'href': re.compile("^https://de.linkedin.com/jobs/view/")},
)
# plain list of the href values of those anchors
jobs_links = [anchor.get('href') for anchor in j_links]

In [10]:
# =================================temporary tables to store info from webpages
# one independent empty list per scraped field
job, company, place, posted, job_d, time_s = ([] for _ in range(6))

In [11]:
#================================ populating the temp tables with data from particular jobs
def populate_temp_tables(url_list):
    """Scrape every URL in *url_list* and append its fields to the temp lists.

    For each URL the record returned by ``get_job_record`` is appended,
    field by field, to the module-level lists ``job``, ``company``,
    ``place``, ``posted``, ``job_d`` and ``time_s``. A numbered progress
    line is printed per URL.

    Returns the six lists as a tuple, in the order listed above.
    """
    columns = (job, company, place, posted, job_d, time_s)
    for position, url in enumerate(url_list, start=1):
        # Unpack explicitly so that a malformed record fails before any
        # list is touched.
        title, firm, location, age, description, saved_at = get_job_record(url)
        print(position, '-->', url)
        record = (title, firm, location, age, description, saved_at)
        for column, value in zip(columns, record):
            column.append(value)
    return columns
    

In [12]:
# Scrape every collected job link; this appends to the module-level temp lists (job, company, place, posted, job_d, time_s)
populate_temp_tables(jobs_links)

1 --> https://de.linkedin.com/jobs/view/biopharma-strategy-consultant-at-scitaris-1901548288?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=1&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
2 --> https://de.linkedin.com/jobs/view/emea-training-development-lead-m-f-d-at-alimera-sciences-1901604842?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=2&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
3 --> https://de.linkedin.com/jobs/view/salesperson-%E2%80%93-nucleic-acid-chemistry-germany-eu-at-emp-biotech-gmbh-1926071176?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=3&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
4 --> https://de.linkedin.com/jobs/view/business-development-manager-biopharma-emea-at-twist-bioscience-1857199294?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=4&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
5 --> https://de.linkedin.com/jobs/view/bio-scientist-schwerpunkt-antik%C3%B6rper

(['Biopharma Strategy Consultant',
  'EMEA Training & Development Lead (m/f/d)',
  'Salesperson – Nucleic Acid Chemistry (Germany/ EU)',
  'Business Development Manager, Biopharma - EMEA',
  'BIO-SCIENTIST, SCHWERPUNKT ANTIKÖRPER',
  'Medical Affairs Manager',
  'Automatisierungsexperte Pharma',
  'Head of Marketing Hämatologie (m/w/d)',
  'Solution Consultant (Remote)',
  'Process Validation Specialist (Berlin) (m/w/d)',
  'EMEA Training & Development Lead (m/f/d)',
  'Implementation Consultant (Remote)',
  'Head of External Activation and Identity Management',
  'Sales Representative / Pharmareferent (m/f/x)',
  'Natural Language Processing Engineer',
  'Kommunikationsdesigner Pharma Relations (m/w/d)',
  'Solution Consultant (Remote)',
  'Gesamtentwicklungsleiter (m/w/d) - Mitglied der Geschäftsleitung',
  'Lead Quantitative Methodology Statistician (m/f/d)',
  'Projekt Manager Biotech (m/w/d)',
  'Projekt Manager Biotech (m/w/d)',
  'CMC Analytical development project manager - Gro

In [13]:
# ====================creating pandas dataframe with all the temp tables populated before

In [14]:
# one column per temp list, plus the link each record was scraped from
jobs_linkedin = pd.DataFrame(
    {
        'posted_ago': posted,
        'when_saved': time_s,
        'job_title': job,
        'company': company,
        'location': place,
        'job_descr': job_d,
        'link': jobs_links,
    }
)

In [15]:
# print the column names, non-null counts and dtypes of the freshly built frame
jobs_linkedin.info() #double checking 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   posted_ago  25 non-null     object        
 1   when_saved  25 non-null     datetime64[ns]
 2   job_title   25 non-null     object        
 3   company     25 non-null     object        
 4   location    25 non-null     object        
 5   job_descr   25 non-null     object        
 6   link        25 non-null     object        
dtypes: datetime64[ns](1), object(6)
memory usage: 1.5+ KB


In [16]:
# generating file name with timestamp info for later comparison
def gen_h5_file_with_timestamp(name='name', timestamp=None):
    """Build an ``.h5`` file name that embeds a timestamp.

    Parameters
    ----------
    name : str
        Prefix placed in front of the timestamp.
    timestamp : datetime.datetime, optional
        Moment to encode in the name. Defaults to the current local time,
        which is what the function always used before this parameter
        existed.

    Returns
    -------
    str
        ``<name><Weekday><DD><Month><YYYY><MM>-<HH>-<MM>-<SS>.h5``, for
        example ``linkedin-Wednesday08July202007-20-46-17.h5``.
    """
    if timestamp is None:
        timestamp = datetime.datetime.now()
    # The numeric month (%m) directly after the year is redundant, but it is
    # kept so that new names match the files already produced.
    return name + timestamp.strftime("%A%d%B%Y%m-%H-%M-%S") + ".h5"


In [17]:
# build the timestamped output file name with the 'linkedin-' prefix and display it
file_name= gen_h5_file_with_timestamp(name='linkedin-')
file_name

'linkedin-Wednesday08July202007-20-46-17.h5'

In [18]:
# saving dataframe in .h5 file for later use
# os.path.join keeps the path valid whether or not search_dir ends with a separator
jobs_linkedin.to_hdf(os.path.join(search_dir, file_name), key='df', mode='w')

In [19]:
# loading the two most recently modified saved dfs for comparison
def read_two_last_dfs():
    """Load the two newest saved scrapes as DataFrames.

    The candidate files come from ``list_per_time()``, which orders them by
    modification time with the newest last.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)``, where ``df2`` is read from the newest file and
        ``df1`` from the one before it. When only one file exists, a notice
        is printed and both frames are read from that same file.

    Raises
    ------
    FileNotFoundError
        If no saved file exists at all.
    """
    files_sorted = list_per_time()
    if not files_sorted:
        # Fail with an explicit message rather than an IndexError further down.
        raise FileNotFoundError('No linkedin*.h5 files found to load')
    if len(files_sorted) < 2:
        print('There is no recent .h5 files to compare')
        previous_path = latest_path = files_sorted[-1]
    else:
        previous_path, latest_path = files_sorted[-2:]
    df1 = pd.read_hdf(previous_path, 'df')
    df2 = pd.read_hdf(latest_path, 'df')
    return df1, df2

In [20]:
# df1 comes from the second most recent saved file and df2 from the most recent one (both from the same file when only one exists)
df1, df2=read_two_last_dfs()

['/home/lubuntu/output/linkedin-Wednesday08July202007-12-26-23.h5', '/home/lubuntu/output/linkedin-Wednesday08July202007-12-30-06.h5', '/home/lubuntu/output/linkedin-Wednesday08July202007-13-59-17.h5', '/home/lubuntu/output/linkedin-Wednesday08July202007-20-46-17.h5']


In [21]:
# overview of loaded dfs
# info() prints its report and returns None, so the cell's own value is (None, None)
df1.info(), df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 0 to 10
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   posted_ago  11 non-null     object        
 1   when_saved  11 non-null     datetime64[ns]
 2   job_title   11 non-null     object        
 3   company     11 non-null     object        
 4   location    11 non-null     object        
 5   job_descr   11 non-null     object        
 6   link        11 non-null     object        
dtypes: datetime64[ns](1), object(6)
memory usage: 704.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   posted_ago  25 non-null     object        
 1   when_saved  25 non-null     datetime64[ns]
 2   job_title   25 non-null     object        
 3   company     25 non-null     object        
 4   locatio

(None, None)

In [22]:
# boolean mask over df2 rows: True where the job_title also occurs in df1,
# False where it does not (i.e. the title is new/different in df2)
df2['job_title'].isin(df1['job_title'])

0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19     True
20     True
21    False
22    False
23    False
24    False
Name: job_title, dtype: bool

In [23]:
# overview of the df2 rows whose job_title does not occur in df1 (~ negates the isin mask)
df2.loc[~df2['job_title'].isin(df1['job_title'])]

Unnamed: 0,posted_ago,when_saved,job_title,company,location,job_descr,link
0,Vor 1 Woche,2020-07-08 20:45:16.667374,Biopharma Strategy Consultant,Scitaris,"Berlin, Deutschland",XXX Consultant (PhD-entry Level) XXX ﻿Minimum ...,https://de.linkedin.com/jobs/view/biopharma-st...
1,Vor 4 Wochen,2020-07-08 20:45:17.542857,EMEA Training & Development Lead (m/f/d),Alimera Sciences,Berlin,"XXX Degree level education required, prefera...",https://de.linkedin.com/jobs/view/emea-trainin...
3,Vor 3 Monaten,2020-07-08 20:45:19.747807,"Business Development Manager, Biopharma - EMEA",Twist Bioscience,Berlin,XXX Prospect and generate leads of potential ...,https://de.linkedin.com/jobs/view/business-dev...
4,Vor 1 Woche,2020-07-08 20:45:20.914567,"BIO-SCIENTIST, SCHWERPUNKT ANTIKÖRPER",diamond inventics,Berlin,XXX Projektkoordination und -leitung im Bereic...,https://de.linkedin.com/jobs/view/bio-scientis...
5,Vor 6 Tagen,2020-07-08 20:45:22.204992,Medical Affairs Manager,BioTalent,Berlin,XXX Berlin-based Biotech recruiting for a rem...,https://de.linkedin.com/jobs/view/medical-affa...
6,Vor 2 Wochen,2020-07-08 20:45:23.789976,Automatisierungsexperte Pharma,TRS Staffing Solutions,"Berlin, Deutschland",XXX Unterstützung der Entwicklung der Automat...,https://de.linkedin.com/jobs/view/automatisier...
7,Vor 1 Tag,2020-07-08 20:45:24.878593,Head of Marketing Hämatologie (m/w/d),Takeda,"Berlin, Deutschland",XXX Betonung der Förderung des Erfolgs des HEM...,https://de.linkedin.com/jobs/view/head-of-mark...
8,Vor 8 Monaten,2020-07-08 20:45:25.919945,Solution Consultant (Remote),Veeva Systems,Berlin,XXX Pre-sales supportSolution presentations & ...,https://de.linkedin.com/jobs/view/solution-con...
9,Vor 7 Tagen,2020-07-08 20:45:27.002177,Process Validation Specialist (Berlin) (m/w/d),NextPharma,Berlin,XXX Process ValidationSpecialist(m/w/d)Stando...,https://de.linkedin.com/jobs/view/process-vali...
10,Vor 4 Wochen,2020-07-08 20:45:28.625712,EMEA Training & Development Lead (m/f/d),Alimera Sciences,"Berlin, Deutschland",XXX Develop and manage a field sales onboardi...,https://de.linkedin.com/jobs/view/emea-trainin...


In [24]:
# df2.link.values[~df2['job_title'].isin(df1['job_title'])]

In [25]:
# list(df2.link.values[~df2['job_title'].isin(df1['job_title'])])[0]

In [27]:
# numbered list of links for the df2 rows whose job_title is absent from df1
print('=============links to new jobs since last checking=============\n')
is_new = ~df2['job_title'].isin(df1['job_title'])
for position, link in enumerate(df2.link.values[is_new], start=1):
    print(position, '-->', link)


1 --> https://de.linkedin.com/jobs/view/biopharma-strategy-consultant-at-scitaris-1901548288?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=1&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
2 --> https://de.linkedin.com/jobs/view/emea-training-development-lead-m-f-d-at-alimera-sciences-1901604842?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=2&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
3 --> https://de.linkedin.com/jobs/view/business-development-manager-biopharma-emea-at-twist-bioscience-1857199294?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=4&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
4 --> https://de.linkedin.com/jobs/view/bio-scientist-schwerpunkt-antik%C3%B6rper-at-diamond-inventics-1866259218?refId=aec4ae5e-7868-4f8e-a56d-b652d3688546&position=5&pageNum=0&trk=public_jobs_job-result-card_result-card_full-click
5 --> https://de.linkedin.com/jobs/view/medical-affairs-manager-at-biotalent-1930470218?refId