## Imports

In [352]:
# !pip install pandas webdriver-manager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from bs4 import BeautifulSoup
import random
import pandas as pd
import re
from tqdm import tqdm

## Functions

In [354]:
def get_el_str(el):
    """Return the text content of a parsed HTML element.

    Delegates to ``el.get_text`` with a newline separator and ``strip=True``,
    so callers can split the result on ``'\\n'`` to get individual fragments.
    """
    text = el.get_text(separator='\n', strip=True)
    return text

def random_wait(min=2, max_1=4, max_2=8):
    """Return a random integer delay, used by callers as a ``sleep`` duration.

    The upper bound is first drawn from ``[max_1, max_2]`` (inclusive), then
    the result is drawn from ``[min, upper bound]`` (inclusive).

    Note: the ``min`` parameter shadows the builtin; the name is kept so
    existing keyword callers keep working.
    """
    upper_bound = random.randint(max_1, max_2)
    return random.randint(min, upper_bound)

def get_profile_name(driver):
    """Return the text of the profile-name link on the page loaded in ``driver``.

    The link is located as the first anchor whose ``href`` contains
    ``about-this-profil``. Any exception raised by ``find_element`` (for
    instance when no such anchor exists) propagates to the caller.
    """
    name_link = driver.find_element(
        by=By.CSS_SELECTOR,
        value='a[href*="about-this-profil"]',
    )
    return name_link.text

def get_profile_desc(driver):
    """Return the profile description text, or ``''`` when it cannot be read.

    The description is looked up as the ``div`` whose ``class`` attribute is
    exactly ``text-body-medium break-words``.

    This is a best-effort lookup: any ordinary error (element missing, stale
    page, ...) yields an empty string instead of aborting the scrape.
    """
    try:
        return driver.find_element(
            by=By.CSS_SELECTOR,
            value='div[class="text-body-medium break-words"]',
        ).text
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running scrape.
        return ''

def get_company_from_exp(exp):
    """Return the company text of one experience entry, or ``None``.

    Lookup order:
    1. the first ``<a data-field="experience_company_logo">`` link that
       contains a ``<span>``;
    2. otherwise the first ``div`` with class
       ``display-flex flex-row justify-space-between``.

    Returns ``None`` when neither is present.
    """
    for logo_link in exp.find_all('a', {'data-field': 'experience_company_logo'}):
        if logo_link.find('span'):
            return get_el_str(logo_link)

    # No logo link carried text: fall back to the entry's header row.
    company = exp.find('div', class_='display-flex flex-row justify-space-between')
    if company:
        return get_el_str(company)
    return None
    
def get_detail_from_exp(exp):
    """Return the text of an experience entry's sub-components block.

    The block is the first ``div`` whose class matches
    ``pvs-entity__sub-components``. Returns ``None`` when the entry has no
    such block.
    """
    sub_components = exp.find('div', class_=re.compile('pvs-entity__sub-components'))
    if not sub_components:
        return None
    return get_el_str(sub_components)

def get_experiences_bloc(driver):
    """Collect the experiences list of the profile currently loaded in ``driver``.

    Strategy:
    1. Click the ``navigation-index-see-all-experiences`` element, wait a
       random delay, and use the ``main.scaffold-layout__main`` container of
       the page that opens (``type == 'exp_page'``).
    2. If any of that fails, scan the profile-card sections of the current
       page and use the first one containing an element with id
       ``experience`` (``type == 'profile_page'``).

    Returns:
        dict with
        - ``'type'``: ``'exp_page'`` or ``'profile_page'``;
        - ``'soup'``: a ``BeautifulSoup`` of the inner HTML of the first
          ``<ul>`` in the chosen container. The soup is empty when no
          experience container was found, so downstream parsing yields no
          positions instead of crashing on an unbound variable.
    """
    exp_container = None

    try:
        soup_type = 'exp_page'
        all_xp_btn = driver.find_element(by=By.ID, value='navigation-index-see-all-experiences')
        all_xp_btn.click()
        # Randomised pause before reading the page that the click opened.
        sleep(random_wait())
        exp_container = driver.find_element(by=By.CSS_SELECTOR, value='main[class="scaffold-layout__main"]')
    except Exception:
        # Dedicated experiences page not available: fall back to the
        # experience section embedded in the profile page.
        soup_type = 'profile_page'
        sections = driver.find_elements(by=By.CSS_SELECTOR, value='section[class*="artdeco-card pv-profile-card break-words"]')

        for section in sections:
            try:
                section.find_element(by=By.ID, value='experience')
            except Exception:
                # Not the experience section; try the next card.
                continue
            exp_container = section
            break

    if exp_container is None:
        # Profile exposes no experience section at all.
        html_content = ''
    else:
        html_content = exp_container.find_element(by=By.TAG_NAME, value='ul').get_attribute('innerHTML')

    soup = BeautifulSoup(html_content, features='html.parser')

    return {
        'type': soup_type,
        'soup': soup,
    }

def list_experiences(experience_bloc):
    """Return the flat list of positions found in an experiences soup.

    Args:
        experience_bloc: parsed HTML of the experiences list. It is modified
            in place: every tag whose class matches ``-hidden`` is removed.

    Returns:
        A list of position dicts as produced by ``get_positions_list``, each
        extended with ``'ind_exp'``: the 1-based index of the experience
        entry (``li`` with class matching ``artdeco-list__item``) it came
        from.
    """
    # Strip hidden tags that create duplicate content.
    for el in experience_bloc.find_all(class_=re.compile(pattern='-hidden')):
        el.decompose()

    exp_list = []
    exp_elmts = experience_bloc.find_all('li', class_=re.compile('artdeco-list__item'))

    for ind, exp in enumerate(exp_elmts, start=1):
        positions = get_positions_list(exp)
        for pos in positions:
            pos['ind_exp'] = ind
        exp_list.extend(positions)

    return exp_list

def get_header_from_exp(exp):
    """Return the header text of an experience entry and its kind.

    The entry's ``div`` elements with class
    ``display-flex flex-row justify-space-between`` are collected. More than
    one such div means the entry groups several positions
    (``'multiple'``); otherwise it is treated as ``'single'``.

    Returns:
        dict with
        - ``'header'``: text of the first matching div, or ``''`` when the
          entry has none (instead of raising ``IndexError``);
        - ``'exp_type'``: ``'multiple'`` or ``'single'``.
    """
    exp_parts = exp.find_all('div', class_='display-flex flex-row justify-space-between')
    exp_type = 'multiple' if len(exp_parts) > 1 else 'single'
    header = get_el_str(exp_parts[0]) if exp_parts else ''

    return {
        'header': header,
        'exp_type': exp_type,
    }

def get_multiple_positions_from_exp(exp):
    """Return the positions nested inside a multi-position experience entry.

    Positions are the ``li`` items matched by
    ``div.scaffold-finite-scroll__content > ul > li``.

    Returns:
        A list of dicts ``{'ind': <1-based rank>, 'position': <item text>}``
        in document order.
    """
    position_items = exp.select('div.scaffold-finite-scroll__content > ul > li')
    return [
        {
            'ind': rank,
            'position': get_el_str(item),
        }
        for rank, item in enumerate(position_items, start=1)
    ]

def get_positions_list(exp):
    """Parse one experience entry into a list of position dicts.

    The header text from ``get_header_from_exp`` is split on newlines and
    interpreted by position:

    - ``'multiple'`` entries: header line 0 is the company and line 2 (when
      present) the location. Each nested position contributes
      ``title, contract, duration`` (3+ lines, extras ignored),
      ``title, duration`` (2 lines) or just ``title`` (1 line).
    - ``'single'`` entries: header line 0 is the title, line 1 the company,
      line 2 the duration (3 or 4 lines) and line 3 the location (exactly
      4 lines). ``contract`` is left empty here.

    Missing fields are ``''``. Fields are reset for every position, so a
    short position never inherits values from the previous one.

    Returns:
        list of dicts with keys ``company``, ``title``, ``contract``,
        ``duration``, ``location``, ``exp_type``.
    """
    exp_infos = []

    header = get_header_from_exp(exp)
    exp_type = header['exp_type']
    header_infos = header['header'].split('\n')

    if exp_type == 'multiple':
        company = header_infos[0]
        location = header_infos[2] if len(header_infos) >= 3 else ''

        for pos in get_multiple_positions_from_exp(exp):
            pos_infos = pos['position'].split('\n')

            # Start from blanks for each position (no carry-over).
            title = contract = duration = ''
            if len(pos_infos) >= 3:
                title, contract, duration = pos_infos[:3]
            elif len(pos_infos) == 2:
                title, duration = pos_infos
            else:
                # str.split always yields at least one item.
                title = pos_infos[0]

            exp_infos.append(
                {
                    'company': company,
                    'title': title,
                    'contract': contract,
                    'duration': duration,
                    'location': location,
                    'exp_type': exp_type,
                }
            )

    else:
        title = header_infos[0]
        # A one-line header has no company line; use '' rather than raising.
        company = header_infos[1] if len(header_infos) > 1 else ''
        contract = duration = location = ''

        if len(header_infos) in (3, 4):
            duration = header_infos[2]
        if len(header_infos) == 4:
            location = header_infos[3]

        exp_infos.append(
            {
                'company': company,
                'title': title,
                'contract': contract,
                'duration': duration,
                'location': location,
                'exp_type': exp_type,
            }
        )

    return exp_infos

def clean_company_contract(exp):
    """Split a combined ``"company · contract"`` value in place.

    When ``exp['company']`` contains the separator ``' · '``, the text after
    the last separator is moved to ``exp['contract']`` and the rest stays in
    ``exp['company']``. Dicts without the separator are left untouched.

    Splitting only on the last separator means a value containing more than
    one separator no longer raises ``ValueError``.
    NOTE(review): this assumes the contract is the trailing field - confirm
    against real data.

    Returns:
        None; ``exp`` is modified in place.
    """
    separator = ' · '
    if separator in exp['company']:
        exp['company'], exp['contract'] = exp['company'].rsplit(separator, 1)

## Steps

### Launch driver

In [181]:
# Start the Chrome session shared by every cell below. The chromedriver path
# is whatever ChromeDriverManager().install() returns.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

### Log in to LinkedIn (manually)

### Add profiles to scrape

In [182]:
# Profile URLs to visit, one string per entry. Template:
profiles = [
# 'https://www.linkedin.com/in/xxxx',
# ...
]

### Collect experiences from profiles

In [None]:
# Visit each profile and record its url, name, description and the raw
# experiences soup (plus the page type it was read from).
res = []
for profile in tqdm(profiles):
    driver.get(profile)
    sleep(random_wait())

    name = get_profile_name(driver)
    desc = get_profile_desc(driver)

    tmp = {
        'url': profile,
        'name': name,
        'desc': desc,
    }
    # Adds the 'type' and 'soup' keys.
    tmp.update(get_experiences_bloc(driver))
    res.append(tmp)

    # Random pause before moving on to the next profile.
    sleep(random_wait())


### Extract info and format results

In [356]:
# Flatten every profile's experiences soup into one list of position dicts.
exp_list = []
for tmp in res:
    url = tmp['url']
    exp_bloc = tmp['soup']
    positions_list = list_experiences(exp_bloc)

    # Tag each position with the profile it belongs to.
    for el in positions_list:
        el['url'] = url

    exp_list.extend(positions_list)

# Separate "company · contract" values (in place).
for exp in exp_list:
    clean_company_contract(exp)

### Download results

In [358]:
# One row per position: profile columns joined to the position columns on
# 'url' (default join type, so profiles without any position are dropped).
df = pd.merge(
    pd.DataFrame(res, columns=['url', 'name', 'desc', 'type']),
    pd.DataFrame(exp_list),
    on='url',
)
df

Unnamed: 0,url,name,desc,type,company,title,contract,duration,location,exp_type,ind_exp
0,https://www.linkedin.com/in/filippopesci,Filippo Pesci,BCG | Politecnico di Torino,profile_page,Procter & Gamble,IT Manager,Temps plein,févr. 2021 - août 2021 · 7 mois,"Roma, Lazio, Italia",single,2
1,https://www.linkedin.com/in/filippopesci,Filippo Pesci,BCG | Politecnico di Torino,profile_page,Procter & Gamble,IT Project Manager Intern,Stage,mars 2020 - août 2020 · 6 mois,"Pomezia, Lazio, Italia",single,3
2,https://www.linkedin.com/in/filippopesci,Filippo Pesci,BCG | Politecnico di Torino,profile_page,ComparaSemplice.it,Business Analyst Intern,Stage,févr. 2018 - juil. 2018 · 6 mois,"Roma, Italia",single,4
3,https://www.linkedin.com/in/filippopesci,Filippo Pesci,BCG | Politecnico di Torino,profile_page,The Boston Consulting Group (BCG),Business@school,,sept. 2013 - juin 2014 · 10 mois,"Roma, Italia",single,5
4,https://www.linkedin.com/in/damianooliva,Damiano Oliva,Head Of Digital Marketing presso Secret Key - ...,exp_page,Secret Key - Web Marketing Agency,Head Of Digital Marketing,Temps plein,janv. 2019 - aujourd’hui · 5 ans 6 mois,,multiple,1
...,...,...,...,...,...,...,...,...,...,...,...
212,https://www.linkedin.com/in/alessia-mazzotta-3...,Alessia Mazzotta,Data Protection Officer,exp_page,Comparasemplice Broker,Legal Expert,Temps plein,janv. 2021 - déc. 2021 · 1 an,Roma,multiple,3
213,https://www.linkedin.com/in/alessia-mazzotta-3...,Alessia Mazzotta,Data Protection Officer,exp_page,Comparasemplice Broker,Impiegato amministrativo,Temps plein,juin 2020 - déc. 2021 · 1 an 7 mois,Roma,multiple,3
214,https://www.linkedin.com/in/alessia-mazzotta-3...,Alessia Mazzotta,Data Protection Officer,exp_page,Tribunale Amministrativo Regionale per il Lazi...,Tirocinio extracurriculare,Stage,mars 2018 - sept. 2019 · 1 an 7 mois,Roma,single,4
215,https://www.linkedin.com/in/alessia-mazzotta-3...,Alessia Mazzotta,Data Protection Officer,exp_page,Banca d'Italia - Eurosistema,Stagista,,sept. 2018 - mars 2019 · 7 mois,"Roma, Italia",single,5


In [359]:
# Write the merged table to the working directory, without the index column.
df.to_csv('linkedin_experiences.csv', index=False)

In [None]:
# End the browser session started in the "Launch driver" cell.
driver.quit()

---