# Исследование по теме наставничества. Часть I - Парсер

**Информация о компании**\
Отрасль и направления деятельности: EdTech, сервис онлайн-образования

**Общее описание задачи**\
Провести исследование по теме настаничества и менторства на основании контента социальной сети Linkedin, размещенного в открытом доступе, созданнного целевой аудиторией

**Цели исследования**
- Определить топ-10 тем в направлении наставничества на основании наибольшего охвата, используя теги `Наставничество`, `менторство`, `коучинг`, `mentorship`, `mentor`, `coaching`, `buddy`.
- Определить топ-10 тем по просмотрам, реакциям: лайкам, комментариям, репостам среди IT-специалистов, подходящих под описание целевой аудитории исследования
- Дополнить профили целевой аудитории новыми параметрами

**Требования к результату**
- Собранный датасет в виде CSV или JSON файла (не ссылки)
- Презентация в виде ссылки на Google Slides
- Ссылка на код проекта размещенного на GitHub

## Установка библиотек

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import time
import re
import random
import json

import requests
from bs4 import BeautifulSoup
import base64
import lxml

import undetected_chromedriver as uc

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

Установим параметры для браузера парсера, возьмем логин и пароль для Linkedin из `licreds.txt` 

In [50]:
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0'}

# login password saved in licreds.txt
with open("licreds.txt") as f:
    lines = f.readlines()
    USER_LOGIN, USER_PASSWORD = lines[0], lines[1]

Готовим функции для сбора общей информации из профиля, попутно собираем немного сырой информации(вдруг пригодится), и посты с реакциями

In [51]:
def get_and_print_profile_info(driver, profile_url):
    time.sleep(random.uniform(3,5))

    driver.get(profile_url)        # this will open the link

    # Extracting data from page with BeautifulSoup
    src = driver.page_source

    # Now using beautiful soup
    soup = BeautifulSoup(src, 'lxml') # 'lxml'  "html.parser"

    # Extracting the HTML of the complete introduction box
    # that contains the name, company name, and the location
    intro = soup.find('div', {'class': 'pv-text-details__left-panel'})
    
    # collect some additional raw info
    expa = soup.findAll('span', {'class': 'visually-hidden'})
    experience_list = [x.text for x in expa[:]]

    # In case of an error, try changing the tags used here.
    name_loc = intro.find("h1")

    # Extracting the Name
    name = name_loc.get_text().strip()
    # strip() is used to remove any extra blank spaces

    works_at_loc = intro.find("div", {'class': 'text-body-medium'})

    # this gives us the HTML of the tag in which the Company Name is present
    # Extracting the Company Name
    works_at = works_at_loc.get_text().strip()

    POSTS_URL_SUFFIX = 'recent-activity/all/'

    time.sleep(random.uniform(3,7))

    # Get current url from browser
    cur_profile_url = driver.current_url

    # Parse posts
    posts = get_and_print_user_posts(driver, cur_profile_url + POSTS_URL_SUFFIX)
    
    return cur_profile_url, name, works_at, experience_list, posts

In [52]:
def get_and_print_user_posts(driver, posts_url):
    driver.get(posts_url)

    #Simulate scrolling to capture all posts
    SCROLL_PAUSE_TIME = random.uniform(3,7)

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    # We can adjust this number to get more posts
    NUM_SCROLLS = random.randint(6,10)

    for i in range(NUM_SCROLLS):
        time.sleep(random.uniform(4,6))
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parsing posts
    src = driver.page_source

    # Now using beautiful soup
    soup = BeautifulSoup(src, 'lxml')
    # soup.prettify()

    posts = soup.find_all('li', class_='profile-creator-shared-feed-update__container')

    all_posts_with_react = []

    for post_src in posts:
        posts_reactions = {} #  post:reaction
        post_text_div = post_src.find('div', {'class': 'feed-shared-update-v2__description-wrapper mr2'})

        # if post_text_div is None:
        #     print(post_src)

        if post_text_div is not None:
            post_text = post_text_div.find('span', {'dir': 'ltr'})
        else:
            post_text = None

        # If post text is found
        if post_text is not None:
            post_text = post_text.get_text().strip()
            #print(f'Post text: {post_text}')

        reaction_cnt = post_src.find('span', {'class': 'social-details-social-counts__reactions-count'})

        # If number of reactions is written as text
        # It has different class name
        if reaction_cnt is None:
            reaction_cnt = post_src.find('span', {'class': 'social-details-social-counts__social-proof-text'})

        if reaction_cnt is not None:
            reaction_cnt = reaction_cnt.get_text().strip()
            #print(f'Reactions: {reaction_cnt}')

        posts_reactions[str(post_text)] = str(reaction_cnt)

        all_posts_with_react.append(posts_reactions)

    return all_posts_with_react

Запускаем браузер со страницей авторизации

In [54]:
# start Chrome browser
driver = uc.Chrome(headers=HEADERS)

# Opening linkedIn's login page
# NOTE: We need to turn of 2 step authentification
driver.get("https://linkedin.com/uas/login")

Вводим учетные данные

In [55]:
# waiting for the page to load
time.sleep(random.uniform(3,7))

# entering username
username = driver.find_element(By.ID, "username")

# Enter Your Email Address
username.send_keys(USER_LOGIN)

time.sleep(random.uniform(1,3))
# entering password
pword = driver.find_element(By.ID, "password")

# Enter Your Password
pword.send_keys(USER_PASSWORD)

# Clicking on the log in button
driver.find_element(By.XPATH, "//button[@type='submit']").click()

Доступных анкет на каждой странице варьируется от 2-3 до 7-8. Ограничение на парсинг 100 анкет потом релогин. Способ по снятию ограничения таймера, во второй вкладке проявлять активность: смотреть видео, проходить тесты....

In [56]:
# creat 
profile_urls = []

NUM_PAGES_TO_PARSE = 50  # num pages to parsed

Обходим все страницы поиска и сохраняем с них анкеты

In [57]:
# Iterate over pages of search results
# to collect profile urls
for i in range(NUM_PAGES_TO_PARSE):
    driver.get(f'https://www.linkedin.com/search/results/people/?geoUrn=%5B%22101728296%22%5D&keywords=qa%20engineer&origin=FACETED_SEARCH&page={i+1}&sid=SJb&titleFreeText=senior')

    search_result_links = driver.find_elements(By.CSS_SELECTOR, "div.entity-result__item a.app-aware-link")

    for link in search_result_links:
        href = link.get_attribute("href")
        if 'linkedin.com/in' in href:
            profile_urls.append(href)
    time.sleep(random.uniform(5,7))
    # next_button = driver.find_element(By.CLASS_NAME,'artdeco-pagination__button--next')
    # next_button.click()
    print(i)

profile_urls = list(set(profile_urls))
print(len(set(profile_urls)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
145


Проходим по анкетам собирая данные в json

In [None]:
final_result = {}
counter = 0
# Parse profile urls
for profile_url in profile_urls[:99]: # ограничение LNKD 100
    cur_profile_url, name, works_at, experience_list, posts = get_and_print_profile_info(driver, profile_url)
    all_profile_info = {}

    all_profile_info['name'] = name
    all_profile_info['works_at'] = works_at
    all_profile_info['posts'] = posts
    all_profile_info['cur_profile_url'] = cur_profile_url
    all_profile_info['experience_list'] = experience_list

    final_result[str(profile_url)] = all_profile_info
    counter+=1
    print(counter)
    time.sleep(random.uniform(4,7))

len(final_result)

In [71]:
open("parsed_jsons\result_final.json", "w", encoding='utf-8').write(json.dumps(final_result, indent=4, ensure_ascii=False))

438666

In [72]:
# close the Chrome browser
driver.quit()

Вывод: Данные собраны, дальнейший анализ будет проведен в следующей тетрадке