
Commit a8026c4

Merge pull request avinashkranjan#698 from XZANATOL/LinkedIn_Connections_Scrapper
LinkedIn Connections Scrapper
2 parents 4a6b64e + 8aeba85 commit a8026c4

File tree

2 files changed: +244 -0

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# LinkedIn Connections Scrapper

It's a script built with Selenium and Pandas that scrapes your LinkedIn connections list, optionally including each connection's skills. With a single one-line command you can sit back and have a CSV file prepared for you.

# Installation

Make sure you have the following Python libraries:

> pip3 install selenium pandas

(The script uses the older `find_element_by_*` calls, so a Selenium 3.x release is assumed.) The rest are core Python modules.

Next, place chromedriver.exe in the same directory as the script. You can download it from [here](https://sites.google.com/a/chromium.org/chromedriver/downloads).
(Note: Download the one matching the version of your Chrome browser.)
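
To confirm the driver is wired up correctly, a minimal check along these lines (a sketch assuming Selenium 3.x, matching the `find_element_by_*` API the script uses) should open Chrome, print the page title, and exit:

```python
# Hypothetical sanity check - not part of the repo
from selenium import webdriver

driver = webdriver.Chrome("chromedriver.exe")  # Selenium 3 accepts the driver path positionally
driver.get("https://www.linkedin.com")
print(driver.title)  # A LinkedIn page title here means the driver works
driver.quit()
```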

# Usage

For basic use:

> python scrapper.py -e \<email\> -p \<password\>

For scraping skills as well:

> python scrapper.py -e \<email\> -p \<password\> -s
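
For example, a full skills-mode run with placeholder credentials (hypothetical, not a real account) looks like:

> python scrapper.py -e jane.doe@example.com -p hunter2 -s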

# Further Notes

- Runtime depends on how many connections the account has; in basic mode the scraping loop is roughly O(n^2) in the number of connections (see the sketch below).
- Skills mode takes considerably longer, since every profile is visited and fully loaded before its details are read.
- The script prints progress messages so you know which phase it is in.
- Efficiency is also affected by your internet speed.
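
The main cost driver is LinkedIn's infinite scroll: the script keeps scrolling and waiting until the page height stops changing. The core of that loop in scrapper.py boils down to:

```python
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(time_to_wait)  # give the Ajax call time to finish
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:  # nothing new loaded, so we are done
        break
    last_height = new_height
```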

# Output

Basic use outputs a "scrap.csv" file containing Name, Headline, and Link columns. A Skills column is present but left empty.

Skills scraper mode fills that column with each profile's skills, separated by " -- ".
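
As an illustration, a skills-mode run might produce rows like this (hypothetical data):

```
Name,Headline,Link,Skills
Jane Doe,Data Analyst at Example Corp,https://www.linkedin.com/in/janedoe,Python -- SQL -- Tableau
```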

# Authors

Written by [XZANATOL](https://www.github.com/XZANATOL).

The project was built as a contribution during [GSSOC'21](https://gssoc.girlscript.tech/).
Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
# Linkedin My_Connections Scrapper
# Written by XZANATOL
from selenium.webdriver.common.action_chains import ActionChains
from optparse import OptionParser
from selenium import webdriver
import pandas as pd
import time
import sys
import re

pattern_name = "\\n(.+)\\n"  # Used to extract names (capture group 1)
pattern_headline = "occupation\\n(.+)\\n"  # Used to extract headlines (capture group 1)

# Help menu
usage = """
<Script> [Options]

[Options]
    -h, --help      Show this help message and exit.
    -e, --email     Enter login email
    -p, --password  Enter login password
    -s, --skills    Flag to scrape each profile and look at its skill set

Operation Modes:
    > Basic mode
      This will scrape the whole LinkedIn connections list with their corresponding Name, Headline, and Profile link.
    > Skills scrapper mode (-s/--skills)
      (Time-consuming mode)
      This will do the same job as basic mode, but will also visit each profile and extract its skills.
"""

# Load args
parser = OptionParser(usage=usage)
parser.add_option("-e", "--email", dest="email", help="Enter login email")
parser.add_option("-p", "--password", dest="password", help="Enter login password")
parser.add_option("-s", "--skills", action="store_true", dest="skills", help="Flag to scrape each profile and look at its skill set")


def login(email, password):
    """LinkedIn automated login function"""
    # Get LinkedIn login page
    driver = webdriver.Chrome("chromedriver.exe")
    driver.get("https://www.linkedin.com")
    # Locate Username field and fill it
    session_key = driver.find_element_by_name("session_key")
    session_key.send_keys(email)
    # Locate Password field and fill it
    session_password = driver.find_element_by_name("session_password")
    session_password.send_keys(password)
    # Locate Submit button and click it
    submit = driver.find_element_by_class_name("sign-in-form__submit-button")
    submit.click()
    # Check credentials output
    if driver.title != "LinkedIn":
        print("Provided E-mail/Password is wrong!")
        driver.quit()
        sys.exit()
    # Return session
    return driver


def scrap_basic(driver):
    """Returns 3 lists of Names, Headlines, and Profile Links"""
    driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
    # Bypass the Ajax call by scrolling the page up and down multiple times.
    # Base case: the scroll height is constant after 2 complete scrolls.
    time_to_wait = 3  # Good interval for a 512KB/s download speed - tune it to your internet speed
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

        # This loop bypasses a small bug where scrolling causes the Ajax call to be cancelled
        for _ in range(2):
            time.sleep(time_to_wait)
            driver.execute_script("window.scrollTo(0, 0);")  # Scroll up to top
            time.sleep(time_to_wait)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll down to bottom

        new_height = driver.execute_script("return document.body.scrollHeight")  # Update scroll height
        if new_height == last_height:
            break
        last_height = new_height

    # Extract cards (without links)
    extracted_scrap = driver.find_elements_by_class_name("mn-connection-card__details")
    extracted_scrap = [card.text for card in extracted_scrap]
    # Append data to separate lists; guard against cards with a missing name/headline
    names = []
    headlines = []
    for card in extracted_scrap:
        name_match = re.search(pattern_name, card)
        names.append(name_match[1] if name_match else " ")  # group 1 is the name itself
        headline_match = re.search(pattern_headline, card)
        headlines.append(headline_match[1] if headline_match else " ")

    # Extract profile links (get_attribute may return None, so check before matching)
    links = []
    for anchor in driver.find_elements_by_tag_name("a"):
        link = anchor.get_attribute("href")
        if link and "https://www.linkedin.com/in" in link and link not in links:
            links.append(link)
    # Return outputs
    return driver, names, headlines, links
114+
def scrap_skills(driver, links):
115+
skill_set = []
116+
length = len(links)
117+
for i in range(length):
118+
link = links[i] # Get profile link
119+
driver.get(link)
120+
121+
# Bypassing Ajax Call through scrolling through profile multiple sections
122+
time_to_wait = 3
123+
last_height = driver.execute_script("return document.body.scrollHeight")
124+
while True:
125+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Scroll down to bottom
126+
127+
# This loop is for bypassing a small bug upon scrolling that causes the Ajax call to be cancelled
128+
for i in range(2):
129+
time.sleep(time_to_wait)
130+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);")
131+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
132+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
133+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight*3/4);")
134+
time.sleep(time_to_wait)
135+
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") # Scroll down to bottom
136+
137+
new_height = driver.execute_script("return document.body.scrollHeight") # Update scroll bar height
138+
if new_height == last_height:
139+
break
140+
last_height = new_height
141+
142+
# Locate button
143+
buttons = driver.find_elements_by_tag_name('button')
144+
length = len(buttons)
145+
for button_num in range(length):
146+
i = buttons[button_num].get_attribute("data-control-name")
147+
if i == "skill_details":
148+
button = buttons[button_num]
149+
break
150+
# Scroll then click the button
151+
actions = ActionChains(driver)
152+
actions.move_to_element(button).click().perform()
153+
# Finally extract the skills
154+
skills = driver.find_elements_by_xpath("//*[starts-with(@class,'pv-skill-category-entity__name-text')]")
155+
skill_set_list = []
156+
for skill in skills:
157+
skill_set_list.append(skill.text)
158+
# Append each skill set to its corresponding name
159+
skill_set.append(" -- ".join(skill_set_list)) # Appending all to one string
160+
# Return session & skills
161+
return driver, skill_set
162+
163+


def save_to_csv(names, headlines, links, skills):
    """Write the scraped data to scrap.csv"""
    # If the skills flag was not set, fill the column with blanks
    if skills is None:
        skills = [None] * len(names)
    # Build the dataframe in one shot (row-by-row DataFrame.append was removed in pandas 2.0)
    df = pd.DataFrame({"Name": names, "Headline": headlines, "Link": links, "Skills": skills})
    # Save to CSV
    df.to_csv("scrap.csv", index=False, columns=["Name", "Headline", "Link", "Skills"])


# Entry point
if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Inputs
    email = options.email
    password = options.password
    skills = options.skills

    driver = login(email, password)  # Login phase
    print("Successful login!")
    print("Commencing 'My-Connections' list scrape...")
    driver, names, headlines, links = scrap_basic(driver)  # Basic scrape phase
    print("Finished basic scrape, scraped {} connections".format(len(names)))

    if skills:
        print("Commencing 'Skills' scrape...")
        driver, skill_set = scrap_skills(driver, links)  # Skills scrape phase
        print("Finished skills scrape.")
        print("Saving to CSV file...")
        save_to_csv(names, headlines, links, skill_set)  # Save to CSV
    else:
        save_to_csv(names, headlines, links, None)  # Save to CSV

    print("Scraping session has ended.")
    # End session
    driver.quit()
