linkedin.py (forked from swiftyloves/linkedin)
# -*- coding: utf-8 -*-
"""
Simple Linkedin crawler to interate through a list of urls, pausing to allow a break so that I can browse the page to ensure it the correct one. The file copies the experience as proof that the person is the correct one (as the current job may not be the identifying company). It also has a method to find unlinked founders so I can pass the link back to techireland so that they can link to the linkedin profiles on their website. Finally, a basic google search is iterated to do a quick desktop search for missing founders. Basically, all this does is saves me from using copy-paste into the url so often. Limited automation of pasting links into the browser so I can focus on reading the pages.
.
To use this you need linkedin account, all search is done through your account
Requirements:
python-selenium
python-click
python-keyring
Tested on Python 3 not sure how Python 2 behaves
"""
import csv
import time
import click
import getpass
import keyring
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import (WebDriverException, NoSuchElementException)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
LINKEDIN_URL = 'https://www.linkedin.com'
INITIAL_PAGE_NUMBER = 1
MAX_PAGE_NUMBER = 1 # I turned this off - if I can't find the right person in 1 page, they're too difficult to find
class UnknownUserException(Exception):
pass
class UnknownBrowserException(Exception):
pass
class WebBus:
"""
context manager to handle webdriver part
"""
def __init__(self, browser):
self.browser = browser
self.driver = None
def __enter__(self):
# XXX: This is not so elegant
# should be written in better way
if self.browser.lower() == 'firefox':
self.driver = webdriver.Firefox()
elif self.browser.lower() == 'chrome':
self.driver = webdriver.Chrome()
elif self.browser.lower() == 'phantomjs':
self.driver = webdriver.PhantomJS()
else:
raise UnknownBrowserException("Unknown Browser")
return self
    def __exit__(self, _type, value, traceback):
        # always shut the browser down, whatever happened inside the block
        if self.driver is not None:
            self.driver.quit()
        if _type is OSError or _type is WebDriverException:
            click.echo("Please make sure you have this browser")
            return False
        if _type is UnknownBrowserException:
            click.echo("Please use either Firefox, PhantomJS or Chrome")
            return False
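# A minimal usage sketch for WebBus (assumes Firefox is installed locally):
#
#     with WebBus('firefox') as bus:
#         bus.driver.get(LINKEDIN_URL)
#         print(bus.driver.title)
#
# Any exception raised inside the block reaches __exit__, which quits the
# driver before deciding whether to suppress the error.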
def get_password(username):
"""
get password from stored keychain service
"""
password = keyring.get_password('linkedinpy', username)
if not password:
raise UnknownUserException("""You need to store password for this user
first.""")
return password
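# get_password pairs with the `store` command at the bottom of this file; a
# hedged sketch of seeding the keychain directly from Python (the service name
# 'linkedinpy' matches the one used above, the address is a placeholder):
#
#     import keyring
#     keyring.set_password('linkedinpy', 'username@example.com', 's3cret')
#     assert keyring.get_password('linkedinpy', 'username@example.com') == 's3cret'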
def login_into_linkedin(driver, username):
    """
    Log in to LinkedIn if we are not already logged in
    """
    password = get_password(username)
    # If the login page is shown these fields exist; if we are already logged
    # in, find_element raises NoSuchElementException and there is nothing to do
    try:
        userfield = driver.find_element_by_id('login-email')
        passfield = driver.find_element_by_id('login-password')
        submit_form = driver.find_element_by_class_name('login-form')
    except NoSuchElementException:
        return
    userfield.send_keys(username)
    passfield.send_keys(password)
    submit_form.submit()
    click.echo("Logging in")
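# WebDriverWait and expected_conditions are imported above but never used; a
# hedged sketch of waiting for the login field explicitly instead of assuming
# the page has finished loading (the 'login-email' id matches the selector
# used in login_into_linkedin):
#
#     WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, 'login-email')))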
def login_in_the_middle(driver, username):
    """
    Log in again when LinkedIn interrupts the crawl with a login page
    """
    password = get_password(username)
    # same pattern as login_into_linkedin: if the form is absent we are logged in
    try:
        userfield = driver.find_element_by_css_selector('.form-email input')
        passfield = driver.find_element_by_class_name('password')
        submit_form = driver.find_element_by_id('login')
    except NoSuchElementException:
        return
    userfield.send_keys(username)
    passfield.send_keys(password)
    submit_form.submit()
    click.echo("Logging in")
def collect_names(filepath):
"""
collect names from the file given
"""
    names = []
    with open(filepath, 'r') as _file:
        # rstrip('\n') rather than line[:-1]: a final line with no newline keeps its last character
        names = [line.rstrip('\n') for line in _file]
    return names
def collect_urls(filepath):
"""
collect urls from the file given
"""
    items = []
    with open(filepath, 'r') as _file:
        items = [line.rstrip('\n') for line in _file]
    return items
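# Both readers expect one entry per line. Illustrative inputs (the name and
# URL below are made up):
#
#     names file:   Jane Doe Acme Ltd
#     urls file:    https://www.linkedin.com/in/janedoe/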
@click.group()
def cli():
"""
First store password
$ python linkedin store username@example.com
Password: **
Then crawl linkedin for users
$ python linkedin crawl username@example.com with_names output.csv --browser=firefox
"""
pass
@click.command()
@click.option('--browser', default='phantomjs', help='Browser to run with')
@click.argument('username')
@click.argument('infile')  # each line holds a name plus company string; there should be only one or two candidate profiles that TechIreland needs to display
@click.argument('outfile')
def crawl(browser, username, infile, outfile):
"""
Run this crawler with specified username
"""
# first check and read the input file
all_names = collect_names(infile)
fieldnames = ['Search name', 'Name', 'URL']
# then check we can write the output file
# we don't want to complete process and show error about not
# able to write outputs
with open(outfile, 'w', newline='') as csvfile:
# just write headers now
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
# now open the browser
with WebBus(browser) as bus:
bus.driver.get(LINKEDIN_URL)
login_into_linkedin(bus.driver, username)
time.sleep(random.uniform(30, 60))
for name in all_names:
links = []
nametexts = []
try:
search_input = bus.driver.find_element_by_css_selector('.ember-view input')
print('Found search box')
time.sleep(random.uniform(2, 5))
except NoSuchElementException:
print('NoSuchElementException search_input')
continue
search_input.clear()
search_input.send_keys(name)
print('Input name: ', name)
time.sleep(random.uniform(2, 5))
try:
bus.driver.find_element_by_css_selector('.search-typeahead-v2__button').click()
print('Clicked search')
time.sleep(random.uniform(5, 10))
            except NoSuchElementException:
                print('Clicking the search button failed')
            profiles = []
            # collect the profile links - later I'll iterate through the experience to decide which is the right one
            print('Current URL: ', bus.driver.current_url)
try:
links = bus.driver.find_elements_by_css_selector(".search-result__info .search-result__result-link")
            except NoSuchElementException:
                # find_elements normally returns [] instead of raising, but keep the guard
                print('Links lookup failed')
links = [link.get_attribute('href') for link in links]
print('Links:', links)
            if links:
                try:
                    nametexts = bus.driver.find_elements_by_css_selector("span.name.actor-name")
                    nametexts = [nametext.text for nametext in nametexts]
                except NoSuchElementException:
                    print('Name texts lookup failed')
                # pad the name list so every link has a matching entry
                while len(links) > len(nametexts):
                    nametexts.append("No name found")
                    print('Appended placeholder name')
                with open(outfile, 'a+', newline='') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    for link, nametext in zip(links, nametexts):
                        # one row per search result
                        print('Link: ', link)
                        print('Name text: ', nametext)
                        data = {'Search name': name.encode('ascii', 'ignore').decode('utf-8'),
                                'Name': nametext.encode('ascii', 'ignore').decode('utf-8'),
                                'URL': link}
                        print(data)
                        profiles.append(data)
                    writer.writerows(profiles)
                click.echo("Checked: " + name)
else:
print("Not found: " + name)
time.sleep(random.uniform(2, 5))
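# Example invocation of `crawl` (names.txt and output.csv are placeholder
# filenames):
#
#     $ python linkedin.py crawl username@example.com names.txt output.csv --browser=firefox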
@click.command()
@click.option('--browser', default='phantomjs', help='Browser to run with')
@click.argument('username')
@click.argument('infile')
@click.argument('inrandomfile')
@click.argument('outfile')
def crawlexperience(browser, username, infile, inrandomfile, outfile):
"""
Run this crawler with specified username
"""
# first check and read the input file
links = collect_urls(infile) #get urls from file - could make a single smarter file reader proc
## randomsites = collect_urls(inrandomfile) #get urls from file - used to make sure I'm not getting mixed up between profiles
fieldnames = ['Number', 'Link', 'Resolved URL', 'Name', 'Title','Company', 'Date Range', 'Location']
# then check we can write the output file
# we don't want to complete process and show error about not
# able to write outputs
with open(outfile, 'w', newline='') as csvfile:
# just write headers now
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
# now open the browser
with WebBus(browser) as bus:
        # leftover hooks for reattaching to an existing browser session by hand:
        ## driver = webdriver.Remote(command_executor="http://127.0.0.1:59456", desired_capabilities={})
        ## driver.session_id = '6efbea68-f5ec-4b7f-8470-0c966e96f545'
        ## bus.driver = driver
        print(bus.driver.command_executor._url)
        print(bus.driver.session_id)
bus.driver.get(LINKEDIN_URL)
login_into_linkedin(bus.driver, username)
print('Logged in')
        time.sleep(60)  # pause to allow a manual login if the automatic one fails
# print('Survived sleep')
with open(outfile, 'a+', newline='') as csvfile:
# print('Starting writer')
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            time.sleep(random.uniform(5, 15))  # short pause before starting the crawl
            for i, link in enumerate(links, start=1):
## randomsite = randomsites[int(random.uniform(0, 500))]
## print('Random site: ', str(randomsite))
## bus.driver.get(str(randomsite))
                time.sleep(random.uniform(30, 60))  # break to manually browse the current page, including any login page
profiles = []
# every search result
print('Link ', str(i) + ': '+ link)
bus.driver.get(link)
# login_in_the_middle(bus.driver, username)
experienceBlock = None
experiences = None
try:
bus.driver.find_element_by_class_name('profile-unavailable')
print('Profile is unavailable')
## randomsite = randomsites[int(random.uniform(0, 500))]
## print('Random site: ', str(randomsite))
## bus.driver.get(str(randomsite))
                    time.sleep(random.uniform(15, 30))  # break to manually log in before retrying
bus.driver.get(link)
except NoSuchElementException:
print('Profile found')
# scroll down to open experience so I can look for the techireland company
last_height = bus.driver.execute_script("return document.body.scrollHeight")
                    time.sleep(random.uniform(30, 60))  # break to browse the page manually
# print('Scrolling to bottom')
                    while True:
                        # jittered up-and-down scrolling: triggers lazy loading while looking less robotic
                        bus.driver.execute_script("window.scrollBy(0, +"+str(int(random.uniform(1000,1500)))+");")
# Wait to load page
time.sleep(random.uniform(1, 2))
bus.driver.execute_script("window.scrollBy(0, -"+str(int(random.uniform(1000,1500)))+");")
# Wait to load page
time.sleep(random.uniform(1, 2))
bus.driver.execute_script("window.scrollBy(0, +"+str(int(random.uniform(200,400)))+");")
# Wait to load page
time.sleep(random.uniform(1, 2))
bus.driver.execute_script("window.scrollBy(0, +"+str(int(random.uniform(200,400)))+");")
# Wait to load page
time.sleep(random.uniform(1, 2))
bus.driver.execute_script("window.scrollBy(0, +"+str(int(random.uniform(200,400)))+");")
# Calculate new scroll height and compare with last scroll height
new_height = bus.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
                    # get the experience block as proof - too complicated to pick out just the right company, as its name may vary slightly (e.g. include Limited, Ltd., Group etc.)
try:
experienceBlock = bus.driver.find_element_by_class_name('experience-section')
# print('Experience Block: ', experienceBlock)
except NoSuchElementException:
click.echo("No experience section skipping this user")
continue
name = bus.driver.find_element_by_class_name('pv-top-card-section__name')
# print('Name: ', name.text)
#scroll back to inline expansion and click on it
while True:
try:
element = experienceBlock.find_element_by_class_name('pv-profile-section__see-more-inline')
# print('Found an inline expansion: scrolling back to click on it')
bus.driver.execute_script("return arguments[0].scrollIntoView();", element)
# print('Scrolled to element')
bus.driver.execute_script("window.scrollBy(0, -150);")
# print('About to click')
bus.driver.find_element_by_class_name('pv-profile-section__toggle-detail-icon').click()
# Wait to load page
time.sleep(random.uniform(1, 2))
except NoSuchElementException:
click.echo("No expand element found")
# continue
try:
moreInline = experienceBlock.find_element_by_class_name('pv-profile-section__see-more-inline')
except NoSuchElementException:
break
experiences = experienceBlock.find_elements_by_class_name('pv-entity__summary-info')
# print('Experiences:',experiences)
for experience in experiences:
# print('Parsing titles')
                        title = experience.find_element_by_tag_name('h3')
                        # print(title.text)
                        company = experience.find_element_by_class_name('pv-entity__secondary-title')
# print(company.text)
try:
                            dateRange = experience.find_element_by_class_name('pv-entity__date-range')
dateData = dateRange.text.splitlines()[1]
except NoSuchElementException:
## click.echo("No date range data")
dateData = 'None'
# continue
try:
                            location = experience.find_element_by_class_name('pv-entity__location')
locationData = location.text.splitlines()[1]
except NoSuchElementException:
## click.echo("No location data")
locationData = 'None'
# continue
# print('Allocated title data')
# print('Title text: ',title)
# print('Title text: ',title.text)
# print('Creating data entry')
data = {'Number': i, 'Link': link, 'Resolved URL': bus.driver.current_url,
'Name': name.text.encode('ascii', 'ignore').decode('utf-8'),
'Title': title.text.encode('ascii', 'ignore').decode('utf-8'),
'Company': company.text.encode('ascii', 'ignore').decode('utf-8'),
'Date Range': dateData.encode('ascii', 'ignore').decode('utf-8'),
'Location': locationData.encode('ascii', 'ignore').decode('utf-8')}
# print('Data: ',data)
profiles.append(data)
# print(profiles)
writer.writerows(profiles)
click.echo("Checked: " + link)
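# Example invocation of `crawlexperience` (placeholder filenames; the
# inrandomfile argument is only read when the random-site lines above are
# uncommented):
#
#     $ python linkedin.py crawlexperience username@example.com urls.txt random_urls.txt experience.csv --browser=firefox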
@click.command()
@click.option('--browser', default='phantomjs', help='Browser to run with')
@click.argument('infile')
def crawlgoogle(browser, infile):
"""
Run this crawler with specified username
"""
# first check and read the input file
searches = collect_urls(infile) #get urls from file - could make a single smarter file reader proc
# now open the browser
with WebBus(browser) as bus:
        time.sleep(random.uniform(5, 15))  # short pause before starting
        for i, search in enumerate(searches, start=1):
            bus.driver.get("https://www.google.com")
            click.echo("Searching " + str(i) + ": " + search)
            # 'lst-ib' was the id of Google's search box when this was written; it may need updating
            bus.driver.find_element_by_id("lst-ib").send_keys('"' + search + '"')
            time.sleep(random.uniform(3, 5))
            bus.driver.find_element_by_id("lst-ib").send_keys(Keys.RETURN)
            time.sleep(45)  # long pause to read the results manually
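# Example invocation of `crawlgoogle` (searches.txt is a placeholder; each line
# is wrapped in quotes and searched as an exact phrase):
#
#     $ python linkedin.py crawlgoogle searches.txt --browser=firefox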
@click.command()
@click.argument('username')
def store(username):
"""
Store given password for this username to keystore
"""
passwd = getpass.getpass()
keyring.set_password('linkedinpy', username, passwd)
click.echo("Password updated successfully")
cli.add_command(crawl)
cli.add_command(crawlexperience)
cli.add_command(crawlgoogle)
cli.add_command(store)
if __name__ == '__main__':
cli()