In [1]:
import time
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import faker
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from dateutil import parser

In [2]:
# Faker instance (English locale); used below to generate the User-Agent header value.
fake = faker.Faker(locale='en')

In [3]:
# Request headers sent with every HTTP call in this notebook.
headers = {
    # Randomly generated Chrome user-agent string (changes on every kernel run).
    'User-Agent': fake.chrome(),
    'accept-language': 'en-US,en;q=0.9',
    # 'no-cache' is the only directive defined for Pragma; the previous value
    # 'np-cache' was a typo and had no meaning to servers or proxies.
    'pragma': 'no-cache',
    'referer': 'http://www.deepmind.com/careers'
}

In [4]:
# Job-board endpoint for the "deepmind" board; its response is parsed as JSON
# and the 'jobs' key holds the list of postings.
URL = "http://api.greenhouse.io/v1/boards/deepmind/embed/jobs"

In [5]:
# All three section patterns share the same flags:
#   DOTALL     - '.' also matches newlines, so a section body may span several lines
#   MULTILINE  - '$' may match at line ends
#   IGNORECASE - section headings are matched regardless of letter case
SECTION_FLAGS = re.DOTALL | re.MULTILINE | re.IGNORECASE

# In every pattern, capture group 3 is the section body.
# "Snapshot" heading: body is the single line that follows it.
snapshot_pattern = re.compile(
    r"(\nSnapshot:?\n)((.+?)(\n))",
    flags=SECTION_FLAGS,
)
# "Key Responsibilities" heading followed by a blank line: body runs up to the next blank line.
responsibilities_pattern = re.compile(
    r"(\nKey Responsibilities:?\n\n)((.+?)(\n\n))",
    flags=SECTION_FLAGS,
)
# "About You" heading: body is everything after it up to the end of the text.
qualifications_pattern = re.compile(
    r"(\nAbout You:?\n)((.*)($))",
    flags=SECTION_FLAGS,
)

In [6]:
def clear_string(x):
    """Strip tag-like ``<...>`` fragments and normalise spacing in ``x``.

    Steps, in order:
      1. remove every ``<...>`` fragment (non-greedy, within a single line);
      2. insert a space after each newline;
      3. collapse runs of spaces into a single space;
      4. trim leading and trailing whitespace.
    """
    without_tags = re.sub('<.*?>', '', x)
    spaced_lines = without_tags.replace('\n', '\n ')
    collapsed = re.sub(' +', ' ', spaced_lines)
    return collapsed.strip()

In [7]:
# Fetch the board listing. The timeout keeps the cell from hanging indefinitely
# when the server never answers (requests has no default timeout).
result = requests.get(URL, headers=headers, timeout=30)
# Fail right here on an HTTP error status instead of later, while decoding the body.
result.raise_for_status()
result

<Response [200]>

In [8]:
# Take the list of postings from the JSON payload, then display how many there are
# together with one entry (index 1) to inspect the structure of a record.
all_jobs = result.json()['jobs']
len(all_jobs), all_jobs[1]

(49,
 {'absolute_url': 'https://boards.greenhouse.io/deepmind/jobs/4281175',
  'data_compliance': [{'type': 'gdpr',
    'requires_consent': False,
    'retention_period': None}],
  'internal_job_id': 2271915,
  'location': {'name': 'London, UK'},
  'metadata': [{'id': 90484,
    'name': 'Website Grouping',
    'value': 'Operations',
    'value_type': 'single_select'}],
  'id': 4281175,
  'updated_at': '2022-06-20T11:05:42-04:00',
  'requisition_id': None,
  'title': 'Assistant to Two Research Leads'})

In [17]:
# Download every posting listed in `all_jobs`, extract the three text sections
# and collect one dict per posting in `job_dicts`.
REQUEST_TIMEOUT_SECONDS = 30  # abort a request instead of hanging forever on a stalled connection
REQUEST_DELAY_SECONDS = 0.2   # pause between requests (200 ms)


def _extract_section(pattern, text):
    """Return the cleaned body (capture group 3) of `pattern` in `text`, or None when there is no match."""
    match = pattern.search(text)
    return clear_string(match.group(3)) if match else None


job_dicts = []
for job_info in tqdm(all_jobs):
    url = job_info['absolute_url']
    publish_date = parser.parse(job_info['updated_at'])
    job_website = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

    # Throttle right after the request so that skipped postings are rate-limited too.
    time.sleep(REQUEST_DELAY_SECONDS)

    if not job_website.ok:
        print(f'HTTP {job_website.status_code} for {url}. skipping...')
        continue

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one is
    # installed, which emits a warning and can differ between environments.
    soup = BeautifulSoup(job_website.text, 'html.parser')
    contents = soup.find('div', id='content')
    if contents is None:
        # Page has no <div id="content">; there is nothing to extract from it.
        print(f'No content block for {url}. skipping...')
        continue

    page_text = contents.text  # extract the text once and reuse it for all three patterns

    description = _extract_section(snapshot_pattern, page_text)
    responsibilities = _extract_section(responsibilities_pattern, page_text)
    qualifications = _extract_section(qualifications_pattern, page_text)

    if description is None and responsibilities is None and qualifications is None:
        print(f'All None for {url}. skipping...')
        continue

    job_dicts.append({
        'title': job_info['title'],
        'internal_id': job_info['internal_job_id'],
        'url': url,
        'description': description,
        'responsibilities': responsibilities,
        'qualifications': qualifications,
        'company': 'DeepMind',
        'publish_date': publish_date,
    })

 94%|█████████▍| 46/49 [00:49<00:03,  1.08s/it]

All None for https://boards.greenhouse.io/deepmind/jobs/3919000. skipping...


100%|██████████| 49/49 [00:52<00:00,  1.07s/it]


In [18]:
# Build one row per scraped posting, print the row count and preview a few random rows.
snapshot = pd.DataFrame(job_dicts)
print(len(snapshot))
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than the frame holds (e.g. when fewer than 5 postings were scraped).
snapshot.sample(min(5, len(snapshot)))

48


Unnamed: 0,title,internal_id,url,description,responsibilities,qualifications,company,publish_date
27,"Research Engineer, Security & Privacy",2240610,https://boards.greenhouse.io/deepmind/jobs/418...,"As DeepMind continues to grow, we are seeking ...",Devise novel or implement existing methods to ...,To set you up for success as a Research Engine...,DeepMind,2022-06-20 11:05:42-04:00
45,"Technical Program Manager, Applied",1773351,https://boards.greenhouse.io/deepmind/jobs/302...,The role of the Program Management team is to ...,Support the team with all required elements of...,We highly value strong Program Managers who ar...,DeepMind,2022-06-20 11:05:42-04:00
30,"Research Scientist, Cognitive Science, Scalabl...",2239940,https://boards.greenhouse.io/deepmind/jobs/418...,Scalable Alignment Cognitive Science Research ...,Improve our understanding of what it means for...,We do not require candidates to have experienc...,DeepMind,2022-06-23 08:31:32-04:00
33,"Research Scientist, Machine Learning, Scalable...",2239951,https://boards.greenhouse.io/deepmind/jobs/418...,"At DeepMind, we've built a unique culture and ...",Improve our understanding of what it means for...,We look for the following skills and experienc...,DeepMind,2022-06-20 11:05:42-04:00
21,"Program Manager, Operations",2202844,https://boards.greenhouse.io/deepmind/jobs/416...,The role of the Program Management Team is to ...,End to end project planning and delivery acros...,Extensive knowledge and expertise in program m...,DeepMind,2022-06-20 11:05:42-04:00


In [19]:
# Column dtypes and non-null counts: shows how often each section failed to match.
snapshot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype                                 
---  ------            --------------  -----                                 
 0   title             48 non-null     object                                
 1   internal_id       48 non-null     int64                                 
 2   url               48 non-null     object                                
 3   description       43 non-null     object                                
 4   responsibilities  36 non-null     object                                
 5   qualifications    42 non-null     object                                
 6   company           48 non-null     object                                
 7   publish_date      48 non-null     datetime64[ns, tzoffset(None, -14400)]
dtypes: datetime64[ns, tzoffset(None, -14400)](1), int64(1), object(6)
memory usage: 3.1+ KB


In [20]:
# Today's date formatted as day-month-year; used as the output file name.
current_date = datetime.now().strftime('%d-%m-%Y')
current_date

'23-06-2022'

In [21]:
# Write the snapshot to a CSV file named after the current date.
# NOTE(review): the target directory is assumed to exist already - confirm before running.
snapshot.to_csv(f'../data/deepmind/{current_date}.csv')

### Проверка на одной вакансии

In [9]:
# Check that the 'updated_at' string of one posting parses into a timezone-aware datetime.
parser.parse(all_jobs[4]['updated_at'])

datetime.datetime(2022, 6, 20, 11, 5, 42, tzinfo=tzoffset(None, -14400))

In [10]:
# Download a single posting page to experiment with the extraction patterns.
# The timeout keeps the cell from hanging indefinitely (requests has no default timeout).
job_website = requests.get(all_jobs[4]['absolute_url'], headers=headers, timeout=30)
job_website

<Response [200]>

In [11]:
# Parse the page HTML. The parser is named explicitly: otherwise BeautifulSoup
# picks whichever one is installed, which emits a warning and can differ between environments.
soup = BeautifulSoup(job_website.text, 'html.parser')

In [12]:
# Locate the <div id="content"> element; the section patterns are run against its text.
contents = soup.find('div', id='content')

In [13]:
# Show the raw text of the content block to see which headings and newlines the patterns must match.
contents.text

'\nAt DeepMind, we value diversity of experience, knowledge, backgrounds and perspectives and harness these qualities to create extraordinary impact. We are committed to equal employment opportunity regardless of sex, race, religion or belief, ethnic or national origin, disability, age, citizenship, marital, domestic or civil partnership status, sexual orientation, gender identity, pregnancy, maternity or related condition (including breastfeeding) or any other basis as protected by applicable law. If you have a disability or additional need that requires accommodation, please do not hesitate to let us know.\nSnapshot\nDeepMind has a remarkable track record of scientific breakthroughs. Such extraordinary work is a direct result of the brilliant and diverse people we bring together. Our People & Culture team work together to maintain, optimise, and nurture our culture to create the best environment in the world for advancing AI research. A place where everyone’s expertise is recognised 

In [14]:
# Try the Snapshot pattern on this posting; capture group 3 holds the section body.
description = snapshot_pattern.search(contents.text)
description.group(3)

'DeepMind has a remarkable track record of scientific breakthroughs. Such extraordinary work is a direct result of the brilliant and diverse people we bring together. Our People & Culture team work together to maintain, optimise, and nurture our culture to create the best environment in the world for advancing AI research. A place where everyone’s expertise is recognised and where everyone is continually learning and supported to be the best leaders, managers, and collaborators they can be.'

In [15]:
# Try the Key Responsibilities pattern on this posting; capture group 3 holds the section body.
responsibilities = responsibilities_pattern.search(contents.text)
responsibilities.group(3)

'Crafting and delivering people practises which develop and maintain positive working relationships.\nSolving HR and policy issues, anything from handling long term sickness to working on a disciplinary investigation which includes advising on best practice and note-taking responsibilities, to assisting in the management of poor performance.\nSupporting large-scale change programmes, and being a point of contact for topics such as organisational restructures, maternity and paternity, sickness absence.\nHelping ensure our employee handbook, policies and code of conduct are up to date, legally compliant and reflect our culture and ways of working. .\nIdentifying and implementing appropriate reporting mechanisms for employee relations cases, both informal and formal.\nPartnering closely with Learning and People Partners to ensure managers are equipped to support with people challenges in a fair and effective way, and to enable you to build strong relationships across the organisation.\nYo

In [16]:
# Try the About You pattern on this posting; capture group 3 holds everything after the heading.
qualifications = qualifications_pattern.search(contents.text)
qualifications.group(3)

'\nYou are driven by fairness, high quality delivery and standards, and care deeply about people.\nWhen handling people issues, you efficiently navigate legal, cultural and business risks and tensions.\nYou enjoy conflict management and problem solving.\nYou have experience of handling investigation and disciplinary processes.\xa0\nYou enjoy learning about and working within non-UK legal jurisdictions.\nYou have change management experience.\nYou hold a solid understanding of organisational culture. Different work environments pose different challenges for people, so too does the ever-changing nature of our organisation as we grow, scale, and evolve.\nYou proactively ensure your Employment Law expertise is up-to-date.\nTaking the learning opportunities after each case you support, you possess a continuous improvement approach\n\n'