In [29]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

1. Retrieve job posting links

These links are stored in a list. They will then be used by request to get each job posting's information.

In [352]:
#given the job type e.g. data analyst, retrieve the result pages

def get_result_pages(job_type):
    job='+'.join(job_type.split(' '))
    links=[]
    for page_no in range(10): #by default, only 10 result pages are accessed
        URL = 'https://sg.indeed.com/jobs?q='+job+'&l=Singapore&jt=internship&start='+str(page_no*10)
        #conducting a request of the stated URL above:
        page = requests.get(URL)
        #specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
        soup = BeautifulSoup(page.text, 'html.parser')
        links.extend(extract_link_from_result_page(soup))
    return links

In [78]:
# from each result page which contains multiple job postings, the links for each job posting is collected

def extract_link_from_result_page(soup): 
    links = []
    for div in soup.find_all(name='div', attrs={'class':'jobsearch-SerpJobCard'}):
        title=div.find(name='h2', attrs={'class':'title'})
        link=title.find(name='a')['href']
        links.append('https://indeed.com'+link)
    return(links)

#test run the function
extract_link_from_result_page(soup)

['https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0As_4rxRHupPYvzzfn0nGzD2Xhz6UmZ5K8U37POs52SRxlxW_zdF17F--96N-D4vvog47QJFbCQPpELLcPYaqxZ3dC4HCAG2jormKoCZ28aznEmRyybxmeZ9nQFg3gvO7nv4w6_9tHmdVXdbcA0IS-KbHexp5TiyS99FJsWQSHt90tPRBxcbwxEZVUJeb7YDDdnlLWJnj1pjqvEcd1ceiOBZDoaXk7zK3zLvZsrP5bpSwiRJPz9DI-bu5T_qWmD4gesZr_4N-ttX4CbTgBQTd3Q-dc1T6Dj2G30yVX0nKzmVsa8hFNLcQKRwZJuQEsIhtgFXgjRAvZjAS8ht4_6D7wR7uYsv_b2oIHe7t5VNaSVbgceuxY06HcRo-O7RlB0MfbkBnWCNEQDituy_792QZnAiTdSeTnzQ8ZqJmReEVCkTK-ypbqS1T4YMOFRGY7NjYkoN7RMTJc6ESx9Xz0G0y5AnuitPXOtBSmvUOF691i-OHSVcdKJQlB3rlFIlOlzm6M=&p=0&fvj=1&vjs=3',
 'https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A1xek5knTnC5i_gSNNUQ677KkFw824YX3Xvyf6DkeM2R9g6Hf2XGYktF7f42jzdS9vkb9DeHced15MsZd_fWIgVeH4nwnMjmmxCElAeSmF6Hm3YE9CS4WPUrWsupLeejqdFJL7xHb1D7_38L30FX1RdxBXQy0P5wmcOmcwg1hgIhZlLQxzg_XJG3eyKQDbEThNioqPv6HmqiHvrve5GNYu8Q9jLagSpEaVDIdrIrkCbeFeEV6CTBbN6GcP6xziqnrG4VH282R5Bg1E05Fofyw8kh4ZxxAQIeC1RYnIb2Fb1_rT551ak7bw0zNP_y4CcD7qckc95qUbOlwNzo_WwtfooTTwR7hhUhHmO0x8A1wHptVv8

2. Collect information from each job posting's page

In [304]:
#main function called once for each job posting page to collect all the info on that page and insert it into the lists

def get_each_page_info(links, job_type):
    job_title_list=[]
    company_list=[]
    requirements_list=[]
    responsibilities_list=[]
    for link in links:
        time.sleep(1)
#         print(link) for debugging purposes
        page = requests.get(link)
        soup = BeautifulSoup(page.text, 'html.parser')
        company=str(company_from_page(soup))
        company_list.append((job_type,company))
        job_title=str(job_title_from_page(soup))
        job_title_list.append((job_type, job_title,company))
        responsibilities_list.extend(responsibility_from_page(soup,job_type,job_title))
        requirements_list.extend(requirements_from_page(soup,job_type,job_title))
    return (job_title_list,company_list,requirements_list,responsibilities_list)

In [138]:
def job_title_from_page(soup):
    title_container=soup.find(name='div',attrs={'class':'jobsearch-JobInfoHeader-title-container'})
    title=title_container.find(name='h1').string
    return title

In [139]:
def company_from_page(soup):
    company=soup.find(name='div',attrs={'class':'jobsearch-InlineCompanyRating'}).contents[0].string
    return company

In [301]:
def responsibility_from_page(soup,job_type,job_title):
    responsibilities=[]
    return_list=[]
    responsibility=soup.findAll(['p','h4','h3'],string=re.compile('Responsibilities',re.IGNORECASE))
#     print(responsibility)
    if len(responsibility)==0:#some pages dont have responsibilities section, or name the section weirdly
        return []
    for responsibility_possibility in responsibility:
        #searching for a list that is the sibling of the section header
        if not (responsibility_possibility.find_next_sibling()==None or responsibility_possibility.find_next_sibling().name!='ul'):
            tag=responsibility_possibility.find_next_sibling()
            tags=[tag]
            while (True): #getting all the list items
                tag=tag.find_next_sibling()
#                 print(tag)
                if (tag==None or (tag.name!='ul' and tag.name!='li')):
                    break
                tags.append(tag)
            if len(tags)==1:
                tags=tags[0].findAll('li')
            for tag in tags: #removing extra lingering tags like br
                if tag.find('br')!=None:
                    tag.br.extract()
                if tag.name=='ul':
                    list_item=tag.find(name='li')
                else:
                    list_item=tag
                list_item=list_item.contents[0]
                return_list.append((job_type,str(list_item.string.strip('\n')),job_title))
            return return_list
    return []

In [302]:
def requirements_from_page(soup,job_type,job_title):
    requirements=[]
    requirement=soup.findAll(['p','h4','h3'], string=re.compile('Requirement|Qualification|Profile|Competencies',re.IGNORECASE))
    if len(requirement)==0:#some pages dont have requirements section, or name the section weirdly
        return []
#     print(requirement)
    for requirement_possibility in requirement:
        #searching for a list that is the sibling of the section header
        if not (requirement_possibility.find_next_sibling()==None or requirement_possibility.find_next_sibling().name!='ul'):
            tag=requirement_possibility.find_next_sibling()
            tags=[tag]
            while (True): #getting all the list items
                tag=tag.find_next_sibling()
                if (tag==None or (tag.name!='ul' and tag.name!='li')):
                    break
                tags.append(tag)
            if len(tags)==1:
                tags=tags[0].findAll('li')
            for tag in tags: #removing extra lingering tags like br
                if tag.find('br')!=None:
                    tag.br.extract()
                if tag.name=='ul':
                    list_item=tag.find(name='li')
                else:
                    list_item=tag
                list_item=list_item.contents[0]
                requirements.append((job_type,str(list_item.string.strip('\n')),job_title))
            return requirements
    return []

Testing out the functions above!

In [251]:
get_each_page_info(['https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPatO_JBaRnmHwWQqwCMS5aG1Q6T-KWtyWFcxq6igeJLizrHloFP05UqaR0za9t4_15Rw470ECbgSL2RMH7-BH6KwU1OIJudVwi2v6AZpML2V4MAd4DreNy-ATXJ5YcQp0ESzJKsJuWjY9almbK1oy6RYj6oH0CZmVIDBIy2LTuipfYfaFua0AuDxl7G0YEMEF-yFVfm0loJrztH_T2tRr_J65xwVM8Q74jHsXutz8dVl7RStQ21aEmMVKuhUPYrIqMN4j4_o9dIJFQIRFpESWLC4pAV2W6c6AUDzF9Q0pb2Hrahe0v8KarKP5XLih5lETqqdxHetTYjYo1AErUwTf6al62A0wl-uduoGVjSTzAqLFOqmArezfT1qHVMcd3nj7Q6ecwGCSQyq9p9jX4rHlWg4lFon8lLJZ-SlHGQZcUqPOKsmApPH37WwW2QdSOh0N3nfJ0u1NcafoIK_i40vNxDap7u4bC5iFxbcz1RqafGg7KHtWHdI8LRPyO6MjHmUES8vvXyABjxixOu7Dw5mQGis00OGLZYrD-cko53wNSrcOfdVnD5MQvP5ZdZpfDjxwBv8gqg-AcOgFuu5XyUaeDjM1wHcgH8vGeKuBTDdxW1uysqZlbIgqSICTo_kT67IMnknSX5mPW9cJV3Bi1Gg18xLA4GnkyupDr-OibXeLpEvAd2lG6C-CrYXqOhjbM8fsExCnMUKmvbMwylpXP-UWhrFaE4qg-pakcZFIA8c4Gdh41UpKuNSqk9eHLW6ps59x5DhGDBLNgxIzg==&p=6&fvj=0&vjs=3'])

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPatO_JBaRnmHwWQqwCMS5aG1Q6T-KWtyWFcxq6igeJLizrHloFP05UqaR0za9t4_15Rw470ECbgSL2RMH7-BH6KwU1OIJudVwi2v6AZpML2V4MAd4DreNy-ATXJ5YcQp0ESzJKsJuWjY9almbK1oy6RYj6oH0CZmVIDBIy2LTuipfYfaFua0AuDxl7G0YEMEF-yFVfm0loJrztH_T2tRr_J65xwVM8Q74jHsXutz8dVl7RStQ21aEmMVKuhUPYrIqMN4j4_o9dIJFQIRFpESWLC4pAV2W6c6AUDzF9Q0pb2Hrahe0v8KarKP5XLih5lETqqdxHetTYjYo1AErUwTf6al62A0wl-uduoGVjSTzAqLFOqmArezfT1qHVMcd3nj7Q6ecwGCSQyq9p9jX4rHlWg4lFon8lLJZ-SlHGQZcUqPOKsmApPH37WwW2QdSOh0N3nfJ0u1NcafoIK_i40vNxDap7u4bC5iFxbcz1RqafGg7KHtWHdI8LRPyO6MjHmUES8vvXyABjxixOu7Dw5mQGis00OGLZYrD-cko53wNSrcOfdVnD5MQvP5ZdZpfDjxwBv8gqg-AcOgFuu5XyUaeDjM1wHcgH8vGeKuBTDdxW1uysqZlbIgqSICTo_kT67IMnknSX5mPW9cJV3Bi1Gg18xLA4GnkyupDr-OibXeLpEvAd2lG6C-CrYXqOhjbM8fsExCnMUKmvbMwylpXP-UWhrFaE4qg-pakcZFIA8c4Gdh41UpKuNSqk9eHLW6ps59x5DhGDBLNgxIzg==&p=6&fvj=0&vjs=3
[]
[<p>Competencies to Learn
</p>]
[]
['Basic business intellience data domain knowledge'

In [311]:
get_each_page_info(['https://www.indeed.com/viewjob?jk=c595cb66ea1125bf&tk=1eprith8qo2ad800&from=serp&vjs=3&advn=1862197902428497&adid=358790837&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPavaEP2WdfqhRBGlu9c89vYOUeEam1c7jbyYU50bNr3iwrXZUhzhbM7-9PcAL-yzNz3Ow7FbSJxuecSNUKY-soIrq-uCPLuWn8vLwd6Oafc1bVuVeqHBGe8O2teLxKTJdM6MDiLZOuFlqVL-dKVqRAhTFsbBHopMDNdjoa6-iQ_4Vd1Xs4ny5rO1gPXsXz-lnCvJVUEFLklZ2xW0rth_jXDrs7GUSMmY0nOTBNNgX7bdbQ==&sjdu=dtXADpuwviNwHAcP_0C2om8en-HeGVEjgyadgTLktdf5q-Trwsi24vbmog2X0W9QbCVqQk58-CWVx79WvrC_crWQMli0n8aMlMtInuWK23Ouak5bzzQwlCUmIz1QbGQNm0kK_EY-H1DBLlctdPNOmHJTvwWWP4cEpC5nowOdLmpRmvV4_vPQONOpvzPrVfUgNIZWq89Q0_61E1tWXkAJE0TPxzjQLyLrDfPX9M5aU9G7TsW5KcASe9zcQ5iqK4kLzee5nnxHYmu1qo4vYvYxJTpNzA5p455SAFVvhpNmi6MLXsl4O4RGsgFUeN2olKIN'],'data analyst')

https://www.indeed.com/viewjob?jk=c595cb66ea1125bf&tk=1eprith8qo2ad800&from=serp&vjs=3&advn=1862197902428497&adid=358790837&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPavaEP2WdfqhRBGlu9c89vYOUeEam1c7jbyYU50bNr3iwrXZUhzhbM7-9PcAL-yzNz3Ow7FbSJxuecSNUKY-soIrq-uCPLuWn8vLwd6Oafc1bVuVeqHBGe8O2teLxKTJdM6MDiLZOuFlqVL-dKVqRAhTFsbBHopMDNdjoa6-iQ_4Vd1Xs4ny5rO1gPXsXz-lnCvJVUEFLklZ2xW0rth_jXDrs7GUSMmY0nOTBNNgX7bdbQ==&sjdu=dtXADpuwviNwHAcP_0C2om8en-HeGVEjgyadgTLktdf5q-Trwsi24vbmog2X0W9QbCVqQk58-CWVx79WvrC_crWQMli0n8aMlMtInuWK23Ouak5bzzQwlCUmIz1QbGQNm0kK_EY-H1DBLlctdPNOmHJTvwWWP4cEpC5nowOdLmpRmvV4_vPQONOpvzPrVfUgNIZWq89Q0_61E1tWXkAJE0TPxzjQLyLrDfPX9M5aU9G7TsW5KcASe9zcQ5iqK4kLzee5nnxHYmu1qo4vYvYxJTpNzA5p455SAFVvhpNmi6MLXsl4O4RGsgFUeN2olKIN


([('data analyst',
   '#SGUnitedTraineeship Analyst - Data Analyst',
   'STANDARD CHARTERED BANK (SINGAPORE) LIMITED')],
 [('data analyst', 'STANDARD CHARTERED BANK (SINGAPORE) LIMITED')],
 [('data analyst',
   'Data visualization and representation',
   '#SGUnitedTraineeship Analyst - Data Analyst'),
  ('data analyst',
   'Knowledge of wealth management',
   '#SGUnitedTraineeship Analyst - Data Analyst'),
  ('data analyst',
   'Introduction to tableau',
   '#SGUnitedTraineeship Analyst - Data Analyst'),
  ('data analyst',
   'Application of data software skills',
   '#SGUnitedTraineeship Analyst - Data Analyst')],
 [('data analyst',
   'Engagement and management of stakeholders including business, front office, middle/back office, technology, legal and compliance, country and regional management teams.',
   '#SGUnitedTraineeship Analyst - Data Analyst'),
  ('data analyst',
   'Create strong blending data capabilities within team so that we can come up with useful client information fr

In [313]:
links=get_result_pages('data analyst')
job_title,companies,requirements,responsibilities=get_each_page_info(links,'data analyst')


https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-cT3dCZgxlbPQRGEcFchKcZzAYo7kjKUivYWOlNVWwHTy5pTtqsodZTH8V-Y-2zLqunCl4iEqapY8zUe-_5HT7ozqXoPBhXs7K11QJ8709NWU6m7S8fJCsk9R8q2kzLNPT6tqrMOgUvvRbUHCej7Mzf0zh9GDI7KwubJsIMDnByBpHwbJP5L-JsUxjYXaUY01Y49peuPKM1f2xzWysWUMnmy1dmkQ0U6uruNtPke82gyQSp1NQxQ7Y5DxE-5AxorcKqaZfs1S4EAzAjzOv5I6ssJk2HO-i4TtYtEeSJzfJYbWfqtdYq7tNePoh4Ab2Z5EcijpuoXFs4FS5BjOM4CH8KeJWXXta70zH3_fWAnaqsmBzN34-ggLdugTIqXM_VbjlT7HQnzRRZuusQgk4IfcuVNdzeOq2udsL-btvLqpQgiArC893Xm4iyXwNIw7ehfIBCI0PLSdGTQ_ZSbQyN-eb7XJdOdDRG2D6zmwDsR4iPWzbjR3iC3joW3i88UmGcgS6UeJpO8KUnvWOutXTjyQtId3jrg4Q58LLi_7lWhMHwQa9pov1iMF8cJZdvX3RNv4xlAIVNfW-NNpu8IyQkgdX-_N1oAWDGRLhwYq-oOATJ2yuajUZFSnrvSKoVcOGS-BsAor-sDIaMK9-z5gyXYtiodlUE10SsPtkgfnOggrMRIKcFvM25kh3zWmoG57Ir5TCEhd2YT6p5Z1v9ReFqxudR_ilBjdCsw3Q==&p=0&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPatO_JBaRnmHwWQqwCMS5aG1Q6T-KWtyWFcxq6igeJL

3. Inserting data into database

I decided to break the insertion up into multiple functions so it would be easier to debug and understand.
Note: If you try running this on your computer it might not work since the database hasn't been set up. The code for setting up the database is in the repo.

In [5]:
def connection_to_sql():
    import mysql.connector

    mydb = mysql.connector.connect(
      host="localhost",
      user="root",
      password="",
      database="whoomtroom"
    )
    
    return mydb
mydb=connection_to_sql()

In [337]:
def insert_companies(mydb,companies):
    mycursor = mydb.cursor()
    sql = "INSERT IGNORE INTO companies (job_name,company_name) VALUES (%s, %s)" #ignore duplicates
    mycursor.executemany(sql, companies)
    mydb.commit()
    print(mycursor.rowcount, "was inserted.")

In [338]:
def insert_job_titles(mydb,job_title):
    mycursor = mydb.cursor()
    sql = "INSERT IGNORE INTO job_titles (job_name,job_title,company_name) VALUES (%s, %s,%s)" #ignore duplicates
    mycursor.executemany(sql, job_title)
    mydb.commit()
    print(mycursor.rowcount, "was inserted.")

In [339]:
def insert_responsibilities(mydb,responsibilities):
    mycursor = mydb.cursor()
    sql = "INSERT IGNORE INTO responsibilities (job_name,responsibility,job_title) VALUES (%s, %s,%s)" #ignore duplicates
    mycursor.executemany(sql, responsibilities)
    mydb.commit()
    print(mycursor.rowcount, "was inserted.")

In [341]:
def insert_requirements(mydb,requirements):
    mycursor = mydb.cursor()
    sql = "INSERT IGNORE INTO requirements (job_name,requirement,job_title) VALUES (%s, %s,%s)" #ignore duplicates
    mycursor.executemany(sql, requirements)
    mydb.commit()
    print(mycursor.rowcount, "was inserted.")

In [334]:
#used when I was trying out the scraper.
def clear_tables(mydb):
    mycursor = mydb.cursor()
    mycursor.execute("delete from responsibilities")
    mycursor.execute("delete from requirements")
    mycursor.execute("delete from job_titles")
    mycursor.execute("delete from companies")
    mydb.commit()
    print("Rows Deleted = ",mycursor.rowcount)

In [416]:
#used when I was trying out the scraper.
def drop_tables(mydb):
    mycursor = mydb.cursor()
    mycursor.execute("drop table requirements")
    mycursor.execute("drop table responsibilities")
    mydb.commit()
    print("Deleted")

In [348]:
def insert_job_info(mydb,job_title,responsibilities,requirements,companies):
    insert_companies(mydb,companies)
    insert_job_titles(mydb,job_title)
    insert_responsibilities(mydb,responsibilities)
    insert_requirements(mydb,requirements)

In [417]:
# drop_tables(mydb)

Deleted


In [419]:
# clear_tables(mydb)

Rows Deleted =  65


In [420]:
#testing out insertion into database


# links=get_result_pages('data analyst')
# job_title,companies,requirements,responsibilities=get_each_page_info(links,'data analyst')
# insert_job_info(mydb,job_title,responsibilities,requirements,companies)

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-cT3dCZgxlbPQRGEcFchKcZzAYo7kjKUivYWOlNVWwHTy5pTtqsodZTH8V-Y-2zLqunCl4iEqapY8zUe-_5HT7ozqXoPBhXs7K11QJ8709NWU6m7S8fJCsk9R8q2kzLNPT6tqrMOgUvvRbUHCej7MzfgUWSESbKkiTz3CEjpRnU5VmYaS4wSFR22CmSgF7r3YfbrProLqpYp5lDdozOSeW6OWLygE4tuk5Lldk9M9gkO1ky43Ff7XVO6wgVh3eNAyeL7cx0-BDjXh1N_tiZE2IdyVbhYdBzb6kdpb4o4tLoHhbUAgZDF2OYlwB-7Z7mrq6JkUFzwpJ9Qx3QGhyVtIFgTBe3nqSRVY-h4T-x81IkAoNiMjPNYKjqgS0bbtSnEbwX1bF7EEKaLuZIipCTprBLeQ0dRHyQS1durQDLn2mVvq8y4yizsclCd_56ZaZYpasYcnJFBK9N4_5YItuFfq33oYpRz4NpyyBBnFm2MBpfgUJEpNbv1z7zRAlL3p8SNZ5K4ks8mf6DDxFnUCrLQmNJCl9COyW38FnbJnLnC1oPKC1VK6WR_AlaSfoFN_0ZQoWHoGNaW0TYn_b8mSelUN-ogA7wbk6olRkwilnZ_rzvyioT58ovj2SJ5XKUsowJkjM8o969xkb3ce70PYzHMJYmH6ZKa-FWGZGr91Xo1bbOoiy9uMj0d9SUb5-ow9PEbd8XTSCr7E3AeOuiYm4d66xUSIanM2mnHQ4ai7LQbrc96&p=0&fvj=0&vjs=3
https://indeed.com/rc/clk?jk=96a0b8d9e1355c5c&fccid=2ff585682e0825cf&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0

https://indeed.com/rc/clk?jk=e8fe100f1fa3d75e&fccid=126e3afd205caa95&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPatHxWzzouhO42_KdKDAmsXwmrRH8-ggbM9HxRWL_VpWF3RzBS229sXn1RYAvaTjZDS6GnBQquC2JhDnmiRQ3dGAzgJFq6FcCW6ANfojUpY_EitzsteLxHaN1yJ3c9Mzmg7Noo6jEA-UVTx4FxSRUbTPVUliyWzoSLDLVoW9vDTHXD8mZO3JgecqHxbeIK-n5mJhQW4oPyGYdAbrcwJjjIo5Uuhls-3GZ_6IBRXK6R0qhqXpa60QsifGw1lOdsvsV6ivi9IWXHOy-CR-5Ep_HYElKSvGbIe5Hysf1vhAujjBAto1zLBy7eWwuezpf-noHskScrfaVglA9bzP3efQ6KgYH31oDSCoBlqdRXAUvqNz5G4dAHMjc8A1bTd9lF3ofgERigz5BJA65GMwL4Dl8bRiDJG5tEhJ2dzClCEVdFcGhmsz6ZtXGnG7fQ62zhzlZKQIrFlzG2gyluhqVC3_nRN-R9MKLfzlECp5UN_93h0TNSzfsNZLT2y7k-o3ONRIRgaWIoIKPs1TUEBL7Kf5gnqb6VahhTWPhKBCEED7brHgT2I_enHFUfTT64YKlS3bbctePBECcFfzzEx2LzzGahCE9b6WjSfrXszyPwjKq-dTwm1JAyGcn1b4M997gsEz2hgzaGo141wzuNXSkqGom4nu9_Uwco3uX5YNecrHevE_As1Qot46KHsjzcc-Kti8EcgiCInfSO4OsIXCoyVdUwf5jBu0L61zCurDqaL3KONVqW6t5TR3QdBy6dNrzQauSSRwDJgvlaIWjo1MW5hpZwGmqFSkgnAG7jU=&p=0&fvj=0&

https://indeed.com/rc/clk?jk=26cf48b946174ba7&fccid=f947fcaac3b936c9&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-cT3dCZgxlbPxW1N_UnT6KLKxWLeBgvbfw6IDnuFovTJDhXuXJTt7UInmELSwKcox4XyC0NN7r9NLPz7kZ-SULXEkI-piOyjCQUljt3Csx7vOcxybn6nnCtbmHMs1LOk6ngVZ9roZRt4Zk6oail3Owb8gtCH4T6Vd9zBVcQViQA_s5FINHVYUmIuo4qjCqhcZzMs8hnQXs2kd83LXh4-kV_M75MLiXQYMij8Y1sKU_7sJPLe-MSu7jqBMiCDYmAeK-2t6szrSQ3hh9ofX0J2-0zcy2hX5MV_51WJFoD-N5bMg0Y4b9i46jNWo-DZB6hz75FGyJFI_Dp156cIH5N7zRrviyOYWAXfuC_KgJIeykZYSVfAz-bI5ot-uvfF4XnjKjxJrMkq-bmlvMlCPOU7e7H4i159fnK9UC_ws4tp2svm_Wn5YaFVu4wgo89WsZfOjJFq5EMGyTGAx1kvjstNH59ljH0sxQBdcUBlvqtGDlefK8JwDuTw2Or6clGtmpdEhRFRXHRdPIOdW_sLB7a5d4h0SQyJxrPfcakr1g8A0oLMLAoMG4-oIhXE2Dt-HCCXXkiWie52Ek0UOfm7Q3jSocso0fFbgsbgUHNrYqX5zBf3NvV7f-e8vwPHmV7H2AmV_T8gPiMEhn_HEYoUwQBlCa0kBtA475lYWIPtPnH3ry4YUAbE_k0r5z7aN_sWLV3aAp8FvEZOgqz62txZ9IyKtwfCc0H9vBfeZtpkq0iWd8ueQj2hVZLJTXwHE_7LzIfRVfD6JxbubkTK3AJJKBic_g-kSUw1dI-MDNUGBirQBQVn0OsX02Zkpzss-e-NeKEhoqH

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPavJeSvGqKgrOtDQv0SPtFN4WjgV68NOcUoAvEe-x7htYPpfh_zAyNL7GyMCU-eKWKqQR5oKqO_tkdp9yqDB1c1iZCa0LaMRNb5epVd9S-DN8c8JhQDIBr9qkx_MVLs5yjc6Ffgq8m001xw4p4dKYHIZctCVneS272qUmZu3V3MnISbFn8f6iMRYJnTESmtsPywbrFdPyHkHjXZ9PFO0sKW_ny1MxQsbeljRT2DH2Jww3B-I7P3Kf5Qk-zlqNeSP5OzEnqMziKCBoIaMQ-Dj342yMWxLyPtLnn-bOQqSAZr9ZT9O-712lbEUnZ3vXrtRxDa8f8ckCOFVieNkXL4g0vDBlIToF074Be8cawxNZdYa_0mBAu3pnb0vN-XewGgM3NN3RHzkLtWoU2ZygR-OHXd0HOUZ4L9EgraH5tFkjjJRh8LQk2RlbdfYTMCmr8sQ-zeIf0O-itzQQRHexzaAsN9aSNKI7XeMb-YCrQvmiqmwwhTtwunmwuPbraHU06n09DEtTHWd6X-vzO3cCpiqKl-fcAXrZ5PHwQq1_DbzUKixoZyxhiEcS9D_ybDlDnJ1wrGgfnzxLXDNjv2SMf43dJjTtn39QHdi-X6DdLXHvVfijRSYkMRAmRzb8X4LDK6gF23dIbGi-tfqHq3u773Cz-ZBx2qQmlxZZRCl7sWMdSESUSJBwBOmoRzvZcSPIeFkORbZJIKXQZJOwIt8PrKZhsoUAURuXc2TfRrMFXeN2BhnTLaXWd_3VbExtNeNN_D_yUn0FCT8bgqp5i9myZ7Mvl4g&p=0&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPatwKFiSFH5eoPhnJTxrFhQiKN6ujNzPkEVQhacvCoo9TkZp3T6bW7DRzC7mvWL7Ev0vbmddvxdax9s3xGoEnyd3qqexy9oMG4PJw_hY42AQotfNESEMyrSWpjTap81k02qEK9LP5G3aCH3DzuuWWXgWeRB9GKFnki3XyIw_xHBF5Rzm7PCe1DpJBPbpbsW4SVbaUUhdmtDgFe0r67pXr1rtJadELdlhx-IKSooS4Cha0LnRkOy8S_CWm_pRCyLlsJUtGKm8J7IZttCEath5ads4LJO01oK57LQWL9m2dpn0u9NK2_CuJgis198XMNzlYlNN4m5nUCoDvQ8gauzDyrwByAe45FeDUVlxu_bILtfz009rjnfOwCsNxPIpjKKtWQ9z0k8t0Nt6MkZUvVtKP69taMXg0FsYbp5YvcadgHu6z3VWP4vbXSeDjaGJF2HxhWVU15Tr3q1QXL-XLHzXGPvEx3M-SYJplawo2KJT2XvcOAFSs6SiFRr-TXPIDQ9221-kE4vSPys7qrzGltn54rDtBbkm52VCo14H6cYV5-tk4VNVh2e4Z5lOGaFWWw3PUGvcrdT6EX6Lyk8S_3Bx5NL2Xh9EPd2eFfV_wVApjqyzK7g1OxgtIuxVXVozlWguXq1DpaU6krXLaAlakSq8UgbsAXcXnMIiGQQKwciP-t2_yuxbgkS6MM2rekJKUIzstCCTnZaxOgd3dsqPJwEGtrVc9qo0d0BdavEOVHALAYwy7thKo-PSFZZ79gpuAyHB1FODIdLeDTsWxSqmBjoTAdReaz4WIkdlL8BPhHALiTty-yr6IGuWnNbqKs-LGRxAPCY=&p=4&fvj=0&vjs=3
https://indeed.com/rc/clk?jk=5db8a3a3

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPauxoNt29LiPsSHb4WHxszt0GLkwYsYrt98fQ9JmTxZ9KIw28ZjVI01scz2TaNQ2Q9TRs85OWpB00e2OCZwydiZyFN3ue6V0uj7Vkxa8hhLzwldvrW3NhILvscHI5swEILE_Ew1tHQmly9vqBCL4QFPlteGjJioYSoZDQ2GL60ZPUsK5nswCbdB041cKHjV1iL_luDwiRvVJtfJyrtKlRooy6_qyZFLGCu9uQCL9d-l3zIjinR8-5S8hqEXvhKDmtXV2vE512QNu-UXCW20TQcRRvmgJudehq3lYF66cLKAekdPjqnFLLpQxJjNuPDJUrCNs5YK_M1crEZNAqHrt19m2KjINTjbGJDDL7aR-hpOIK70I1gH_onuVJ-eQoGyZ2ZrwlOC4gvHB7wRKIbgZ_U1i3g43IXIbCZOKTVTuQ9ZdqW9WYKMGd5OpSmHB0ZeXcz4HRlPKncCzujqBie_HIdhykSgsmPiYVcUM8oW3iU-P8_TNhrXcai5WSioBnelIiPtZsuZIbcyepoFHlYQGXetEZccgc611wvblPrJLC7Wl7klr2bsprKylOvpPeA4BA2jqD9WvUaBqeBWeVGoX8Vj6EMNBulkRmWRiLhV9WfkPqd5BqAlwtWlQ2eP-nVSQAChQsc3VeQewJzXOvRfwkirNynW0pM03rrXRvEAm4PH0LUdMyVZYgT4FqAr9NqNCiV_9ecCWynyrYJsfKshzKVouGSj5a6uZv84fBpthGuQkYkFDYexa5oLwtgnLJ3GNbIdowgKXEVdNM-uudfaJVNVD&p=0&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPauxoNt29LiPsSHb4WHxszt0GLkwYsYrt98fQ9JmTxZ9KIw28ZjVI01scz2TaNQ2Q9TRs85OWpB00e2OCZwydiZyFN3ue6V0uj7Vkxa8hhLzwldvrW3NhILvscHI5swEILGj6Yu1IhuELeJ9MQ9z4jjUwvRf4rDVsiO-Sf7Rjy_bZ3pFJdTQbAkr-2inNxubvhzBHVbYrRxsa7xG-t9hzm31G6QYc3a0hWnsTAJHLgl4Hhw3R5zm7S9MLTN9OlXzvDPsurXyMk517UuJInh6quSXajc9ZDIUNAoB69aL6X3-FRTeotssVx66EtHa769897EgM8tb6ALqPV1P6vFP65BfVD7HBdvkANYYWjklJpjEWbE8KhHIKOxOowangIPCk1dQyMeDUxHDw5rRYxNvhcX2aw6PPxqrCbZz9V3YgvIsaM4UJPXl0lfgpsJEArMPYibMlzGuuSEPBWTBwyesFh6n0mV6zZWw_WezTZY38_MIZxf3VlsIZRXCxhv4mP5-AI_vu1uLs7cPDIVtP55H4MDn3f0TwMlKkrFlaD2J72jYrvuqAmJDv18Gb_KqGq4UyDfj9HW8GLRXtKpYpgWbZoRCSteL1d4ZopCkRWvokDAZf20j3sDqpxwlynQPKMe8DP8nhXgZJ6S3Hmzgbqrz1Suj4HodWDQqmTg0HmiG1mhNSi4eQghmlQJgkswvKWldp6SD4gdVXNdk2x6bdIXZHG41Namhmct23qoWu9vm0MyT5IpB7AR61aG3in3EDDB570P3mglLymQgheT-exgOJEGo&p=13&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNe

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPavaEP2WdfqhRC1ayr5tXJYe1Th9x9ywzdqpms98qZKUlLsRWngVrw9_xoSrMk3y7uA50Hf6t7bHvy27vlVe6j5zqwVf6ssjBIxZCsV8C9GkxNweyYJkOxiTLTzwd2--aJRtP37ul8smFUz20eRruUr_bX4kufoXl35FQfKrWVZRV2NTpUS0x7cULqDxRIMWBUKh8GJMwtV0e3ZoSfHkT0lJijlKdKg9Q5IQ0eyh0wdsvNRE0wxhER7FO0dDqukLgIdHR1N7Bp5pTJaLXveqs2iDFHB8act7piTN5SkHmYUQccgKXkyLp26KSQ3u3zPcahDJrhY7o1MTRQDvts2h90eg9HJiwZHQ8rlRZHF2lXk1QjR82JrMq4rWTXuPlZA9rT9OOzS0QNAtjl0Mf16IeAKuobLNvVYiclxryzbH4ZM1FkIMHAD4AnbT_uHCT3QQQ0uZEkZ0Ukqh_3h_7hZNBpUGCTv4geJICgXRpqLPeYyijKeT4ml_BcKIodkJ-k5rO4gHi2DPVrcvnDRgWC1AHhZrXx6h1bEJ4S_2EgleVmCGLpKoHGYwj_OWmOui4jR33NtIc2p6kBa8fDnEM2HwUxAIhMMEJMAzQ--H95UO4uCtQxpYxbaPAW9Zyq9V_oolNYgEWfsXCh6UmEm7qGeFp_6dM_qIogMl3ufmgT6jOe1s7IYkQS_e4hB3BuFaKsx1JmHcIKXSyPaf2CQvijzBHhz6yNI5QwrHZmkZsxTeMFQpGkl211s07Pil4qOHTLaqsNlHd8-yixtu--8aEsoL2pPXZRjxfYfkgjQOG9Jw2YbELyVCCSNhKyeEir4TExstEyU=&p=9&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPasLFG1Nqt-EhG1_QVjto4yH_nzNKd2R_6XR0mchXRv6-LEqv5FtY_5eUfj7HJgiO8x5cfzvyUt828MT3QyNRIJuJAI4wiZNGd1LfcSHkH6PBfZ4W2uCLMTCC-JAD8N3MolVBJKWa1p3eME2d5UR_iL2NXQcAZWUSw4-3nKK9sS4Q459N6iWoGSF5ooEE36Joq0Daf0n6xaCYEDfzIvsH-SMjp6zQkJ0OdwXhYoIIF_VZFA5cAnZ0-FcR7dnfeNESq744WKVdbRg5hmi3t3hmJ2KaM73kpkaiI6omJhToGJ56lTKx_xL6dXQFxCEzYepEQTGV5Kpx0187Gg3a4nFjdt3OxwgsLWQm9fdHtnwe2unLztd6wYWZ3ZTchjUHqefnXP_ucmIJ20xUlAzSYMiIRVYeKeGRbmGPH7HHaArLnYj8xAIWJQAdFrqg0wVw5myIw1pPyOimMz-cAlriGsZV3bGe6IY9pRZ1PqxWREml8EfI4Na_l-VwadEFrNoQ-SjH0GbsP_yDPhnBA5fHhjTOf0nxqHn7xLCig3G24AZEjwFEWj1mKcFQRndvwqKL4yWjd7ZXQOAzQtV9Mql9u4ZG30s0pwW8I-gMEDOeAhaUxvEgeHcz6ZuIOXNI0-Z2TjvGE4fQqrTJktgQ23P2s_tiMUJFY5hU4Uh8V-folur3v9V-YVMszif8_gSpARuDLZ-L_ID7vlUB7CZ7jgGgKYvUP7yWQd4rnKt9NiIQlLYuifuWqvbW_tdQMfn960jBCd6FrXw023SuqfX_crG9tF-34sGMqTtRYA9DRU=&p=3&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-

https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06JaTHFjvTB_5q6-0gBNCyrzTNez_CNw5GfFr-YeB3E5TQjGgRpROSxApPavaEP2WdfqhRGyswZYpGgAhwLiD1Xz508J_R1TE07FXb5gD-8Q4e8C2tvaNG7mumIEi3Gw-hBup4JAMIBtPye0r8eUdhb9Wn_Jf3ttDGKFgTUxXZ5HUcbb8tt930596YMmEuSiZpqPcIRELZWqBxdF0HBAxDR2wmqeuQDKe13U3R5eE0-J8xqJx9UYC989k84ENmoM6Jij7wPDb-7iy7s0U8IFcrbcYdegmVymjBG1F0KYsJZqxQXkHaEJtlXgjKVs0iF_OUPfao_18ASr338HO0yxW-Y6nk_0wLqtrXCi6dPiKNs_nQoc8EwtaH7qVK4dznA1X99gcLNNMl833AY2rP4JFmhFfW1dfBP57mdVAGlxWDNv5VN15fr156uYZthinVHX2CvlB8gwbZpjobyfJ7o8egVOOVGtJ7Fwf1ERotd3w8-GQ69-T77E1RUHZTTxS9tKE2v0ANJZQ8sEKyIV1L7ojM0OI80UYTJW3A0ST6YeyB0z-RSl8KV21oDkT6kdt64EgZhL2zXbLVoUEQMBrLVi-fBsE1EuPmL_SbYTm2cfYBsc5qDZK36ZOxuQILK6OZ8OGH7YNyDBDMbcjkZdDMp4kD_eRAvRJDSAQUH0D9fS7AdGP-ZdQqkJTNXXSfCwbLTwmrpSYiRM-8y0Vv5uvHR0VklT05_y9M61nc8Pq2pT4CjAtFr1yRoZ9UwVcq2degsPJrUBkclyileXXvKXP6T1PZS-w-AYxM4S9Af6vzNvbOeu1ukC5-Zw1gbXpihs_H4iQ99C6Dt8VeM2PYjwrtvVq9fhARavdIYHVuT1u1g==&p=12&fvj=0&vjs=3
https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AuWpd06J

4. Retrieving data and creating tokens out of them

In this example, I am retrieving and tokenising requirements data.

In [6]:
#retrieve requirements from database for processing, joined together as sentences
def retrieve_requirements(mydb,job_title):
    mycursor = mydb.cursor()
    sql = ("select requirement from requirements where job_name=%s")
    mycursor.execute(sql, (job_title,))
    requirements = mycursor.fetchall()
    req_new=[requirement[0] for requirement in requirements]
    req_joined='.'.join(req_new)
    return req_joined

In [7]:
#test
retrieve_requirements(mydb,'data analyst')

'Ability to communicate effectively both verbally and in writing.Ability to conceptualize, write and present market views to management.Ability to handle and navigate uncertain environments because much of what Advisory Solutions team deals with are projects and initiatives that the market has not seen before.Ability to manage multiple projects at the same time in a fast-paced environment..Ability to multi-task and work well under pressure with commitment to deliver under tight deadlines and drive initiatives to completion effectively..Ability to multitask and work in a fast-paced environment under tight deadlines..Ability to quickly analyse / reconcile large amounts of data within determined parameters.Ability to screen cases.Ability to think objectively and \'think outside the box\' when analyzing issues.Ability to think objectively and \'think outside the box\' when analyzing issues.Ability to work alone as well as in a team.Ability to work independently and a good team player.Able 

In [423]:
#retrieve job types from database for processing
def retrieve_job_types(mydb,job_title):
    mycursor = mydb.cursor()
    sql = ("select job_title from job_titles where job_name=%s")
    mycursor.execute(sql,(job_title,))
    job_titles = mycursor.fetchall()
    req_new=[job_title[0] for job_title in job_titles]
    req_joined='.'.join(req_new)
    return req_joined

In [8]:
#retrieving requirements for data analyst from database
text = retrieve_requirements(mydb,'data analyst')
#creating tokens from the requirements
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize

filtered_sentence=remove_stopwords(text)
tokens=word_tokenize(filtered_sentence)
words=[token.lower() for token in tokens if token.isalpha()]



In [9]:
print(words)

['ability', 'communicate', 'effectively', 'verbally', 'conceptualize', 'write', 'present', 'market', 'views', 'handle', 'navigate', 'uncertain', 'environments', 'advisory', 'solutions', 'team', 'deals', 'projects', 'initiatives', 'market', 'seen', 'manage', 'multiple', 'projects', 'time', 'environment', 'ability', 'work', 'pressure', 'commitment', 'deliver', 'tight', 'deadlines', 'drive', 'initiatives', 'completion', 'effectively', 'ability', 'multitask', 'work', 'environment', 'tight', 'deadlines', 'ability', 'quickly', 'analyse', 'reconcile', 'large', 'amounts', 'data', 'determined', 'screen', 'think', 'objectively', 'outside', 'box', 'analyzing', 'think', 'objectively', 'outside', 'box', 'analyzing', 'work', 'work', 'independently', 'good', 'team', 'autonomously', 'effectively', 'fast', 'paced', 'evolving', 'create', 'visualizations', 'data', 'business', 'intelligence', 'analytics', 'tools', 'able', 'create', 'visualizations', 'data', 'business', 'intelligence', 'analytics', 'tools'