## Scraping Job Posting Data from Indeed 
Source:
https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac

strategy:

* Get the job posting links, filtered by job title, so that no scraping time is spent on irrelevant jobs
* Click each link and parse text from the job posting page 
* get rid of duplicates 
* Store the parsed text data



### 1. Get all the job posting links

In [8]:
# all import statements

from bs4 import BeautifulSoup
import pandas as pd

from selenium import webdriver
import re
from collections import defaultdict

# to write from list to csv file
import csv
import string    # need string to clean title

from fuzzywuzzy import fuzz
from fuzzywuzzy import process


In [2]:
 """
    Given the url of a page, this function returns the soup object.
    
    Parameters:
        url: the link to get soup object for
    
    Returns:
        soup: soup object
    """

def get_soup(url):
    driver = webdriver.Chrome("C:/Users/lili/Documents/icode/scraping/chromedriver")
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    driver.close()
    return soup



In [3]:
   """
    Grab all non-sponsored job posting links from a Indeed search result page using the given soup object
    filter for postings with no keywords in title
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
        key_words: the specific job title we are filtering for 
                    e.g. "data" and "analyst"
        base_url: enalbes us to search in different indeed domains il.indeed vs. ca.indeed for example
    
    Returns:
        urls: a python list of job posting urls
    
    """
def grab_job_links(soup, key_words, base_url):
    urls = []
    for link in soup.find_all('h2', {'class': 'jobtitle'}):
        partial_url = link.a.get('href')
        t = link.a.get('title').lower()
        t = t.split()
        table = str.maketrans('', '', string.punctuation)
        t_stripped = [w.translate(table) for w in t]
        print(t)
        if (key_words[0] in t_stripped ) and (key_words[1] in t_stripped ):
            print("yessss")
            url = base_url + partial_url
            urls.append(url)
        else:
            continue
    return urls

# to improve the filtering works only for two words titles like data + analyst


In [5]:
 """
    Grab number of result pages, from a Indeed search result page using the given soup object
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
    
    Returns:
        num_pages: integer number of result pages
    
    """
def grab_num_pages(soup):
    page_in_search = soup.find(name='div', attrs={'id':"searchCount"}).get_text()
    p = re.compile('(\d+,*\d*)')
    num_pages = p.findall(page_in_search)
    
    # have to deal with comma delimited numbers which can't be turned into integer unless the comma is deleted
    
    num_pages = int(num_pages[1].replace(',',''))
    return num_pages


In [11]:
"""
    Get all the job posting URLs resulted from a specific search.
    
    Parameters:
        query: job title to query
        num_pages: number of pages needed
        location: city to search in
    
    Returns:
        urls: a list of job posting URL's (when num_pages valid)
        max_pages: maximum number of pages allowed ((when num_pages invalid))
    """

def get_urls(base_url1, query, location):
    # We always need the first page
    base_url = 'https://{}.com/jobs?q={}&l={}'.format(base_url1, query, location)
    soup = get_soup(base_url)
    urls = grab_job_links(soup, (query.split('+')), 'indeed')
    num_listings = grab_num_pages(soup)
    print("num_listings",num_listings)
    num_pages_calc = int(num_listings/10)
    print("num_pages_calc",num_pages_calc)
     

    # starting page 2
    for i in range(2, num_pages_calc+1):
            num = (i-1) * 10
            base_url = 'https://{}.com/jobs?q={}&l={}&start={}'.format(base_url1, query, location, num)
            try:
                soup = get_soup(base_url)
                # We always combine the results back to the list
                urls += grab_job_links(soup, (query.split('+')),'indeed')
            except:
                continue
    return urls
                

    
# Stage 1 finished: get_urls returns a list of job posting links.

In [None]:
query = ["data+scientist", 'data+analyst', 'data+engineer']
location = ["New+York"]
list_urls =[]

for i in range(len(query)):
    list_pos =[]
    urls1 = get_urls('indeed',query[i], location)

    list_urls.append(urls1)
    print('length list urls', len(list_urls))
  

['junior', 'data', 'scientist']
yessss
['statistical', 'research', 'and', 'data', 'science', 'intern']
['data', 'scientist,', 'sales', 'analytics']
yessss
['research', 'analyst,', 'people', 'analytics']
['data', 'scientist']
yessss
['data', 'scientist', '-', 'trello', '(remote)']
yessss
['junior', 'data', 'scientist']
yessss
['data', 'scientist', '–', 'personalization']
yessss
['data', 'scientist', 'i']
yessss
['data', 'scientist']
yessss
num_listings 2980
num_pages_calc 298
['data', 'scientist,', 'analytics,', 'university', 'grad']
yessss
['data', 'scientist']
yessss
['data', 'scientist', '–', 'content', 'marketing', 'acquisition']
yessss
['data', 'analyst', '-', 'user', 'research']
['statistical', 'research', 'and', 'data', 'science', 'intern']
['senior', 'data', 'scientist']
yessss
['data', 'scientist', 'summer', 'intern']
yessss
['data', 'scientist', '-', 'machine', 'learning']
yessss
['associate', 'data', 'scientist']
yessss
['data', 'scientist']
yessss
['data', 'scientist', 'inte

['machine', 'learning', 'engineer']
['senior', 'data', 'scientist', '-', 'chief', 'analytics', 'office']
yessss
['data', 'scientist', '-', 'investment', 'research']
yessss
['machine', 'learning', 'engineer']
['data', 'scientist,', 'finance']
yessss
['data', 'scientist']
yessss
['data', 'scientist']
yessss
['museum', 'scientist', '2']
['data', 'scientist', 'technical', 'lead,', 'google', 'maps']
yessss
['data', 'scientist,', 'engagement']
yessss
['custom', 'insights,', 'research', 'analyst']
['sr.', 'data', 'engineer']
['data', 'scientist', '–', 'content', 'marketing', 'engagement']
yessss
['data', 'strategy', 'analyst']
['lead', 'data', 'scientist']
yessss
['applied', 'scientist', '-', 'question', 'answering']
['sr.', 'data', 'scientist', '-', 'worldwide', 'public', 'sector', 'team']
yessss
['svp', '–', 'data', 'scientist', 'lead']
yessss
['quantitative', 'analyst']
['analytics', 'consultant']
['data', 'scientist', 'summer', 'intern']
yessss
['statistical', 'research', 'and', 'data', '

['fusion', 'center', '–', 'data', 'derived', 'threat', 'intelligence', 'manager', '(svp)']
['museum', 'scientist', '2,', 'box', 'oce-1115/28103']
['data', 'engineer']
['fgp', 'medical', 'secretary', '(35)']
['scientist,', 'thermosets', 'and', 'additives']
['research', 'scientist,', 'google', 'brain', '(united', 'states)']
['assistant', 'scientist', '-', 'night', 'shift']
['machine', 'learning', 'research', 'scientist']
['fgp', 'sec', 'i-intake/sched', '(35)', '*astoria*', '(shift', 'btwn', '9:30-6pm', 'alt', 'sats)']
['senior', 'statistical', 'clerk']
['investment', 'research', 'analyst']
['data', 'engineer']
['senior', 'business', 'intelligence', 'engineer', '/', 'data', 'analyst']
['data', 'engineer', '-', 'analytics']
['biotel', 'research', '-', 'data', 'analyst']
['marketing', '&', 'service', 'data', 'scientist', '-', 'vice', 'president']
yessss
['quantitative', 'researcher', '-', 'internship', "(bachelor's/master's)"]
['database', 'analyst']
['senior', 'analyst', '-', 'quantitativ

['ml', 'software', 'engineer']
['data', 'scientist', 'manager', '-', 'new', 'york!']
yessss
['machine', 'learning', 'engineer']
['senior', 'analytics', 'consultant,', 'decision', 'analytics']
['senior', 'data', 'scientist', '-', 'machine', 'learning', '/', 'nlp']
yessss
['machine', 'learning', 'engineer']
['machine', 'learning', 'engineer']
['offering', 'manager']
['data', 'science', 'manager']
['ai', 'research', 'scientist', '-', 'nlp']
['equity', 'research', '-', 'medical', 'devices', 'analyst', '/', 'associate']
['ecommerce', 'full', 'stack', 'engineer']
['senior', 'data', 'scientist']
yessss
['data', 'engineer']
['temp', '-', 'biostatistician']
['principal', 'decision', 'scientist', '(decision', 'engineering)']
['data', 'strategy/machine', 'learning', 'architect,', 'vp', '(']
['product', 'manager,', 'instagram', 'home', 'ranking']
['senior', 'data', 'scientist']
yessss
['algo', 'trading', 'model', 'validation', 'quantitative', 'analyst']
['research', 'analyst']
['programmer']
['res

['senior', 'laboratory', 'scientist']
['quantitative', 'analyst', '-', 'financial', 'resource', 'management']
['analyst,', 'sales', 'research']
['vacuum/mechanical', 'technician']
['college', 'research', 'intern']
['statistical', 'programmer']
['undergraduate/graduate', 'data', 'analysis', 'and', 'research', 'internship', '(summer', '2019)']
['part-time', 'assistant', 'research', 'scientist,', 'non-exempt', '(ercan', 'lab)']
['data', 'vizualization', '&', 'user', 'exprience/ux', 'associate', '-', 'data', 'capabilities', 'office', '-', 'research', '&', 'statistics', 'group']
['environmental', '-', 'geologist,', 'scientist', 'or', 'engineer', '(entry-level)']
['junior', 'technical', 'business', 'analyst']
['research', 'scientist']
['senior', 'data', 'engineer']
['machine', 'learning', 'modeling', 'engineer']
['program', 'manager']
['scientific', 'researcher', 'or', 'engineer', '-', 'data', 'acquisition', 'systems', '(experienced)']
['quantitative', 'analyst,', 'associate/avp']
['systems'

not in data scientist
['real', 'world', 'data', 'scientist/epidemiologist']
['decision', 'scientist']
['applied', 'scientist', 'intern', '-', 'alexa', 'shopping']
['fraud', 'scientist', 'engineer']
['applied', 'scientist']
['bi', 'team', 'leader']
['senior', 'r&d/data', 'scientist']
senior', 'r&d/data', 'scientist']
['excellent', 'excel', 'command.']
['data', 'science', '-', 'team', 'lead']
['excellent', 'inter-personal,', 'communication', '&', 'teamwork', 'skills']
how come this gets to be the title
['head', 'of', 'data', 'science']
['data', 'product', 'manager']
['senior', 'applied', 'scientist', '-', 'alexa', 'shopping']
['senior', 'security', 'data-scientist']
same problem
['details', 'oriented,', 'efficient', 'and', 'organized,', 'able', 'to', 'meet', 'deadlines.']
['strong', 'analytical', '&', 'technical', 'skills', '&', 'data', 'orientation.']
['business', 'analyst']
['director', 'of', 'data', 'science']
['איש', 'ביג', 'דאטה', 'big', 'data', '|', 'מנהל', 'מוצר', '-', 'תוכנה']
BI JOBS
['taboola', 'protect', 'business', 'analyst']
['business/data', 'analyst']
['business', 'analyst']
['business', 'operations', 'analyst-intern']
['bi', 'analyst']

analyst jobs
['product', 'analyst']

['junior', 'shipping', '&', 'payment', 'analyst']
['customer', 'support', 'data', '&', 'operations', 'analyst']
['operations', 'analyst']
['fraud', 'analyst']
['business', 'enablement', 'analyst']
['junior', 'online', 'marketing', 'analyst']
['strategy', 'analyst']
['client', 'services', 'level', '2', 'analyst']
['junior', 'game', 'analyst']
['product', 'analyst']
['product', 'data', 'scientist']
['commission', 'analyst']
['malware', 'analyst']
['financial', 'analyst', '–', 'student', 'position']
['product', 'analyst']

In [None]:
# Persist the collected links: every inner list of list_urls becomes one CSV row.
with open('urls.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(list_urls)

In [10]:
# Number of links collected for each of the first three queries, one per line.
print(len(list_urls[0]), len(list_urls[1]), len(list_urls[2]), sep='\n')

29
25
21


### 2. Click each link and parse text from the job posting page

In [None]:
"""
    Get the text portion including both title and job description of the job posting from a given url
    
    Parameters:
        url: The job posting link
        
    Returns:
        title: the job title (if "data scientist" is in the title)
        posting: the job posting content    
    """

def get_posting1(url):
    soup = get_soup(url)
    title = soup.find(name='h3').getText().lower()
    posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"}).get_text()
    return title, posting.lower()
    

In [None]:
# Get title + description for every job posting url.
#
# input:  the links gathered for the third query (list_urls[2])
# output: posting_list, a list with one (job title, description) tuple per link
posting_list = []
for posting_url in list_urls[2]:
    posting_list.append(get_posting1(posting_url))
  

In [None]:
# Keep a copy of the scraped postings under data_eng (posting_list was built
# from list_urls[2] above) and check how many postings were collected.
data_eng = posting_list.copy()
print(len(posting_list))
print('posting_list')


In [None]:
# Size of each per-role posting list, one per line.
# NOTE(review): data_sc and data_ana are not assigned in any visible cell;
# presumably they come from earlier runs of the scraping cell -- confirm.
print(len(data_sc), len(data_ana), len(data_eng), sep='\n')

In [None]:
# Save the posting list to csv,
# so the whole scraping process does not have to be run again.


# Each (title, description) tuple of posting_list becomes one DataFrame row.
df = pd.DataFrame(posting_list)
df.head()
df.to_csv('posting_list.csv')


### 3. Get list of unique postings by keyword

In [None]:
def pick_company(position):
    """
    Gets the company name from a posting's description.

    Takes the description text up to (not including) its first "-" and drops
    as many leading characters as the title is long. This assumes the
    description starts with the title, immediately followed by the company.

    Parameters:
        position: a posting; position[0] is the title and position[1] the
                  description

    Returns:
        company name as found in the text (surrounding whitespace is kept),
        or '' when the description is empty or starts with "-"
    """
    title, description = position[0], position[1]
    leading = re.match(r'[^-]+', description)
    if leading is None:
        # nothing in front of the first dash: no company to extract
        return ''
    company = leading.group(0)[len(title):]
    return company
# Quick check of pick_company on the first scraped posting.
pick_company(posting_list[0])

In [None]:
def filter_listings(key_word, posting_list):
    """
    Filters the listing list for listings with specific keywords.

    A listing is kept when every keyword occurs (as a substring) in its
    title, k[0]. Listings equal to one already kept are skipped, so the
    result holds no exact duplicates and keeps first-seen order.

    Parameters:
        key_word: sequence of keywords, e.g. ('data', 'scientist'); any
                  number of keywords is accepted
        posting_list: list of listings; the title is element 0 of each

    Returns:
        sublist of listings which contain the keywords
    """
    listx = []
    for k in posting_list:
        title = k[0]
        if all(word in title for word in key_word) and k not in listx:
            listx.append(k)
    return listx

# Per role, keep the unique postings whose title contains both keywords,
# then report how many are left for each role (one count per line).
data_sci_des = filter_listings(('data', 'scientist'), data_sc)
data_ana_des = filter_listings(('data', 'analyst'), data_ana)
data_eng_des = filter_listings(('data', 'engineer'), data_eng)
print(len(data_sci_des), len(data_ana_des), len(data_eng_des), sep='\n')

In [None]:
def list_duplicates(data_sci_des):
    """
    Generate the list of postings that duplicate a later posting.

    Each posting is compared with the postings after it. Scanning stops at
    the first later posting whose fuzzy token-sort ratio is at least 70, and
    that pair alone decides the outcome: the posting is reported when the
    ratio is exactly 100, or when both titles are equal and the company
    names (see pick_company) have a partial ratio above 80.

    Parameters:
        data_sci_des: list of postings

    Returns:
        list with the duplicates only
    """
    print(len(data_sci_des)," lengthlist")
    duplicates = []
    for idx, current in enumerate(data_sci_des):
        for other in data_sci_des[idx + 1:]:
            similarity = fuzz.token_sort_ratio(current, other)
            if similarity < 70:
                continue

            # First sufficiently similar posting found: judge this pair, then stop.
            if similarity == 100:
                duplicates.append(current)
            elif current[0] == other[0]:
                company_a = pick_company(current)
                company_b = pick_company(other)
                company_score = fuzz.partial_ratio(company_a, company_b)
                if company_score > 80:
                    print(company_a, company_b)
                    print(company_score)
                    print("yeees")
                    duplicates.append(current)
                else:
                    print(company_score)
                    print("not ehough")
                    print(0)
            break

    return duplicates


data_sci_des1 = list_duplicates(data_sci_des)

          

In [None]:
def delete_duplicates(listing_by_keyword, duplicates=None):
    """
    Generate the list of postings without non-identical duplicates.

    The duplicates are removed from listing_by_keyword in place, and the same
    list object is returned.

    Parameters:
        listing_by_keyword: list of postings
        duplicates: postings to remove; when omitted they are computed with
                    list_duplicates(listing_by_keyword)

    Returns:
        list of postings without non identical duplicates.
    """
    if duplicates is None:
        # Derive the duplicates from the list being cleaned rather than from
        # a notebook global that may belong to a different role.
        duplicates = list_duplicates(listing_by_keyword)
    for dup in duplicates:
        # remove() raises ValueError for an absent item, so check first.
        if dup in listing_by_keyword:
            listing_by_keyword.remove(dup)
    return listing_by_keyword


print(len(data_sci_des1))
data_sc_nodups = delete_duplicates(data_sci_des, data_sci_des1)
print(len(data_sc_nodups))

### 4a - playing with nltk

In [None]:
import nltk


def ie_preprocess(document):
    """
    Split a document into sentences, tokenize each sentence into words and
    part-of-speech tag the words.

    Parameters:
        document: the text to process

    Returns:
        a list with one entry per sentence, each being the output of
        nltk.pos_tag for that sentence's tokens
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sent))
        for sent in nltk.sent_tokenize(document)
    ]


# NOTE(review): zzz is not defined in any visible cell; presumably a list of
# (title, description) postings -- confirm.
sentence = ie_preprocess(zzz[1][1])
print(sentence)

In [None]:
from collections import Counter

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Concatenate all descriptions. A space is put between postings so the last
# word of one posting does not fuse with the first word of the next.
# NOTE(review): zzz is not defined in any visible cell; presumably a list of
# (title, description) postings -- confirm.
my_text = ' '.join(zz[1] for zz in zzz)

tokens = word_tokenize(my_text)
print(len(tokens))
print('number of unique words')
print(len(set(tokens)))

# Load the stop words once (as a set) instead of once per token, and compare
# the lower-cased token so capitalised stop words such as "The" are dropped too.
english_stopwords = set(stopwords.words('english'))
filtered_words = [word.lower() for word in tokens if word.lower() not in english_stopwords]
print('no stopwords')
print(len(filtered_words))

# word -> number of occurrences, in first-seen order
words_freq = dict(Counter(filtered_words))
print(words_freq)


In [None]:
# extract minimum qualification level

def m_toar(posting):
    """
    Return the lowest academic degree level mentioned in a posting.

    The levels are tested from lowest to highest, so a posting that mentions
    both a bachelor's and a master's degree yields the bachelor's level.
    The patterns are lower-case, so the posting text is expected to be
    lower-cased already (get_posting1 returns it that way).

    Parameters:
        posting: the posting text

    Returns:
        1 for a bachelor's degree, 2 for a master's / advanced degree,
        3 for a PhD, 0 when no degree is recognised (the posting is then
        printed for inspection)
    """
    # Dots are escaped so they match a literal "." only; unescaped, "b.sc."
    # also matched text such as "web scraping". Both the straight and the
    # typographic apostrophe are accepted.
    ma = re.compile(r"(?:msc|m\.sc\.?|master[’']?s|תואר שני|advanced degree|תואר מתקדם)")
    phd = re.compile(r"(?:phd|ph\.d\.?)")
    ba = re.compile(r"(?:ba/|bsc|b\.sc\.?|bachelor[’']?s|first degree|תואר אקדמי)")
    if ba.search(posting):
        print('ba')
        min_toar = 1
    elif ma.search(posting):
        print('ma')
        min_toar = 2
    elif phd.search(posting):
        print('phd')
        min_toar = 3
    else:
        min_toar = 0
        print(posting)
    return min_toar


# For every posting: split the description (zz[1]) into sentences, pick the
# sentences that mention an education keyword, and print the degree level
# that m_toar finds in the full description.
# NOTE(review): zzz is not defined in any visible cell, and break_sentences,
# thematic_ana and education are defined in cells further down -- confirm
# the intended run order.
index = 0
for zz in zzz:
    print(index)
    sent_bag = break_sentences(zz[1])
    academic = thematic_ana(education,sent_bag)
    #print(academic)
    print()
    toar = m_toar(zz[1])
    print("the toar is!!!!!!!!!!!!", toar)
    index += 1       
    

In [None]:
# Show the description (element 1) of the posting at index 12.
print(zzz[12][1])

### 4. analyze listing content

In [None]:
# Keyword lists, one per theme. prog_in, education and experience are passed
# as the `theme` argument of thematic_ana in the cells below; the other lists
# are not used by any visible cell.
# All entries are lower-case.
job_scope =['in this job you will:','requirements']
prog_in =['python','java','tensorflow', 'sql', 'spark', 'kafka', 'r', 'cassandra', 'elasticsearch', 'bigquery', 'google cloud', 'docker', 'scala', 'c++','matlab']
education = ['m.sc.', 'phd.','bsc/msc','master’s', 'computer science', 'cs', 'ee', 'mathematics', 'engineering', 'physics','related','degree', 'bachelor’s','statistics']
experience = ['years', 'experience']
optional = ['nice to have:', '– an advantage']
skills =['communication', 'accuracy','visualization', 'machine learning', 'deep learning']
junior = ['student']

In [None]:
# analyse sentences in listing

def break_sentences(listing):
    """
    Generate the list of sentences a posting is composed of.

    The posting is first split on line breaks. If the pieces found that way
    cover less than 90% of the text, the posting is split with NLTK's
    sent_tokenize instead.

    Parameters:
        listing: one posting description string

    Returns:
        list of sentences in the posting ([] for an empty posting)
    """
    length_listing = len(listing)
    if length_listing == 0:
        # nothing to split, and the coverage ratio below would divide by zero
        return []

    sentence = re.compile(r'[^\n]+\n')
    # the last part of the listing does not have a line break at the end,
    # therefore it needs a special regex
    end_sentence = re.compile(r'[\n](.+)\-.+ago')

    sentence_l = sentence.findall(listing)
    end_sentence_l = end_sentence.findall(listing)
    # The closing line is optional: only add it when the pattern matched.
    if sentence_l and end_sentence_l:
        sentence_l.append(end_sentence_l[0])

    len_sentences = sum(len(piece) for piece in sentence_l)
    print("listing", length_listing, "sentences", len_sentences)
    if len_sentences / length_listing < 0.9:
        # Imported here because only this fallback needs it.
        from nltk import sent_tokenize

        sentence_l = sent_tokenize(listing)
        print("tokenize")
        # length covered by the tokenizer's sentences only
        print(sum(len(piece) for piece in sentence_l))
    return sentence_l




In [None]:
# problem: this kind of cleaning breaks relevant multi-word expressions such
# as "computer science", because sentences are compared word by word

def thematic_ana(theme, listing):
    """
    Filters the relevant sentences by theme.

    Every sentence has "/" replaced by a space, is split on whitespace, and
    each word is stripped of punctuation ("+" is kept so that "c++"
    survives). A sentence is selected when at least one of its cleaned words
    is in the theme. Matching is case-sensitive.

    Parameters:
        theme: collection of keywords (e.g. education qualification,
               experience, etc.)
        listing: list of sentences of one posting

    Returns:
        list of relevant sentences, each as its list of cleaned words
    """
    # Built once for all sentences: removes all punctuation except "+".
    table = str.maketrans('', '', string.punctuation.replace('+', ''))
    selected = []
    for sentence in listing:
        words = sentence.replace('/', ' ').split()
        stripped = [w.translate(table) for w in words]
        if any(word in theme for word in stripped):
            selected.append(stripped)
    return selected


In [None]:
# For every posting, print the posting itself followed by the sentences of its
# description (zz[1]) that match each theme: programming languages/tools,
# education and experience.
# NOTE(review): zzz is not defined in any visible cell; presumably a list of
# (title, description) postings -- confirm.
index = 0
for zz in zzz:
    print(index)
    print(zz)
    print('')
    sent_bag = break_sentences(zz[1])
    prog_lang = thematic_ana(prog_in,sent_bag)
    print(prog_lang)
    print()
    academic = thematic_ana(education,sent_bag)
    print(academic)
    print()
    exp = thematic_ana(experience,sent_bag)
    print(exp)
    print()
    index += 1

In [None]:
# Debugging aid: walk through the sentences of the posting at index 3 word by
# word and show which words are recognised as programming languages/tools.
print(zzz[3])
print()
bagg = break_sentences(zzz[3][1])
temp_words = []
selected = []
for i in bagg:
    print(i)
    temp_words = i.split()
    for word in temp_words:
        # "/" is replaced inside the word, so "python/sql" becomes the single
        # string "python sql" and is compared against prog_in as a whole.
        word2 = word.replace('/',' ')
        if word2.strip(',') in prog_in:
            print(word2)
            # keep the whole sentence and move on to the next one
            selected.append(i)
            break
        else:
            print("+++++++++++++++++", word2)
            continue

In [None]:
# Take the posting list and sort every posting into one bucket, based on the
# keywords in its title (k[0]) or, failing that, in its description (k[1]).
# data_scientist_jobs counts how often each title of the three "data ..."
# roles occurs.

data_scientist_jobs = defaultdict(list)
data_sci_des = []
data_eng_des = []
data_ana_des = []
datasci_related = []
non_data_sc = []


for k in posting_list:
    title = k[0]
    has_data = 'data' in title
    if has_data and ('scientist' in title or 'science' in title):
        bucket = data_sci_des
    elif has_data and 'engineer' in title:
        bucket = data_eng_des
    elif has_data and 'analyst' in title:
        bucket = data_ana_des
    else:
        bucket = None

    if bucket is not None:
        bucket.append(k)
        # .get() does not trigger the defaultdict factory, so counts stay ints
        data_scientist_jobs[title] = data_scientist_jobs.get(title, 0) + 1
    elif 'data scientist' in k[1]:
        datasci_related.append(k)
    elif k not in non_data_sc:
        # unrelated posting seen for the first time: remember and show it
        non_data_sc.append(k)
        print(k)
        


In [None]:
# Report the size of every bucket filled by the classification cell.
for label, bucket in (
    ('data science posting', data_sci_des),
    ('data engineer posting', data_eng_des),
    ('data analyst posting', data_ana_des),
    ('datasci_related', datasci_related),
    ('non data science', non_data_sc),
):
    print(label, len(bucket))


In [None]:
# List the unrelated postings, each preceded by its index and followed by a
# blank line.
for h, unrelated in enumerate(non_data_sc):
    print(h)
    print(unrelated)
    print()

In [None]:
# NOTE(review): in the visible cells sentence_l is only a local variable
# inside functions; confirm a global of that name exists before running this.
print(sentence_l)

In [None]:
# Show the characters that string.punctuation contains (the set the
# title/sentence cleaning above strips).
print(string.punctuation)

In [None]:
#scraping code:
# NOTE(review): requests, time, city_set, max_results_per_city and sample_df
# are not defined or imported in any visible cell -- confirm they are set up
# before this cell runs.
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        # Search the city being processed; the location used to be hard-coded
        # to tel+aviv, so every city in city_set fetched the same results.
        page = requests.get("https://il.indeed.com/jobs?q=data+scientist&l=" + str(city) + "&start=" + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        # page.text is already decoded text; from_encoding only applies to
        # byte input, so it is not passed.
        soup = BeautifulSoup(page.text, "lxml")
        for div in soup.find_all(name="div", attrs={"class":"row"}):
            #specifying row num for index of job posting in dataframe
            num = len(sample_df)
            #one list per posting, starting with the city name
            job_post = [city]
            #grabbing job title
            for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
                job_post.append(a["title"])
            #grabbing company name
            company = div.find_all(name="span", attrs={"class":"company"})
            if company:
                for b in company:
                    job_post.append(b.text.strip())
            else:
                for span in div.find_all(name="span", attrs={"class":"result-link-source"}):
                    job_post.append(span.text)
            print("this is row 1!!!!")
            print(job_post)

            #grabbing location name
            # NOTE(review): this searches the whole page (soup), not the
            # current row (div), so every row receives the page's first
            # non-empty location. The same holds for the summary below.
            # Confirm whether div was intended.
            for span in soup.find_all("span", attrs={'class': "location"}):
                job_post.append(span.text)
                print()
                if len(span.text) > 0:
                    break

            #grabbing summary text
            for span in soup.find_all("span", attrs={"class": "summary"}):
                job_post.append(span.text.strip())
                if len(span.text) > 0:
                    break

            print("this is row 2!!!!")
            print(job_post)
            #appending list of job post info to dataframe at index num
            # .loc can create the new row label; .iloc cannot enlarge the
            # frame and raises IndexError for a position past the last row.
            # Assumes sample_df has a default integer index and one column
            # per collected field -- TODO confirm.
            sample_df.loc[num] = job_post
#saving sample_df as a local csv file — define your own local path to save contents
#sample_df.to_csv(r"C:\Users\lili\Documents\icode\scraping\ver1.csv", encoding="utf-8")
print(sample_df.head())

# source
* [Web Scraping Job Postings from Indeed](https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b)
* [Scraping Job Posting Data from Indeed using Selenium and BeautifulSoup](https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac)
