In [1]:
########################################################
#################### IMPORT LIBRARY ####################
########################################################
import bs4
import numpy
import pandas
import re
import requests
import datetime
import stop_words

In [47]:
#Arguments 
input_job = "Data Scientist"
input_quote = False # add quotation marks("") to your input_job
input_city = "Toronto" # leave empty if input_city is not specified
input_state = "ON"
sign = "+"
BASE_URL_indeed =  'https://ca.indeed.com/' #'http://www.indeed.com'

In [48]:
#Transform searching keywords
# The default "quote = False"
def transform(input,sign, quote = False):
    syntax = input.replace(" ", sign)
    if quote == True:
        syntax = ''.join(['"', syntax, '"'])
    return(syntax)

In [49]:
#Generate the URL 
if not input_city: # if (input_city is "")
    url_indeed_list = [ BASE_URL_indeed, '/jobs?q=', transform(input_job, sign, input_quote),
                    '&l=', input_state]
    url_indeed = ''.join(url_indeed_list)
else: # input_city is not ""
    url_indeed_list = [ BASE_URL_indeed, '/jobs?q=', transform(input_job, sign, input_quote),
                    '&l=', transform(input_city, sign), '%2C+', input_state]
    url_indeed = ''.join(url_indeed_list)
print(url_indeed)

# get the HTML code from the URL
rawcode_indeed = requests.get(url_indeed)
# Choose "lxml" as parser
soup_indeed = bs4.BeautifulSoup(rawcode_indeed.text, "lxml")

# total number of results
num_total_indeed = soup_indeed.find(
                        id = 'searchCount').contents[0].split()[-2]
print("total results:")
print(soup_indeed.find(id = 'searchCount').contents[0])
print(soup_indeed.find(id = 'searchCount').contents[0].split()[-2])
num_total_indeed = re.sub("[^0-9]","", num_total_indeed) # remove non-numeric characters in the string
num_total_indeed = int(num_total_indeed)
print(num_total_indeed)

# total number of pages
num_pages_indeed = int(numpy.ceil(num_total_indeed/10.0))
print("Number of pages to search: " + str(num_pages_indeed))

# create an empty dataframe
job_df_indeed = pandas.DataFrame()
# the date for today
now = datetime.datetime.now()
now_str = now.strftime("%m/%d/%Y")
now_str_name=now.strftime('%m%d%Y') #This will be used when creating the csv file

https://ca.indeed.com//jobs?q=Data+Scientist&l=Toronto%2C+ON
total results:

        Page 1 of 381 jobs
381
381
Number of pages to search: 39


In [50]:
# Loop for all the total pages
for i in range(1, num_pages_indeed+1):
    # generate the URL
    url = ''.join([url_indeed, '&start=', str(i*10)])
    print(url)

    # get the HTML code from the URL
    rawcode = requests.get(url)
    soup = bs4.BeautifulSoup(rawcode.text, "lxml")

    # pick out all the "div" with "class="job-row"
    divs = soup.findAll("div")
    job_divs = [jp for jp in divs if not jp.get('class') is None
                    and 'row' in jp.get('class')]
    #print("job_divs:")
    #print(job_divs)

    # loop for each div chunk - Get frist some info for each job from the indeed Summary page
    for job in job_divs:
        try:
            # job id
            id = job.get('data-jk', None)
            # job link related to job id
            link = BASE_URL_indeed + '/viewjob'+ '?jk=' + id
            print("link:")
            print(link)
            # job title
            title = job.find('a', attrs={'data-tn-element': 'jobTitle'}).attrs['title']
            # job company
            company = job.find('span', {'class': 'company'}).text.strip()
            # job location
            location = job.find('span', {'class': 'location'}).text.strip()
        except:
            continue

        job_df_indeed = job_df_indeed.append({'job_title': title,
                                'job_id': id,
                                'job_company': company,
                                'date': now_str,
                                'from':'Indeed',
                                'job_location':location,
                                'job_link':link},ignore_index=True)
cols=['from','date','job_id','job_title','job_company','job_location','job_link']
job_df_indeed = job_df_indeed[cols] #reorganize columns in the dataframe
print(job_df_indeed.shape)

# delete the duplicated jobs using job link
job_df_indeed = job_df_indeed.drop_duplicates(['job_link'], keep='first')

# print the dimenstion of the dataframe
print(job_df_indeed.shape)

# save the dataframe as a csv file
#path = '/Users/chou/Google Drive/websites/github/webscraping_example/output/'+'job_indeed_' + now_str_name + '.csv'
#path = 'C:/Users\Larissa\Documents\UofT\Data_Science\webscraping_example-master\webscraping_example-master\'
path = 'job_indeed_' + now_str_name + '.csv'
job_df_indeed.to_csv(path)
display(job_df_indeed)

https://ca.indeed.com//jobs?q=Data+Scientist&l=Toronto%2C+ON&start=10
link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
link:
https://ca.indeed.com//viewjob?jk=b9b2233c7a074542
link:
https://ca.indeed.com//viewjob?jk=c158466eba72ee05
link:
https://ca.indeed.com//viewjob?jk=86e0e9a1dab7acf3
link:
https://ca.indeed.com//viewjob?jk=5febe1f5e2c2fda4
link:
https://ca.indeed.com//viewjob?jk=13d57d7182af1b88
link:
https://ca.indeed.com//viewjob?jk=8cda68574c7a9e6e
link:
https://ca.indeed.com//viewjob?jk=b78085b7525b9c99
link:
https://ca.indeed.com//viewjob?jk=1500241269ef2103
link:
https://ca.indeed.com//viewjob?jk=fd529787329d9b52
link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
link:
https://ca.indeed.com//viewjob?jk=3625131c23ab13f2
link:
https://ca.indeed.com//viewjob?jk=5547d39589e97752
link:
https://ca.indeed.com//viewjob?jk=9702772463c5d349
link:
https://ca.indeed.com//viewjob?jk=631aecbd912f8389
link:
https://ca.i

link:
https://ca.indeed.com//viewjob?jk=9702772463c5d349
link:
https://ca.indeed.com//viewjob?jk=2172d8fe11ab66ae
link:
https://ca.indeed.com//viewjob?jk=c158466eba72ee05
link:
https://ca.indeed.com//viewjob?jk=0bc5c4d7cc78fc39
link:
https://ca.indeed.com//viewjob?jk=e79d0c1b3b6484c4
link:
https://ca.indeed.com//viewjob?jk=4b6c8a545f5aae9a
link:
https://ca.indeed.com//viewjob?jk=29168539b9786a5d
link:
https://ca.indeed.com//viewjob?jk=1f4505c89ed4023a
link:
https://ca.indeed.com//viewjob?jk=31bc2fc37d08a17f
link:
https://ca.indeed.com//viewjob?jk=1c2761ce4252fd4e
link:
https://ca.indeed.com//viewjob?jk=d481bc68d0d5cebc
link:
https://ca.indeed.com//viewjob?jk=d984ab129330b987
link:
https://ca.indeed.com//viewjob?jk=f145afef83cb1fe3
link:
https://ca.indeed.com//viewjob?jk=faa17bf556a4332e
link:
https://ca.indeed.com//viewjob?jk=631aecbd912f8389
link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
https://ca.indeed.com//jobs?q=D

link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
link:
https://ca.indeed.com//viewjob?jk=0bc5c4d7cc78fc39
link:
https://ca.indeed.com//viewjob?jk=9702772463c5d349
link:
https://ca.indeed.com//viewjob?jk=b9b2233c7a074542
link:
https://ca.indeed.com//viewjob?jk=40ed11afbb45bfe8
link:
https://ca.indeed.com//viewjob?jk=01ba692053bfef60
link:
https://ca.indeed.com//viewjob?jk=4a764ac7b03e93cf
link:
https://ca.indeed.com//viewjob?jk=e41c162d370328fa
link:
https://ca.indeed.com//viewjob?jk=88a390ffe63bed8c
link:
https://ca.indeed.com//viewjob?jk=912c8682aff2a953
link:
https://ca.indeed.com//viewjob?jk=b39a22ec722487f7
link:
https://ca.indeed.com//viewjob?jk=3af12b52aef061ff
link:
https://ca.indeed.com//viewjob?jk=068e21153f034f98
link:
https://ca.indeed.com//viewjob?jk=e85e42274f1fe9cb
link:
https://ca.indeed.com//viewjob?jk=c158466eba72ee05
link:
https://ca.indeed.com//viewjob?jk=631aecbd912f8389
link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
https://ca.indeed.com//jobs?q=D

link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
link:
https://ca.indeed.com//viewjob?jk=0bc5c4d7cc78fc39
link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
link:
https://ca.indeed.com//viewjob?jk=9702772463c5d349
link:
https://ca.indeed.com//viewjob?jk=13e5b5d0fb0bf308
link:
https://ca.indeed.com//viewjob?jk=35a6b855c2f43366
link:
https://ca.indeed.com//viewjob?jk=6adc90268ab5de60
link:
https://ca.indeed.com//viewjob?jk=fec0ab33b94e5a4d
link:
https://ca.indeed.com//viewjob?jk=96eef3d9535e81b7
link:
https://ca.indeed.com//viewjob?jk=6fb6fe834721f4df
link:
https://ca.indeed.com//viewjob?jk=13a880a4551ab011
link:
https://ca.indeed.com//viewjob?jk=a7fc8296ff896890
link:
https://ca.indeed.com//viewjob?jk=0ad4c65a91d397fb
link:
https://ca.indeed.com//viewjob?jk=b3c3277e44a1f20b
link:
https://ca.indeed.com//viewjob?jk=2172d8fe11ab66ae
link:
https://ca.indeed.com//viewjob?jk=c158466eba72ee05
link:
https://ca.indeed.com//viewjob?jk=631aecbd912f8389
https://ca.indeed.com//jobs?q=D

link:
https://ca.indeed.com//viewjob?jk=643af532812b501e
link:
https://ca.indeed.com//viewjob?jk=631aecbd912f8389
link:
https://ca.indeed.com//viewjob?jk=0bc5c4d7cc78fc39
link:
https://ca.indeed.com//viewjob?jk=edf652eb2608173e
link:
https://ca.indeed.com//viewjob?jk=7a94ac1d491b7db4
link:
https://ca.indeed.com//viewjob?jk=ee3fb472e3ab7640
link:
https://ca.indeed.com//viewjob?jk=e9a425ca34499f0c
link:
https://ca.indeed.com//viewjob?jk=0a241c45732bf9cf
link:
https://ca.indeed.com//viewjob?jk=67f4ffe25243202a
link:
https://ca.indeed.com//viewjob?jk=60e6097a5cc3316e
link:
https://ca.indeed.com//viewjob?jk=61a3242263a24742
link:
https://ca.indeed.com//viewjob?jk=f7cf69293b76e27d
link:
https://ca.indeed.com//viewjob?jk=aafe7b55dc0c84a9
link:
https://ca.indeed.com//viewjob?jk=9702772463c5d349
link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
https://ca.indeed.com//jobs?q=Data+Scientist&l=Toronto%2C+ON&start=370
link:
https://ca.indeed.com//viewjob?jk=107c969e3deb11c3
link:
https://ca.

Unnamed: 0,from,date,job_id,job_title,job_company,job_location,job_link
0,Indeed,11/10/2018,86e0e9a1dab7acf3,Data Scientist,Flipp,"Toronto, ON",https://ca.indeed.com//viewjob?jk=86e0e9a1dab7...
1,Indeed,11/10/2018,5febe1f5e2c2fda4,Data Scientist / Actuary,Munich Re,"Toronto, ON",https://ca.indeed.com//viewjob?jk=5febe1f5e2c2...
2,Indeed,11/10/2018,13d57d7182af1b88,Data Scientist- Machine Learning,Amazon.com,"Toronto, ON",https://ca.indeed.com//viewjob?jk=13d57d7182af...
3,Indeed,11/10/2018,8cda68574c7a9e6e,Data Mining Scientist,Huawei Canada,"Markham, ON",https://ca.indeed.com//viewjob?jk=8cda68574c7a...
4,Indeed,11/10/2018,b78085b7525b9c99,Data Scientist- Canadian Celebration of Women ...,Capital One,"Toronto, ON",https://ca.indeed.com//viewjob?jk=b78085b7525b...
5,Indeed,11/10/2018,1500241269ef2103,"Vice President, Data Science & Artificial Inte...",Quantium,"Toronto, ON",https://ca.indeed.com//viewjob?jk=1500241269ef...
6,Indeed,11/10/2018,fd529787329d9b52,"Data Scientist, Finance",Loblaw Companies Limited,"Brampton, ON",https://ca.indeed.com//viewjob?jk=fd529787329d...
7,Indeed,11/10/2018,643af532812b501e,Data Scientist - Machine Learning,Intact,"Toronto, ON",https://ca.indeed.com//viewjob?jk=643af532812b...
8,Indeed,11/10/2018,3625131c23ab13f2,Data Scientist,Xylem,"Mississauga, ON",https://ca.indeed.com//viewjob?jk=3625131c23ab...
9,Indeed,11/10/2018,5547d39589e97752,Data Scientist - Toronto,Prodigy Game,"Toronto, ON",https://ca.indeed.com//viewjob?jk=5547d39589e9...


In [51]:
### Define the terms that we're interested in looking for
import stop_words

# define the stop_words for future use.
#Stop words is part of the NLTK. A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine 
#has been programmed to ignore, both when indexing entries for searching and when retrieving  them as the result of a 
#search query.

stop_words = stop_words.get_stop_words('english') # list out all the English stop word

# read the csv file - 
#job_df_indeed = pandas.DataFrame.from_csv(path) - Don't really need to read from csv file, since it's in memory

#This part should be modified later to get these values from our database
##### Job types #####
type = ['Full-Time', 'Full Time', 'Part-Time', 'Part Time', 'Contract', 'Contractor']
type_lower = [s.lower() for s in type] # convert all to lowercases
# map the type_lower to type
type_map = pandas.DataFrame({'raw':type, 'lower':type_lower}) # create a dataframe
type_map['raw'] = ["Full-Time", "Full-Time", 'Part-Time', 'Part-Time', "Contract", 'Contract'] # modify the mapping
type_dic = list(type_map.set_index('lower').to_dict().values()).pop() # use the dataframe to create a dictionary
#print(type_dic)

##### Skills #####
skills = ['R', 'Shiny', 'RStudio', 'Markdown', 'Latex', 'SparkR', 'D3', 'D3.js',
            'Unix', 'Linux', 'MySQL', 'Microsoft SQL server', 'SQL',
            'Python', 'SPSS', 'SAS', 'C++', 'C', 'C#','Matlab','Java',
            'JavaScript', 'HTML', 'HTML5', 'CSS', 'CSS3','PHP', 'Excel', 'Tableau',
            'AWS', 'Amazon Web Services ','Google Cloud Platform', 'GCP',
            'Microsoft Azure', 'Azure', 'Hadoop', 'Pig', 'Spark', 'ZooKeeper',
            'MapReduce', 'Map Reduce','Shark', 'Hive','Oozie', 'Flume', 'HBase', 'Cassandra',
            'NoSQL', 'MongoDB', 'GIS', 'Haskell', 'Scala', 'Ruby','Perl',
            'Mahout', 'Stata']
skills_lower = [s.lower() for s in skills]# lowercases
skills_map = pandas.DataFrame({'raw':skills, 'lower':skills_lower})# create a dataframe
skills_map['raw'] = ['R', 'Shiny', 'RStudio', 'Markdown', 'Latex', 'SparkR', 'D3', 'D3',
            'Unix', 'Linux', 'MySQL', 'Microsoft SQL server', 'SQL',
            'Python', 'SPSS', 'SAS', 'C++', 'C', 'C#','Matlab','Java',
            'JavaScript', 'HTML', 'HTML', 'CSS', 'CSS','PHP', 'Excel', 'Tableau',
            'AWS', 'AWS','GCP', 'GCP',
            'Azure', 'Azure', 'Hadoop', 'Pig', 'Spark', 'ZooKeeper',
            'MapReduce', 'MapReduce','Shark', 'Hive','Oozie', 'Flume', 'HBase', 'Cassandra',
            'NoSQL', 'MongoDB', 'GIS', 'Haskell', 'Scala', 'Ruby','Perl',
            'Mahout', 'Stata']
skills_dic = list(skills_map.set_index('lower').to_dict().values()).pop()# use the dataframe to create a dictionary
# print(skills_dic)

##### Education #####
edu = ['Bachelor', "Bachelor's", 'BS', 'B.S', 'B.S.', 'Master', "Master's", 'Masters', 'M.S.', 'M.S', 'MS',
        'PhD', 'Ph.D.', "PhD's", 'MBA']
edu_lower = [s.lower() for s in edu]# lowercases
edu_map = pandas.DataFrame({'raw':edu, 'lower':edu_lower})# create a dataframe
edu_map['raw'] = ['BS', "BS", 'BS', "BS", 'BS', 'MS', "MS", 'MS', 'MS', 'MS', 'MS',
        'PhD', 'PhD', "PhD", 'MBA'] # modify the mapping
edu_dic = list(edu_map.set_index('lower').to_dict().values()).pop()# use the dataframe to create a dictionary
# print(edu_dic)

##### Major #####
major = ['Computer Science', 'Statistics', 'Mathematics', 'Math','Physics',
            'Machine Learning','Economics','Software Engineering', 'Engineering',
            'Information System', 'Quantitative Finance', 'Artificial Intelligence',
            'Biostatistics', 'Bioinformatics', 'Quantitative']
major_lower = [s.lower() for s in major]# lowercases
major_map = pandas.DataFrame({'raw':major, 'lower':major_lower})# create a dataframe
major_map['raw'] = ['Computer Science', 'Statistics', 'Math', 'Math','Physics',
            'Machine Learning','Economics','Software Engineering', 'Engineering',
            'Information System', 'Quantitative Finance', 'Artificial Intelligence',
            'Biostatistics', 'Bioinformatics', 'Quantitative']# modify the mapping
major_dic = list(major_map.set_index('lower').to_dict().values()).pop()# use the dataframe to create a dictionary

##### Key Words ######
keywords = ['Web Analytics', 'Regression', 'Classification', 'User Experience', 'Big Data',
            'Streaming Data', 'Real-Time', 'Real Time', 'Time Series']
keywords_lower = [s.lower() for s in keywords]# lowercases
keywords_map = pandas.DataFrame({'raw':keywords, 'lower':keywords_lower})# create a dataframe
keywords_map['raw'] = ['Web Analytics', 'Regression', 'Classification', 'User Experience', 'Big Data',
            'Streaming Data', 'Real Time', 'Real Time', 'Time Series']# modify the mapping
keywords_dic = list(keywords_map.set_index('lower').to_dict().values()).pop()# use the dataframe to create a dictionary
# print(keywords_dic)

In [52]:
def Salary(soup):
    try:
        Salary = soup.find( attrs = {'class' : 'jobsearch-JobMetadataHeader-item'})
        return  Salary.contents
    except:
        return 'Not Informed'

In [62]:
##### For Loop for scraping each job URL #####
#create empty lists to store details for all the jobs
list_type = []
list_salary = []
list_skill = []
list_text = []
list_edu = []
list_major = []
list_keywords = []


#Web scrap each link in the DataFrame 

for i in range(len(job_df_indeed)): #range(10):
    # empty list to store details for each job
    required_type= []
    required_skills = []
    required_edu = []
    required_major = []
    required_keywords = []
    salary = []

    try:
        # get the HTML code from the URL
        job_page = requests.get(job_df_indeed.iloc[i, 6])
        #print(job_df_indeed.iloc[i, 6])
        # Choose "lxml" as parser
        soup = bs4.BeautifulSoup(job_page.text, "lxml")
        
        #saly = Salary(soup)
        try:
            Salary = soup.find( attrs = {'class' : 'jobsearch-JobMetadataHeader-item'})
            salary = Salary.contents;
            list_salary.append(salary)
            #print(Salary.contents)
        except:
            list_salary.append("null")
            #print('Not Informed')
  
        # drop the chunks of 'script','style','head','title','[document]'
        for elem in soup.findAll(['script','style','head','title','[document]']):
            elem.extract()
        # get the lowercases of the texts
        texts = soup.getText(separator=' ').lower()

        # cleaning the text data
        string = re.sub(r'[\n\r\t]', ' ', texts) # remove "\n", "\r", "\t"
        string = re.sub(r'\,', ' ', string) # remove ","
        string = re.sub('/', ' ', string) # remove "/"
        string = re.sub(r'\(', ' ', string) # remove "("
        string = re.sub(r'\)', ' ', string) # remove ")"
        string = re.sub(' +',' ',string) # remove more than one space
        string = re.sub(r'r\s&\sd', ' ', string) # avoid picking 'r & d'
        string = re.sub(r'r&d', ' ', string) # avoid picking 'r&d'
        string = re.sub('\.\s+', ' ', string) # remove "." at the end of sentences
        #print(string)
        # Job types
        for typ in type_lower :
            if any(x in typ for x in ['+', '#', '.']):
                typp = re.escape(typ) # make it possible to find out 'c++', 'c#', 'd3.js' without errors
                print(typp)
            else:
                typp = typ
            result = re.search(r'(?:^|(?<=\s))' + typp + r'(?=\s|$)', string) # search the string in a string
            if result:
                required_type.append(type_dic[typ])
        list_type.append(required_type)

        # Skills
        for sk in skills_lower :
            if any(x in sk for x in ['+', '#', '.']):
                skk = re.escape(sk)
            else:
                skk = sk
            result = re.search(r'(?:^|(?<=\s))' + skk + r'(?=\s|$)',string)
            if result:
                required_skills.append(skills_dic[sk])
        list_skill.append(required_skills)

        # Education
        for ed in edu_lower :
            if any(x in ed for x in ['+', '#', '.']):
                edd = re.escape(ed)
            else:
                edd = ed
            result = re.search(r'(?:^|(?<=\s))' + edd + r'(?=\s|$)', string)
            if result:
                required_edu.append(edu_dic[ed])
        list_edu.append(required_edu)

        # Major
        for maj in major_lower :
            if any(x in maj for x in ['+', '#', '.']):
                majj = re.escape(maj)
            else:
                majj = maj
            result = re.search(r'(?:^|(?<=\s))' + majj + r'(?=\s|$)', string)
            if result:
                required_major.append(major_dic[maj])
        list_major.append(required_major)

        # Key Words
        for key in keywords_lower :
            if any(x in key for x in ['+', '#', '.']):
                keyy = re.escape(key)
            else:
                keyy = key
            result = re.search(r'(?:^|(?<=\s))' + keyy + r'(?=\s|$)', string)
            if result:
                required_keywords.append(keywords_dic[key])
        list_keywords.append(required_keywords)

        # All text
        words = string.split(' ')
        job_text = set(words) - set(stop_words) # drop stop words
        list_text.append(list(job_text))
    except:
        list_type.append('Forbidden')
        list_skill.append('Forbidden')
        list_edu.append('Forbidden')
        list_major.append('Forbidden')
        list_keywords.append('Forbidden')
        list_text.append('Forbidden')
    print(i)
    
print("Salary")
print(len(list_salary))
print(list_salary)
#print(list_skill)
job_df_indeed['job_type'] = list_type
job_df_indeed['job_skills'] = list_skill
job_df_indeed['job_edu'] = list_edu
job_df_indeed['job_major'] = list_major
job_df_indeed['job_keywords'] = list_keywords
job_df_indeed['job_text'] = list_text
job_df_indeed['job_salary'] = list_salary

cols=['from','date','job_id','job_title','job_company','job_location', 'job_link','job_type', 'job_salary',
        'job_skills', 'job_edu', 'job_major', 'job_keywords','job_text']
job_df_indeed = job_df_indeed[cols]

# print the dimenstion of the dataframe
print(job_df_indeed.shape)

job_df_indeed.to_csv(path)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [63]:
display(job_df_indeed)

Unnamed: 0,from,date,job_id,job_title,job_company,job_location,job_link,job_type,job_salary,job_skills,job_edu,job_major,job_keywords,job_text
0,Indeed,11/10/2018,86e0e9a1dab7acf3,Data Scientist,Flipp,"Toronto, ON",https://ca.indeed.com//viewjob?jk=86e0e9a1dab7...,[],,"[R, SQL, Python, SAS, Tableau, Spark, Scala]",[BS],"[Computer Science, Math, Physics, Machine Lear...",[],"[, recommendations, new, status, bachelor's, t..."
1,Indeed,11/10/2018,5febe1f5e2c2fda4,Data Scientist / Actuary,Munich Re,"Toronto, ON",https://ca.indeed.com//viewjob?jk=5febe1f5e2c2...,[],,"[R, SQL, Python, SAS]",[],"[Computer Science, Statistics, Math, Machine L...","[Big Data, Real Time]","[, made, collaborative, mathematics, thrive, r..."
2,Indeed,11/10/2018,13d57d7182af1b88,Data Scientist- Machine Learning,Amazon.com,"Toronto, ON",https://ca.indeed.com//viewjob?jk=13d57d7182af...,[],,"[R, SQL, Python, SAS, Matlab, Excel, Tableau, ...","[BS, MS]","[Computer Science, Statistics, Math, Physics, ...",[],"[, bachelor's, modelling, mathematics, competi..."
3,Indeed,11/10/2018,8cda68574c7a9e6e,Data Mining Scientist,Huawei Canada,"Markham, ON",https://ca.indeed.com//viewjob?jk=8cda68574c7a...,[],[Permanent],"[Python, SAS]",[],"[Computer Science, Statistics, Math, Physics, ...","[Regression, Classification, Big Data]","[, diverse, new, york, applicants, topic, huaw..."
4,Indeed,11/10/2018,b78085b7525b9c99,Data Scientist- Canadian Celebration of Women ...,Capital One,"Toronto, ON",https://ca.indeed.com//viewjob?jk=b78085b7525b...,[],,"[R, Unix, Linux, SQL, Python, SPSS, SAS, Matla...",[],"[Computer Science, Statistics, Math, Machine L...",[],"[, made, modelling, mathematics, walk, designe..."
5,Indeed,11/10/2018,1500241269ef2103,"Vice President, Data Science & Artificial Inte...",Quantium,"Toronto, ON",https://ca.indeed.com//viewjob?jk=1500241269ef...,[],,[],[],[Artificial Intelligence],[],"[, automated, done, dedicated, close, sell, id..."
6,Indeed,11/10/2018,fd529787329d9b52,"Data Scientist, Finance",Loblaw Companies Limited,"Brampton, ON",https://ca.indeed.com//viewjob?jk=fd529787329d...,[Full-Time],,"[R, SQL, Python, SAS, Azure]",[],"[Computer Science, Statistics, Math, Machine L...",[Big Data],"[, you’ll, new, pride, made, applicants, conti..."
7,Indeed,11/10/2018,643af532812b501e,Data Scientist - Machine Learning,Intact,"Toronto, ON",https://ca.indeed.com//viewjob?jk=643af532812b...,[],,"[R, Python, SAS, Hadoop]",[],"[Computer Science, Statistics, Math, Machine L...","[Regression, Big Data]","[, collaborative, mathematics, algorithmic, ne..."
8,Indeed,11/10/2018,3625131c23ab13f2,Data Scientist,Xylem,"Mississauga, ON",https://ca.indeed.com//viewjob?jk=3625131c23ab...,[],,"[R, SQL, Python, Hadoop, Spark]",[],"[Computer Science, Statistics, Math, Physics, ...",[Big Data],"[, greek, diverse, new, loss, york, take, prid..."
9,Indeed,11/10/2018,5547d39589e97752,Data Scientist - Toronto,Prodigy Game,"Toronto, ON",https://ca.indeed.com//viewjob?jk=5547d39589e9...,[],,"[R, SQL, Python, SAS, AWS]",[],"[Computer Science, Math, Math, Machine Learnin...",[Real Time],"[, new, applicants, similar, can, cross-functi..."
