In [1]:
# import libraries

from bs4 import BeautifulSoup # For HTML parsing
import urllib # Website connections
import requests
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
import matplotlib
import nltk
import random

In [4]:
def headers():
    i = random.randint(3,5)
    j=random.randint(40,53)
    x=random.randint(2,13)
    url_base = 'https://www.indeed.ca/jobs?q='
    headers ={
            'User-Agent':'Mozilla/'+str(i)+'.0 (Macintosh; Intel Mac OS X 10.'+str(x)+'; rv:53.0) Gecko/20100101 Firefox/'+str(j)+'.0'
                }
    return headers

def text_cleaner(website):
    '''
    This function just cleans up the raw html so that I can look at it.
    Inputs: a URL to investigate
    Outputs: Cleaned text only
    '''
    try:
        site = requests.get(website, headers = headers()) # Connect to the job posting
    except: 
        return   # Need this in case the website isn't there anymore or some other weird connection problem 

    soup_obj = BeautifulSoup(site.text, "html.parser") # Get the html from the site

    for script in soup_obj(["script", "style"]):
        script.extract() # Remove these two elements from the BS4 object

    text = soup_obj.get_text() # Get the text from this
    lines = (line.strip() for line in text.splitlines()) # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each

    def chunk_space(chunk):
        chunk_out = chunk + ' ' # Need to fix spacing issue
        return chunk_out  

    text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8') # Get rid of all blank lines and ends of line

    # Now clean out all of the unicode junk (this line works great!!!)
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore') # Need this as some websites aren't formatted
    except:                                                            # in a way that this works, can occasionally throw
        return                                                         # an exception

    text = re.sub(r"[^a-zA-Z.+3]"," ", str(text))  # Now get rid of any terms that aren't words (include 3 for d3.js)
                                                # Also include + for C++

    text = text.lower().split()  # Go to lower case and split them apart
    stop_words = set(stopwords.words("english")) # Filter out any stop words
    text = [w for w in text if not w in stop_words]
    text = list(set(text)) # Last, just get the set of these. Ignore counts (we are just looking at whether a term existed
                            # or not on the website)
    return text

# note the function itself doesn't return anything if you run this cell




In [5]:

def skills_info_test(city = None, state = None, job_title = None):
    final_job = str(job_title)
    columns = ["job_title","job_link","job_description","company_name","location","summary","salary"]
    df = pd.DataFrame(columns = columns)
    num = (len-(df)+1)
    
    for start in range(0,50,10):
        
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
        if city is not None:
            final_city = city.split() 
            final_city = '+'.join(word for word in final_city)
            final_site_list = ['http://ca.indeed.com/jobs?q=', final_job, '&l=', final_city,
                    '%2C+', state,'&start=',str(start)] # Join all of our strings together so that indeed will search correctly
        else:
            final_site_list = ['http://ca.indeed.com/jobs?q="', final_job, '"','&start=', str(start)]

        final_site = ''.join(final_site_list) # Merge the html address together into one string
        print(final_site)

        base_url = 'http://ca.indeed.com'
    
        try:
            html = requests.get(final_site, headers = headers()) # Open up the front page of our search first
        except:
            print('That city/state combination did not have any jobs. Exiting . . .') # In case the city is invalid
            return
        
        soup = BeautifulSoup(html.text, 'html.parser') # Get the html from the first page
    
        job_link_area = soup.find(id = 'resultsCol')
        
        #target = BeautifulSoup(html.text, "html.parser") # Get the html from the first page
        #targetElements = target.findAll('div',attrs={'class':'row result'})
        
        
        
        #for elem in targetElements:
            #job_link = "%s%s" % (base_url,elem.find('a').get('href'))
            #job_post.append(job_link)
            
        for div in soup.find_all(name = "div", attrs = {"class":"row"}):
            #specifying row num for index of job posting in dataframe
            job_post = []
            num = (len(df)+1)
            
            #creating an empty list to hold the data for each posting
            
            
        
            # grabbing job title
        
            for a in div.find_all(name = "a", attrs = {"data-tn-element":"jobTitle"}):
                job_post.append(a["title"])
        
            # grabbing job link
            job_descriptions = [] 
            for a in div.find_all(name = "a", attrs = {"data-tn-element":"jobTitle"}):
                job_post.append(str(base_url) + str(a["href"]))
                
                final_description = text_cleaner(str(base_url) + str(a['href']))
                job_post.append(final_description)
                #sleep(0)
             # grabbing detailed description
            
            
            
            #grabbing company name
        
            company = div.find_all(name = "span", attrs = {"class":"company"})
        
            if len(company) > 0:
                for b in company:
                    job_post.append(b.text.strip())
            else:
                sec_try = div.find_all(name = "span", attrs = {"class":"result-link-source"})
            
                for span in sec_try:
                    job_post.append(span.text)
                
            # grabbing location name
        
            c = div.findAll('span',attrs = {'class':'location'})
        
            for span in c:
            
                job_post.append(span.text)
        
            #grabing summary text
        
            d = div.findAll('span', attrs = {'class':'summary'})
            for span in d:
                job_post.append(span.text.strip())
                
           
   
            # grabbing salary
    
            try:
                job_post.append(div.find('nobr').text)
        
            except:
                try:
                    div_2 = div.find(name = 'div', attrs = {"class":"sjcl"})
                    div_3 = div_2.find("div")
                    job_post.append(div_3.text.strip())
                except:
                    job_post.append("Nothing_Found")
                
            #appending list of job post to a data frame at index num
        
            df.loc[num] = job_post
        
    
    
    return(df)
        
        
    
        
    
    
    

In [6]:
pd.set_option('display.max_colwidth', -1)
df1 = skills_info_test(city = 'Toronto', state = 'ON', job_title = "Data+Scientist")

http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=0
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=10
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=20
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=30
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=40


In [75]:
df1

Unnamed: 0,job_title,job_link,job_description,company_name,location,summary,salary
1,Data Scientist,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYjDukEP1GuEvFrqCRDv7b48Viu2RdIktP-UDr_L8UZAdyKnZlhUVhxNyLgD6xJIhpJR30Zl_8hYjf9RelaBwXqQ6KznIqPOMldHNT7Sud0hkcKKImjXQbkv9jEnPk3YbONbBnYxK7W1ooHPQkkdgRpefkfaArwHsGpssx5eAzR2pTn5TuhROh4usVTzrKLRTkjmssNa7hSbx_kotCgO_7AZPmaZQHagzxzn8IOGeDQwsWrPWUQwCR1RcSCrlh5sQjx5hwC1OULU_X59LaCwYwEfc02lThA1q49K_zWO47NFTmnUI1iIt4ZFxfI8Lf1lYCe5iK3J9FlL6M265E9dXp0GuOrqchUmoZxsKtngZAOn6B3DX0BqbGbPBcUwL0DRXbOqj0RY6HZzDM9Q_9RLeesvzk-z4FX9zLhXiKgOzArfduw7TQnLkOvJmBkuSJF_HTz75ybMvOPK2zjPV857IaY7zWeiVbNnaga8NyM577nkqw==&p=1&sk=&fvj=0,"[top, twitter, jobs, data, tube, window, royal, security, linkedin, accessibility, facebook, canada, backtotop, rbc, toronto, link, new, legal, submit, menu, careers, view, opens, general, instagram, rbc.com, b, icon, ontario, website, scientist, bank, technology, application, privacy, mobile]",RBC,"Toronto, ON","Big Data Technologies:. The Data Scientist will design, implement and deliver algorithmic, model-driven, statistical or machine learning solutions in...",Nothing_Found
2,Data Developer (14 month contract),http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0CeJ2NNOGKC6-vVDIZc6zLWOhawF6DrYB26fLHY4IsQgCPM7i81DLkcntHhEyxf6D7-0cIcLAj1syhlHicZtM4S2NbTpcAu6OA1bfsS5WPkXQR7gQBwH2bIYZ5ropSyPohgV8NuINJIDksgB9rLQRehDLYT5fE4yy_KgeJVkoqmOMjrB9Zj6eJ-Cys32fnDYWnk_7mgJLnDIbnnvmAHyDLAvR-SfRAErz9Gknh4Vf9xQV3tLatB8LfjNcurdnUEI1585mHznKeFkXKXWgX355CfOzC8bAm-ao9URJbrynlLy1KViGBcpuXrqU7NlGmKPiItNUaPkU1LFh4uJchyuO3Sf6aaicWZWrB-68jOrLgfM1r7AIPtEcqDTiIZWdfjtwF34Y_1ELrxQKqKs_9wkbj1OgJVK43I_lYg6MK1EV_caIgSrmE49Uie84FqiVA7butoOq9BBEA2aS_CqufyHP3WSBCsowuPjfyp4oah9oTl_4QOr_jn2J3cov0eAH0Z1f18GTy4Y_Nf_S4we4MvVfpKUuNuwtjIax5yYkbWdr8Rkg==&p=2&sk=&fvj=0,"[inthis, inactivity.job, primary, set, sign, beginning, jobs, content, ok, work, signed, section.return, teamsgo, previous, loyaltyonelearn, jobpagebasic, position, organization, friend, candidate, main, accessibility, searchmy, disconnect, home, pageprintable, job, locations, refer, page, loyaltyone, reset, matching, advanced, candidatesubmit, profile, inactivity., search, saved, automatically, options, minutes, posting, terms, co., jobrefer, minutes.you, b, use, timer, loyaltyoneour, friendrefer, formatreturn, submissions, section., minutes.click, cart, schedule, searches, out.this, searchcareers, opportunitieslife, account, session, end, type, privacy, location, service]",LoyaltyOne,"Toronto, ON",Alliance Data participates in E-Verify. Work with large volumes of data; He/she will work closely with Data Scientists and Business Consultants to understand...,Nothing_Found
3,"Data Scientist, AML Risk Analytics",http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYhJscKp_ZnISfkbEkZGiHl21sOa7pR3ffcOPduTls4AZzycp0PLfP26XYs4Gp7vrc9hliZ9NZHCYJsppDHjJJfxuhdxRqLpPVNGrOdmWbNZ95Ni7tSno0oRviaYK8gRN5TvotaTHgfva5Urfj48VBWg65JTlNI9pfgcEFpxggsGB0jMFaugksTSneFmPoT8Ny6a79XBtNAkUGtZafI0iLwbOHEVwPV_VSBsdqakVYHn7YHEOy23iuTBriRjV3pN0zzrv4tJoAcTU3XOP04jIue5QRZ7QJ2tE1G4OsJAATi-XrATG9juK5whuL6Aii4I9G5j-6ARodiSWvrMsfGMA9wvCeW4De_tIBTihBdVBWW3nL8HUKOIpW3eA3P8Hv9c7PowCpHNTYCpVT6d8Za2BtzwLTxqWLEliYrLqGHXEctSmauPpPRqUWc_fi5pbnFhyV-LPswtD5oE5FAMwdBeisR_UgkllJkC1JHla7tCYeDotCBWbb_dHBHW-knJGs7yJQ0=&p=3&sk=&fvj=0,"[top, twitter, jobs, data, tube, window, royal, security, linkedin, accessibility, aml, facebook, canada, backtotop, rbc, management, toronto, link, new, legal, submit, menu, careers, view, opens, general, instagram, rbc.com, b, risk, icon, ontario, website, scientist, bank, application, privacy, mobile, analytics]",RBC,"Toronto, ON","Big Data Technologies:. Data Science Innovation Lead. Working throughout the pipeline from defining a research question, collecting data, analyzing data,...",Nothing_Found
4,Data Scientist,http://ca.indeed.com/rc/clk?jk=b67970bf908fd14e&fccid=9e215d88a6b33622,"[borders, harbor, accept, vendors, previous, line, architects, project, audit, linkedin, refinement, ourcookie, sr., consensus, envision, learning, requisition, statistical, additionally, standards, multiple, targeted, goals, influence, notices, discover, united, may, buildinggood, understanding, responsive, organizational, deloittes, requirements, detail, cookies, mahout, cart., programs, negotiation, advanced, planning, terms, services, query, machine, large, consulting, effortscoordinates, states, entities., capabilities, rational, position, across, sas, illinois, add, connected, advisory, locations, solving, delete, careers, key, igniting, understand, depth, within, university, alumni, please, regulations, network, uses, practical, required, maturing, information., adhere, patterns, appropriate, one, canada, seeking, matter, trusted, office, solutions, provides, product, advice, limited, comprising, plusexperience, transaction, etc.knowledge, sets, button, strong, ...]",Deloitte,"Greater Toronto Area, ON","Additionally, the Data Scientist:. The Global Technology Services-Business Services Group is seeking a Data Scientist....",Nothing_Found
5,Data Scientist,http://ca.indeed.com/company/First-Derivatives/jobs/Data-Scientist-9239b938e4ae81c5?fccid=df7770eb169ac252,"[www.firstderivatives.com, unique, technologies, developer, scopes, kx, applying, often, digital, recruitment, instructions, future, graduated, big, chief, findings, finance, solve, selected, prowess, challenges.we, inquisitive, analytics, expenses, quantitative, offers, beneficial, toronto, marketing, mathematics, easily, intelerad, timerequired, revenue.the, middle, institutions, markets, day, international, front, save, company, senior, b, tab, multi, ., gpa, pharma, time, information, insight, emphasis, platforms.the, website, using, similar, role, amounts, career, graduate, nature, operations., business, cities., type, power, may, privacy, mobile, groceries, mission, develops, searchclose, software, businesses, find, last, typical, michael, applied, ago, indeed.com, activities, trends, like, explore, mind, detect, prjob, acumen., job, vast, cookies, science, disciplineexcellent, citizenship, management, implements, technical, ...]",First Derivatives,"Toronto, ON","Data Scientists explore vast amounts of information to detect patterns, trends and correlations that enable businesses to improve their products, services and...",Nothing_Found
6,Data Scientist,http://ca.indeed.com/rc/clk?jk=5b48eb85dbc2c0f4&fccid=3002307a9e5b4706,"[line, scotiabankers, introduction, linkedin, collaborative, deploy, hands, window., insights., sits, predictability, image, build, culture, requisition, learning, rapid, datastores, categories, scientific, looking, banks, industry, accounts., frameworks, scotiabank.com, prepare, community, united, saas, responsive, organizational, nearly, support, alert, leveraging, machine, models, systems., large, working..., process, identifying, states, keyword, language, groups, speaking, collecting, locations, everyone., careers, roll, generation, candidates, diverse, create, dc, within, please, talent, cleaning, required, sign, continuing, experts, level, transform, one, work., existing, canada, matter, synchronize, provides, product, based, opportunity, scotiabank, inclusive, sets, postings, way, strong, provide, maintaining, reimagining, scientist, open, location, science, tidal, influencers, algorithms, english, agile, deployment, scotiabanks, creating, knowledge, ...]",Scotiabank,"Toronto, ON","Experience cleaning, transforming and visualizing large data sets working with various data formats (e.g....",Nothing_Found
7,Data Scientist,http://ca.indeed.com/rc/clk?jk=38b4524db4432931&fccid=3a429036b3725efa,"[managementbank, relationsaccessibilitysupportparticuliers, bourse, locationother, bancaires, avec, aux, ok, previous, feeschanges, line, personnelmarge, servicestravel, crditfaire, maisonplanification, rate, lines, ligneplacementscertificats, main, bancairesdes, emploi, home, accountschequingsavingspremium, accountsmart, pagebasic, financireplanification, faqsapple, joindrecourrieltlphoneprendre, matching, membres, readilineretro, offersmortgage, profile, descriptionqualificationsjobprimary, profession, achat, imptportefeuille, cardssmall, minutes, au, investisseursaccessibilitsoutien, onlineinvestmentsgicsmutual, particulierspatrimoineentreprisemarchs, builder, directed, b, entreprisescarte, bmoprivacylegalsecuritycdic, bmotauxnous, assurance, friendrefer, harris, programmes, section., minutes.click, marketsbusinesswealth, activits, insurance, buy, homeestate, capitauxentreprisesparticulierscomptes, spcialiste, veterans, ngociables, bmocomparez, lost, debttravel, chquescompte, mdecine, dune, referral, branchreport, business, une, canadaservices, planningplan, bmoratescontact, creditmedical, taux, bmocapital, insurancefind, paycontinuer, go, collect, en, pargne, carte, dental, ou, un, retraitepargne, appointmentget, harrismenu, calculatorspecial, marges, cpg, content, securitytravel, crditcartes, accumuler, ...]",BMO Financial Group,"Toronto, ON",Supporting the data infrastructure and data repository build for analytical needs. We are looking for a Data Scientist that will help us discover the...,Nothing_Found
8,Associate Data Scientist,http://ca.indeed.com/company/Nielsen/jobs/Associate-Data-Scientist-fcd2ef8e148fbdef?fccid=683f563b5cc25b70,"[applying, explores, analysisaccuracy, recruitment, staff, instructions, answers, attention, integrity, project, consumer, let, frequently, sheets, workforce., toronto, mathematics, easily, closely, timerequired, sql, logic, analytical, require, curiosity, supportcompile, company, statistical, save, b, senior, comprehensive, processing, access, industry, building, presentations, asked, basic, consumers, standards, school, extraction, docsproficiency, responsibilities, contacted.job, google, nielsen, type, and..., power, may, privacy, global, establishing, yearsproficiency, processprovide, searchclose, understanding, find, nlsn, watch, applied, ago, indeed.com, like, pickering, job, documentation, cookies, panel, client, management, estimationassist, inc, support, degreebroad, apply, advanced, search, company., full, scheme, offer, encourage, statistics, terms, operations, equivalent, help, sample, keywords, workopolis, computer, accommodation, collaborator, associate, resume, material, include, ...]",The Nielsen Company,"Markham, ON","Assure the integrity of project data, including data extraction, storage, manipulation, processing and analysis. Exceptional aptitude for data analysis....",Nothing_Found
9,Data Scientist: Advanced Analytics,http://ca.indeed.com/rc/clk?jk=fc0167706a14059e&fccid=de71a49b535e21cb,"[aucune, firstname, afficher, dfiler, dj, pas, emploi, limitexceededmessage, pourvu., tes, candidatureafficher, vedette, avez, envoye., messages, oq.value, refineresultstext, b, cherchez, welcometitle.replace, autre, postul., dynamicstrings.link, limite, candidaturevrifier, ressayez., une, en, jusqu, ou, lastname, candidatures, poste., bienvenue, expire, slection, retour, vrifier, site, errormessagejobtitle, intress, enregistrer, search, seule, de, offre, recommandation, recommander, le, jobdetailsbuttontext, pour, emplois., les, poste, haut, null, est, l, tgsettings.sorttextlabel, ce, recherche, effacer, display, courriel., votre, message, oactivefacet.description, bien, clearall, chargement, profiledetails.firstname, nonloggedinwelcometitle, tgsettings.searchresultsapplytojobbuttontext, consulter, .replace, selectjobs, dsol..., sendtofriendbuttontext, facet.description, veuillez, op.optionname, tgsettings.showmoretext, tgsettings.jobdetailsapplybuttontext, permise, candidature, offres, postul, compte, si, candidatures., notre, envoyer, notifications, enregistre, jobsheading, profiledetails.lastname, impossible, cours, filtersappliedtext, statut, ...]",IBM,"Toronto, ON",The Data Scientist will be familiar with key internal and external data sources and how they are gathered and retrieved;...,Nothing_Found
10,Data Scientist: Integrity,http://ca.indeed.com/rc/clk?jk=1db09f93f750a3d2&fccid=a7ae3b18474ceca5,"[powered, forward, often, processes, audit, collaborative, effectively, hands, descriptionwho, popularity, small, providers., +, closely, build, culture, learning, instrumental, looking, building, standards, multiple, space, highly, implied, career, amounts, incentive, fast., content, saas, meanings, ensuring, detail, eye, support, fraudulent, scale, services, excellent, equivalent, identifies, tell, machine, order, large, identifying, communication, years., perks, closed, written, receives, solving, judgment, sharp, constructive, involved, mixpanel, keen, four, key, spending, diverse, create, range, optimize, analyze, improve, homestars, story, million, darts, recently, one, work., canada, office, trusted, platform, seeking, annually, interprets, degree, product, based, sociable., space., opportunity, increased, strong, provide, connections, score, scientist, synthesizes, december, benefitsthe, science, initiative, ...]",HomeStars,"Toronto, ON",HomeStars is seeking a data scientist to capture insights and measure all released developments. A keen eye for implied meanings in data....,Nothing_Found


In [103]:
def test(city = None, state = None, job_title = None):
    final_job = str(job_title)
    df = pd.DataFrame() 
    
    for page in range(1,6):
        page = (page-1) * 10  
    
    # Make sure the city specified works properly if it has more than one word (such as San Francisco)
        if city is not None:
            final_city = city.split() 
            final_city = '+'.join(word for word in final_city)
            final_site_list = ['http://ca.indeed.com/jobs?q=', final_job, '&l=', final_city,
                    '%2C+', state,'&start=',str(page)] # Join all of our strings together so that indeed will search correctly
        else:
            final_site_list = ['http://ca.indeed.com/jobs?q="', final_job, '"','&start=', str(page)]

        final_site = ''.join(final_site_list) # Merge the html address together into one string
        print(final_site)

        base_url = 'http://ca.indeed.com'
    
        try:
            html = requests.get(final_site, headers = headers()) # Open up the front page of our search first
        except:
            print('That city/state combination did not have any jobs. Exiting . . .') # In case the city is invalid
            return
        
        soup = BeautifulSoup(html.text, 'html.parser') # Get the html from the first page
        targetElements = soup.findAll('div', attrs={'class' : 'row result'})
        
        
        for elem in targetElements:
            job_link = "%s%s" % (base_url,elem.find('a').get('href'))
            df = df.append({"job_link":job_link}, ignore_index=True)
        
    return df


In [104]:
l = test(city = 'Toronto', state = 'ON', job_title = "Data+Scientist")

http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=0
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=10
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=20
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=30
http://ca.indeed.com/jobs?q=Data+Scientist&l=Toronto%2C+ON&start=40


In [105]:
l

Unnamed: 0,job_link
0,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYhJW1NEeVDe0qo0ac2-_gj9SlseVbeUS-HSYJBulSeV4oh49Ft4nqtv9bhswSmmIXQ2I2QRjvGF_eaVJJWNhgMZwcI-fJYdcvvNwVrpqB-5WFpQ5fPVm_VI2GnWRQ1a4AqEgvsXCQEOEsVoYNxOY2x6ChusgnKFmlJREb0ZCPMTiqW1LmObpjINAbOG2W5nVKeASDAOjq3O69nLrLNfBuiAWUACMcr5AIhJ_RT4p-YnC6SGmcXeBoCX25GBP9PZZJ8Y-ljTgzgNYA8qlRpKD9hqofNN5tCMK0vV6VIYDyPG_G3WEaO1ctKd3ddd5LjHfyvCfjZgzswxN5hq5cdTB1vDl7AwTt1dFu9tabHkVhBMroUyoGwNfA-Dic1FGANm1Aa_drX3YO6YSqEfaKUZ73WlU5GRqSdWyi6NTD3lfRO5vQx1fZQSPYnXGdCfXaaDMLvTZ1B5lOxbB_fBsi4j94KzGEo27p9dNtpe0Ambc5QBLg==&p=1&sk=&fvj=0
1,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYhJscKp_ZnISfkbEkZGiHl21sOa7pR3ffcOPduTls4AZzycp0PLfP26XYs4Gp7vrc9hliZ9NZHCYNtWnmOFj2w65xtRVvTvZmnqwDJYiU8euc_RduCPahfyb9eBU_I6Z8Obqax2gJ9yrgsTpp_fXisLfwWJ5pqLazgprNCFNfqH0vdEeBjqTSMO2iQfYwndwa9IBWIN5HEWTelq0ATMh2vEUV5KsIM2WzNn7dUi2uJZFr-w7bE3pEBj4Fd7kfcXkI3cwqii2QA0zkTAql3M5LNaVBN4ggGxaX4rEmudrBz-RcRMeyyla_QrlkJsNGhotDxY29Ne10YggHDyZhLMa-BOt2PZhLNSfHAfC6VmbDbDflhWzlBt162uEgnfMZ1kjJDlkQO4x6AbMFi4HAnTPQn4UNX7G6fCV0Ut4fkSoBFMuieipCT4Zvi7BT9flVkh1pOfQ_zeIK3GYkK8sA5yEC5hwze1H-RIXHEYDUmcnngwvsZbdDLWhzPqU3R1AZcM9l4=&p=2&sk=&fvj=0
2,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Bkpu8HCcIg6w2Px_-g0kqv0N6j2AyswEWsTC7b_CkCFhBEdcZ_bx7kG10IY6QbnWr17ZAh8qcSjvytIG9mHGjtrcWA4Cj6_x3ciLiT20FENl9bmu9mwkI3tjjajMrHkMUugv9XZUH4KcKx_nOpom8VJnIQx_X52IUfb9RqLwKTw22W3P5zz5QQYr6dWLt5Uz_2bSOX7Vc5TgKZ00X3yUoMpFu552krnMOY0NUwulX3H-SkktpqO9s4pfgraWj-NSLZ1eY6gqQgl2PaR8tYFwIQeksz0-SoxIJzfksZ5OA8s5_OGW9b7_MFKU4ocjTPJh0UhdEqdoiyCllh2Jfy3wdpmB4DRCkQcEkOAfRM0FXcE4Nh6Wy8NuKAyvrJJJymXa_7BMxggyK_4rDda6o7JBWUnHff7oqu8ssTaudg45ULbcavCCYcYbeVDvdGD2WELFYhRUyuS7nsfSZ2gpHgc29fus2etN0GzulT2d5QwHbfHtLdFncoQYKlrHkoVyZ-IlkT4AJNDBlaCqe_PzPD1XWqTpCTg8EMy7Eq6tNQyH1nK6-bOJ05yciEwxvwqJtcOL1z9_DkYKaGV-2F7EdBwD0iu2NowYHSD_I=&p=4&sk=&fvj=0
3,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BlF99n-nNBr1TbUFPQlXHtlU_xPAKaoVNBEsTN_g4pwpKD7IeBJF73MCeEjZIH8pC4lXS27h3Q2agxWKWr_CVhFqxqIh1aMRGkEHlGtWwhVUaXcm8O2iB23awlXoD7jN9kl_hLy4qrthi6VOXHpB1vnPHGRXBzg7g8fev2crl7s_qMbv-p1vkVp9RGlRwLVLXlMRvsp9DVLcpBnUG-wCgFUowhqAxSf8XGxkKte3rtijOI8ZZqdPi9VUyfMr98xgt7vFj6J8NyUWMChIO074yyEPfm2DfXwPxjQJF-GUsASn15nex2E95krfShzfzk_FJk1Mb45teL1HiG-AcIgoQt8iqTANDAf70c2hacuHGmgxKoTUsVSLBJDrVVmsrLtAVlxUR9UsqI3UX1H5YhawqOqOr9NmRTGiKgnX-LAcvU-0VG_7TLs0Ga&p=1&sk=&fvj=1
4,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYjJZyIy5K8BzmrJC4CigCASvr8Uf-GW3oeVvU76dLY5am3YZj-N7xsNAohbe3eY90tUAMPR_TrAtQ6t0tf-oX-05zvgWJet7SAf0l-THXYvZCNYeh2vMXDA4h7W8GbNj1HMT8EYwzWWqhahZm9r7ips0EmKDI17Bfggy5gXNe-9ZtmPBvSHgrhO76WWMxQdI6XsMxwidpXTUsjYjXMRz6YSyLQ0eppJ1dKxHnfvFYcXWrYw6_QnUsosNF61tBYVTvGwyHkTInXfw__zLNv3EmJ651WjdenbFuSkq_7GNCshxTJbtfdSuKMssVnlyVfwSlhdfksPqwXKse15k3A2yXaAbq9TZ1SSsew1_VvfR9BF-dfd41ZUK2giZB-RbS5SRF41SUsYUsRhj8Qm3-HgWu17gXZtH-zx9YYoZDROXFvw3YBVJ5DT8iXWMl0-vVZtnqNdg3p5nkLMU_C7_fHPxqObEzGFnPPGBWnEjglEeS6OiPMfXxyYJwwv9uOYaoZ9mPc=&p=3&sk=&fvj=0
5,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Bkpu8HCcIg6w2Px_-g0kqv0N6j2AyswEWsTC7b_CkCFhBEdcZ_bx7kG10IY6QbnWr17ZAh8qcSjvytIG9mHGjtrcWA4Cj6_x3ciLiT20FENl9bmu9mwkI3tjjajMrHkMUugv9XZUH4KRtm6T0m6O2GwKxOuAmOKuYvRSvgdsu5an2vvC6T-O94RaNNis7ek3wdfeM0A-m-BTOteUhvwp33_OwFAqc4ZzY2gZlLNWNLj693oa8ENYugRoRqBEUWCTTys75OEEhhLN2l_bzKwmKL7Pp1RdWESMB5WpER0BTCyH7U4X-WoUMNgXxLTNWQPaJrJc3NfjadUJvXa1AQet0oUKY5Ljur-iSHAURO2Uy4y6D4Bhuve_bt1pSMQd00UuqZio7vZnxo5slsFE0TbxJwaGV18J7JCkkftAosw7uONTQkSR281TjeOfABEAjzGiM3nzoTrnr13bD19n9LDYZApvRRDg-QFThsQZTId73_wpOTgs3XJrV6osKbjkHau6CxEd0kFD7YWMa62t_5PT656kM8iaZWk8k5fXi9fJQulY6iUhJL228cCjk299sozndqfizIcXZ69t1mveclIihr7woRxkHa9Y4=&p=4&sk=&fvj=0
6,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BlF99n-nNBr1TbUFPQlXHtlU_xPAKaoVNBEsTN_g4pwpKD7IeBJF73MCeEjZIH8pC4lXS27h3Q2agxWKWr_CVhFqxqIh1aMRGkEHlGtWwhVUaXcm8O2iB23awlXoD7jN9kl_hLy4qrthi6VOXHpB1vnPHGRXBzg7inChbF-N9vKIx_X8BTwo6W2zCdKuPGtqyJShx2QDCUZbnRtWwmPQYsZaLgcrMN0jR803mEl82bwkEYlpoBrpf_v5qha8K_DUYvgVx-9AwTDVaF6wJqzv1SLTvdYRfdwP7i2D4dPgEVYphVvstdWIZwmaQNE6Hr5_2W_2N_2UlJY4NIWHO7BIuB8B8MTNb6-ojc4icqLxhOREkkL05AONLkd-_czDd9akFRXWxaoTiZZR429NwSIaRVTKt1q4jGXi0cHFHJzh85vrB0F1aqs1Qm&p=1&sk=&fvj=1
7,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Bkpu8HCcIg6w2Px_-g0kqv0N6j2AyswEWsTC7b_CkCFhBEdcZ_bx7kG10IY6QbnWr17ZAh8qcSjvytIG9mHGjtrcWA4Cj6_x3ciLiT20FENl9bmu9mwkI3tjjajMrHkMUugv9XZUH4KR3sEH0AMtBIoXV3kUe4Sml5-_bCOkBsxnYGzIsHUwQX9gXFg1riqeH3th5XaXgQzLD-XUJJHW9toMyJ6GSPlt0WXxpg89ZBpocDvF_T9TaNM9pbHI-wBBlcHQkN5N34Q5ogl-UBUPonzRyD57dZB1g3IpwaXRus_rW-48ebZb7aJAqipXwPrLHmg6iuMvXXrHV4nyiBRCD9kwnvlJIzZM5d3Uc3uofLEBb8ize3JZYtVmEGofnfk6ZQiCHwZKV6TyD2bzirCVBiv9kujNgdUpDd3F-f2BSEjlp6fAbi9Rrptx7BnH4lOtyjwMUzh1RwUetswHnDCjBn3iX6Izdfh-S12vGi-V8H3wGkMxMkcVZg19bytAgFQgVSQ19o57hoa4VEyFq7nupdg4NCS92hrUc8S90jKzmsrmKMQW0-N9elsCWTf9abtDvlKNemhbZy0pRVthB7HR3MJllBm18Aiuk=&p=3&sk=&fvj=0
8,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYhJscKp_ZnISfkbEkZGiHl21sOa7pR3ffcOPduTls4AZzycp0PLfP26XYs4Gp7vrc9hliZ9NZHCYHvnKvyU6dSxj6v0jER_kz5rSSqZQeEJ3NcOMAVJNX78X4ZyComWiPSJtRMVVZq4TvNC9nYQfM-GHQszFt1Qfg1Qq-fWzA98usRJGcVhyfJlZYC_qLK0bvroeFU9x8Kkkt4w9LBie5KbIPvqR3DvreQZfCsY3RVi_11c5_uLcgKkntXQiTfF2hn_2Mx2qjjFYNybS-QWOag72wkMl26WFX-P31B8URRI5AJUCmQN_dP0kBZoe64ETnK173axyOstYhsazGRPdyU263nmmUXU6Dg8ABQ5fTo5gFr80-GmLl68hMG1UUXC69tL8VZV1k3Qvy4wY_vw76_46Au1WL5BVigHDHnNHXbsxJStILvN-vrtSCSNUo6Y5d3mv7mm9EN1606xtijq0V5gCmmtiWBaLLab3wHsgUZQ4sFa-rO7t9TpmNIgVU-qahA=&p=4&sk=&fvj=0
9,http://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Dl0EE_WhFXbpizkrl8nywi6L-ueye5Z0RVrGddRMF4VcenHcfJ6Sm70jbXDr9BXYhJscKp_ZnISfkbEkZGiHl21sOa7pR3ffcOPduTls4AZzycp0PLfP26XYs4Gp7vrc9hliZ9NZHCYHLMNE6PaeknKHwfFhuGu3K3H3FOotvTlbsS1BmfBb9MGbn0rhb-Y9e2jlwKZ9EYkYiGZnrcxSVYD2GWyEzvvD8JXWxdGwdhqFSsA5xCLWynUS_ghHpIv5zFajzWNXuwy6WcLzVeRBaYA3WFzmjvB0YD3mjs2HvwDZohxwIYNjDrT-J0Dc_QPUzzhMjI4e3Y8CJTzDseEvS7DJ1RGf6nqvGNmls0nrgGDpnJigpjDaBLUBDl7aMmR29SY4CBawnuBJY2GgSA_drluwS8Z8ADVypFjs3m26zTVexJ5xaYBofVLPvwzFirnwrcqiyQ0U91imihZCwl4fjU6M-7pOgYmALSzzvIuATl-7HWWTWhBOa5nyToSokk5qn1XDTH7cLzBBIqkbzGFK3yLbb8BAAg5qXFfZLHP3jw_Fj7DSk_WKpee5wUHoaX9EY=&p=1&sk=&fvj=0


In [8]:
for i in df1["summary"]:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    
    data_words = ["data","science","scientist","engineering","analytics","analysis"]
    
    rev_data = [i for i in tokens if not i in data_words]
    
    # remove stop words from tokens
    stopped_tokens = [i for i in rev_data if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)


In [9]:
dictionary = corpora.Dictionary(texts)

In [13]:
corpus = [dictionary.doc2bow(text) for text in texts]


In [14]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [15]:
print(ldamodel.print_topics(num_topics=3, num_words=20))

SyntaxError: invalid syntax (<ipython-input-15-d7a42d01d1d3>, line 1)

In [52]:
df1('city = None, state = None, job_title = None'):
    

Unnamed: 0,job_title,company_name,location,summary,salary
1,Data Scientist,RBC,"Toronto, ON",Data Science Innovation. Data Science Project ...,Nothing_Found
2,Data Scientist,Quantfury Ltd,"Toronto, ON",Data scientist (Machine Learning Expert). Expe...,Nothing_Found
3,Data Engineer,Morneau Shepell,"Toronto, ON",Implementation of Big Data infrastructure and ...,Nothing_Found
4,Applied Research Scientist,Element AI,"Toronto, ON",Explaining the data audit and data quality con...,Nothing_Found
5,Data Scientist: Advanced Analytics,IBM,"Toronto, ON",The Data Scientist will be familiar with key i...,Nothing_Found
6,Data Analyst,Manulife Financial,"Toronto, ON",Work with Data Scientists to transform busines...,Nothing_Found
7,Data Scientist,Scotiabank,"Toronto, ON","Experience cleaning, transforming and visualiz...",Nothing_Found
8,Data Scientist,Veritaaq,"Greater Toronto Area, ON",We are looking for an exceptional data scienti...,Nothing_Found
9,"Research Scientist, Google Brain (Canada)",Google,"Toronto, ON",From creating experiments and prototyping impl...,Nothing_Found
10,Data Analyst,Decision Resources Group,"Toronto, ON","In this role, you will work with a team of 6 d...",Nothing_Found


In [None]:
from