In [None]:
import requests
import bs4
import re
from bs4 import BeautifulSoup
import math
import pandas as pd
import time
import os
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': [18, 12]})

In [None]:
# Placeholder for a root URL; it is assigned an empty string here.
base_url = ''

# One entry per search term:
#   'title'    - the human-readable job title
#   'urltitle' - the same title with every space replaced by '+'
titles = [
    {'title': job_title, 'urltitle': job_title.replace(' ', '+')}
    for job_title in (
        'data scientist',
        'quantitative analyst',
        'statistician',
        'data analyst',
        'research scientist',
        'machine learning engineer',
        'data engineer',
    )
]

In [None]:
# Make one output folder per job title if it does not already exist.
# os.makedirs also creates the parent 'jobdata' folder on a fresh checkout
# (os.mkdir would raise FileNotFoundError there), and exist_ok=True makes the
# cell safe to re-run.
for term in titles:
    os.makedirs('jobdata/'+term['title'], exist_ok=True)

In [None]:
def storedescription(term, url, timeout=30):
    """Download one job posting and save its description text to disk.

    Requests ``'https://www.indeed.com' + url``, takes the text of the
    ``<div id="jobDescriptionText">`` element and writes it to
    ``jobdata/<term>/<id>.txt``, where ``<id>`` is the part of ``url`` between
    its first and second ``=``.

    This is best-effort: any failure (network error, missing element, a url
    without ``=``, unwritable file) is printed and the function returns
    without raising.

    Args:
        term: Name of the sub-folder of ``jobdata/`` to write into.
        url: Site-relative link of the posting.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        None.
    """
    try:
        r = requests.get('https://www.indeed.com'+url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        description_div = soup.find('div', id="jobDescriptionText")
        if description_div is None:
            # Page came back without a description block; nothing to store.
            print(f'error: no jobDescriptionText element found for {url}')
            return
        description = description_div.get_text()
        # Write utf8 explicitly: the cells that read these files back open
        # them with encoding="utf8", and the platform default may differ.
        with open("jobdata/"+term+'/'+format(url.split('=')[1])+".txt", "w", encoding="utf8") as text_file:
            text_file.write(description)
    except Exception as e:
        # The original bare `except:` never bound `e`, so reporting the error
        # itself raised NameError.
        print(f'error: {e}')

In [None]:
# Accumulators read by later cells; the three lists are appended to together,
# so index i of each describes the same posting.
jobtitles = []
urls = []
jobclass = []

for term in titles:
    startlen = len(urls)

    # Reset for every term: if the count lookup below fails we query zero
    # pages instead of reusing the previous term's page count (or hitting a
    # NameError on the very first term).
    page_count = 0
    try:
        r = requests.get('https://www.indeed.com/jobs?q={}&sort=date&limit=50'.format(term['urltitle']), timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser')
        count_str = soup.find('div', id="searchCountPages").get_text()
        # The total is read from the 4th whitespace-separated token, with
        # thousands separators removed.
        max_results = int(count_str.split()[3].replace(',', ''))

        # calculate how many pages we need to query (50 results per page)
        page_count = math.ceil(max_results/50)
        time.sleep(1)
    except Exception as e:
        print(f'error: {e}')

    print('Querying {} pages for {}'.format(page_count, term['title']))
    for x in range(0, page_count):
        try:
            # `start` is the result offset of the page being requested.
            html = requests.get('https://www.indeed.com/jobs?q={}&sort=date&limit=50&start={}'.format(term['urltitle'], x*50), timeout=30)
            soup = BeautifulSoup(html.text, 'html.parser')
            for link in soup.findAll('a'):
                # Anchors without a title or href are skipped. Indexing
                # attrs[...] directly raised KeyError on them, and the old
                # handler then failed on an unbound `e`, abandoning the page.
                link_title = link.attrs.get('title')
                href = link.attrs.get('href')
                if not link_title or not href:
                    continue
                if term['title'].lower() in link_title.lower():
                    jobtitles.append(link_title)
                    urls.append(href)
                    jobclass.append(term['title'])
                    storedescription(term['title'], href)
            # Pause between result pages.
            time.sleep(1)
        except Exception as e:
            print(f'error: {e}')

    print('.. found {} results for {}'.format(len(urls)-startlen, term['title']))

In [None]:
from collections import Counter
import glob
import json

# Paths of every .txt file sitting one folder below jobdata/.
files = list(glob.iglob("jobdata/*/*.txt"))


def word_count(filename):
    """Count how often each whitespace-separated token occurs in a text file.

    The file is read as utf8, ignoring undecodable bytes. Tokens are split on
    any run of whitespace, so blank lines and repeated spaces do not produce
    empty-string tokens and tab-separated words are counted separately.

    Args:
        filename: Path of the text file to read.

    Returns:
        collections.Counter mapping token -> number of occurrences.
    """
    c = Counter()
    with open(filename, 'r', encoding="utf8", errors='ignore') as f:
        for line in f:
            # split() with no argument drops leading/trailing whitespace and
            # never yields '' (split(' ') did for every extra space).
            c.update(line.split())
    return c
    
# One Counter per description file.
counters = list(map(word_count, files))

# Fold every per-file tally into a single Counter for the whole corpus.
total = Counter()
for per_file_counts in counters:
    total.update(per_file_counts)

# Save the (token, count) pairs, most frequent first, as a JSON array.
with open("wordcount.txt", "w", encoding="utf8", errors='ignore') as text_file:
    json.dump(total.most_common(), text_file)

In [None]:
# Show the 100 most frequent tokens across all description files.
total.most_common(100)

In [None]:
# Number of matching postings collected by the scraping loop.
len(jobtitles)

In [None]:
# One row per collected posting: its title, the search term it matched, and its link.
df = pd.DataFrame(dict(title=jobtitles, jobclass=jobclass, url=urls))

In [None]:
# How many postings share each exact title string.
df['title'].value_counts()

In [None]:
# Number of postings collected per search term.
df['jobclass'].value_counts()

In [None]:
import seaborn as sns
# Bar chart with one bar per search term, its height being the number of postings.
ax = sns.countplot(x="jobclass", data=df)

In [None]:
# Number of posting links collected; the scraping loop appends one link per title.
len(urls)

In [None]:
# Number of posting titles collected (repeats the earlier len(jobtitles) check).
len(jobtitles)

In [None]:
# Read every saved description back from disk, remembering for each one the
# name of the job-title folder it was stored in.
labels = []
description = []

for root, _subdirs, filenames in os.walk('jobdata/'):
    for filename in filenames:
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(root, filename), "r", encoding="utf8", errors='ignore') as handle:
            payload = handle.read()

        # Files holding at most one character are skipped.
        if len(payload) > 1:
            description.append(payload)
            # Second '/'-separated component of the folder path.
            labels.append(root.split('/')[1])

In [None]:
def preprocess(line):
    """Normalise whitespace and word boundaries in one description string.

    Steps, in order:
      1. newline, '[', ']', ':' and tab are each replaced by a space;
      2. a space is inserted after a lowercase letter that is directly
         followed by an uppercase one, and after an uppercase letter that is
         followed by an uppercase-then-lowercase pair (so "SAPSystems"
         becomes "SAP Systems" while "SAP" stays whole);
      3. runs of spaces collapse to a single space;
      4. every '.' not already followed by a space gets one appended.

    Args:
        line: Text to clean.

    Returns:
        The cleaned string.
    """
    to_space = str.maketrans({'\n': ' ', '[': ' ', ']': ' ', ':': ' ', '\t': ' '})
    blanked = line.translate(to_space)

    # Undo words that were glued together without a separator.
    decamelled = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', blanked)

    single_spaced = re.sub(r' +', ' ', decamelled)

    # Sentences that continue straight after a period get a space.
    return re.sub(r'\.(?! )', '. ', single_spaced)

In [None]:
# Peek at one loaded description before preprocessing.
# NOTE(review): index 400 assumes more than 400 descriptions were loaded — confirm.
description[400]

In [None]:
# Replace every loaded description with its cleaned-up form.
description = list(map(preprocess, description))

In [None]:
# Show the same sample after cleaning. `description` was already run through
# preprocess in the previous cell, so this applies preprocess a second time.
preprocess(description[400])

In [None]:
# Number of labelled descriptions loaded from disk.
len(labels)

In [None]:
# Re-apply the default figure size (same value as the import cell) before plotting.
plt.rcParams.update({'figure.figsize': [18, 12]})

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from yellowbrick.text import TSNEVisualizer

# Turn each description into a TF-IDF vector, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')

X = tfidf.fit_transform(description)
y = labels

# Create the visualizer and draw the vectors, coloured by job-title label.
# The vectors are first reduced to 75 components with SVD.
# random_state is fixed because t-SNE is stochastic: without a seed the
# layout differs on every run and the figure cannot be reproduced.
tsne = TSNEVisualizer(decompose_by=75, decompose='svd', colormap='tab20', random_state=42)
tsne.fit(X, y)
tsne.show()