# IS434: Social Analytics & Apps
### SkillsFuture Industry Scraper
---

In [59]:
import pandas as pd
import os
from os.path import isfile, join
import aspose.words as aw
pd.set_option('display.max_rows', None)

## Clean file names, convert to PDF to HTML

In [86]:
# fix file names after importing - to be run once only after scraping

def clean_title( file_name ):
    temp = file_name.replace("Skills Map_", "")
    temp = temp.replace("(GSC Top 5)", "")
    temp = temp.split(" ")
    temp = "-".join(temp)
    temp = temp.replace("-.pdf", ".pdf")
    new_name = temp.lower()
    return new_name
    
# Retrieve each sector
sector_dir_list = [ dir for dir in os.listdir("./") if dir.endswith('_Sector') ]

for sector_dir in sector_dir_list:
    # Retrieve all subsectors in a sector
    subsector_dir_list = [ dir for dir in os.listdir(f"./{sector_dir}") ]

    for subsector_dir in subsector_dir_list:
        # Retrieve all pdf files in a subsector
        pdf_files = [ file for file in os.listdir(f"./{sector_dir}/{subsector_dir}/pdf") if file.endswith(".pdf") ]

        # Clean each file's title and rename the file
        for file in pdf_files:
            # Clean file title and rename file
            new_name = clean_title( file )
            os.rename(f"./{sector_dir}/{subsector_dir}/pdf/{file}", f"./{sector_dir}/{subsector_dir}/pdf/{new_name}")
            
            # Save PDF as HTML document
            doc = aw.Document(f"./{sector_dir}/{subsector_dir}/pdf/{new_name}")
            doc.save(f"./{sector_dir}/{subsector_dir}/html/{new_name[:-4]}.html")

        # Remove images generated from the aspose package (only workaround)
        for file in os.listdir(f'./{sector_dir}/{subsector_dir}/html/'):
            if file.endswith('.png'):
                os.remove(f'./{sector_dir}/{subsector_dir}/html/{file}')

print('Done!')

Done!


### Scrape the HTML for Job Role + Job Skills

In [87]:
# Helper function to extract job role
def retrieve_job_role( span_list ):
    for i in range( len(span_list) ):
        span_text = span_list[i].text.strip()

        if "Job Role" == span_text:
            return span_list[i+1].text.strip()

In [88]:
# Helper function to extract skills
def retrieve_skills( span_list ):
    end_index = 0
    start_index = 0
    
    # Get index of last occurrence of skill
    for i in range( len(span_list)-1, 0, -1 ):
        text = span_list[i].text
        if "Level" in text:
            end_index = i
            break

    # Get index of first occurrence of skill
    for i in range( len(span_list)-1 ):
        text = span_list[i].text
        if "Generic Skills and Competencies" in text:
            start_index = i+1
            break
    
    # Get the subset of <span> that have the skills
    skillsets = [ span.text for span in span_list[start_index:end_index+1] ]

    # Remove '\xa0' occurrences in list from pdf to html conversion
    skillsets = [ skill for skill in skillsets if skill.strip() not in "\xa0"]
    
    # Store as list of skills and return
    skills_list = {}
    
    for i in range(1, len(skillsets), 2):
        skill_name = skillsets[i-1].strip()
        skill_level = skillsets[i].strip()

        if "," in skill_level:
            skill_level = skill_level.split(", ")[0]

        skills_list[skill_name] = skill_level
    return skills_list

In [95]:
from bs4 import BeautifulSoup

jobs_dict = {}

# Retrieve each sector
sector_dir_list = [ dir for dir in os.listdir("./") if dir.endswith('_Sector') ]

for sector_dir in sector_dir_list:
    # Retrieve all subsectors in a sector
    subsector_dir_list = [ dir for dir in os.listdir(f"./{sector_dir}") ]

    for subsector_dir in subsector_dir_list:
        # Retrieve all html files in a subsector
        html_files = [ file for file in os.listdir(f"./{sector_dir}/{subsector_dir}/html") if file.endswith(".html") ]

        # Scrape each file
        for file in html_files:
            with open( f"./{sector_dir}/{subsector_dir}/html/{file}" ) as fp:
                soup = BeautifulSoup(fp, 'html.parser')
                all_spans = soup.find_all('span')
                
                # Retrieve job role
                job_role = retrieve_job_role( all_spans )

                # Retrieve job's skills and competencies
                skills_list = retrieve_skills( all_spans )
                
                # Create key-value pair -> job_role, skill_list
                print(sector_dir, subsector_dir[3:], job_role, skills_list, end = "\n\n")
                break
        break
    break
    

# Remember to convert Beginner = Level 1, Intermediate = Level 2, Advanced = Level 3

ICT_Sector Strategy and Governance Associate Business Analyst {'Business Environment Analysis': 'Level 2', 'Problem Solving': 'Intermediate', 'Business Needs Analysis': 'Level 2', 'Lifelong Learning': 'Intermediate', 'Business Requirements Mapping': 'Level 3', 'Transdisciplinary Thinking': 'Intermediate', 'Change Management': 'Level 3', 'Virtual Collaboration': 'Intermediate', 'Data Visualisation': 'Level 3', 'Decision Making': 'Intermediate', 'Partnership Management': 'Level 3', 'Competencies': 'Process Improvement and Optimisation', 'Level 3': 'Technical Sales Support', 'Level 2': 'Stakeholder Management'}



In [78]:
jobs_dict['Data Analyst/Associate Data Engineer']

{'Budgeting': 'Level 3',
 'Leadership': 'Intermediate',
 'Business Innovation': 'Level 4',
 'Developing People': 'Intermediate',
 'Business Needs Analysis': 'Level 2',
 'Computational Thinking': 'Intermediate',
 'Business Performance Management': 'Level 3',
 'Communication': 'Intermediate',
 'Data Analytics': 'Level 2',
 'Creative Thinking': 'Intermediate',
 'Data Engineering': 'Level 2',
 'Data Ethics': 'Level 3',
 'Data Visualisation': 'Level 3',
 'Database Administration': 'Level 2',
 'Design Thinking Practice': 'Level 3',
 'Networking': 'Level 3',
 'Project Management': 'Level 3',
 'Stakeholder Management': 'Level 2'}

In [65]:
# # DO BE DONE
#         # Retrieve job's key task
#         all_td = soup.find_all('td')
#         start_index = 0
#         end_index = 0

#         # Find range of <td> tags that contain job description information
#         for i in range( len(all_td) ):
#             row_td = all_td[i]

# #             if "Performance Expectations" in row_td.p:
# #                 print(row_td.p)
#             print( row_td.p.string )

# #             for span in p_tag:
# #                 span_text = span.text.strip()
# #                 if "Performance Expectations" == span_text:
# #                     start_index = i + 1

# #                 if "Skills and Competencies" == span_text:
# #                     end_index = i

#         # Extract <p> tags that contain job description
#         job_desc_list = all_p[start_index:end_index]

#         print( job_desc_list )