# IS434: Social Analytics & Apps
### Technical Skills and Competencies Scraper
---

In [1]:
import pandas as pd
import os
from docx2python import docx2python
import json

### Helper function to clean dataframe

In [2]:
def clean_df( df ):
    """Flatten and tidy the raw table DataFrame in place, then return it.

    Every cell is expected to hold a list of strings, or None where the
    source table had no cell. After cleaning:

    * column 0 holds the first string of each cell;
    * rows 0 to 5 (columns 1 onwards) hold a single tab-free string;
    * rows 6 and 7 (knowledge and abilities) hold lists of non-empty
      strings with the leading "--<tab>" bullet marker removed;
    * row 3 (skill level) is shortened, e.g. "Level 1" becomes "L1".

    Missing or empty cells become "" (or [] for rows 6 and 7) instead of
    raising, so partially filled tables can still be processed.
    """
    def first_text( cell ):
        # None (missing cell) and [] (cell without paragraphs) both mean "no text"
        if cell is None or len(cell) == 0:
            return ""
        return cell[0]

    def strip_bullets( cell ):
        # A missing cell has no bullet points at all
        if cell is None:
            return []
        without_marker = ( elem.replace('--\t', '') for elem in cell )
        return [ elem for elem in without_marker if elem != "" ]

    def short_level( level ):
        # Blank level cells stay blank rather than failing on level[-1]
        level = level.strip()
        return "L" + level[-1] if level else ""

    # Remove list from column 0
    df.iloc[:, 0] = df.iloc[:, 0].apply( first_text )

    # Remove list for rows 0 to 5, clean the text
    for i in range(6):
        df.iloc[i, 1:] = df.iloc[i, 1:].apply(
            lambda cell : str( first_text(cell) ).replace('\t', '')
        )

    # Clean knowledge and abilities
    for i in range(6, 8):
        df.iloc[i, 1:] = df.iloc[i, 1:].apply( strip_bullets )

    # Clean skill level from e.g. Level 1 to L1
    df.iloc[3, 1:] = df.iloc[3, 1:].apply( short_level )

    return df

In [3]:
def clean_unicode( string ):
    """Return `string` with typographic characters swapped for plain ones.

    Curly single and double quotes become straight quotes, the en dash
    becomes a hyphen, and non-breaking spaces are removed entirely.
    """
    # (character to look for, replacement) pairs, applied in this order
    substitutions = (
        ("\u2018", "\'"),
        ("\u2019", "\'"),
        ("\u201c", "\""),
        ("\u201d", "\""),
        ("\u00a0", ""),
        ("\u2013", "-"),
    )
    for fancy, plain in substitutions:
        if fancy in string:
            string = string.replace(fancy, plain)
    return string

### Scrape data from the .docx files

In [4]:
# Accumulates one dict per scraped .docx file; filled by the scraping loop
# and later dumped to JSON.
scraped_data = [] 

In [5]:
# Loop through all sectors: every "<name>_Sector" folder in the working
# directory, each skill subfolder inside it, and each .docx file in that
# subfolder. One skill record is appended to scraped_data per .docx file.
#
# Only directories are descended into, because os.listdir() fails on stray
# files such as .DS_Store. Listings are sorted because os.listdir() returns
# entries in arbitrary order, which would make the generated ids differ
# between runs.
sector_dir_list = sorted(
    entry for entry in os.listdir("./")
    if "_Sector" in entry and os.path.isdir(os.path.join(".", entry))
)

skill_id = 0
for sector_dir in sector_dir_list:
    # retrieve each sector's subfolder
    sector_path = os.path.join(".", sector_dir)
    skill_dir_list = sorted(
        entry for entry in os.listdir(sector_path)
        if os.path.isdir(os.path.join(sector_path, entry))
    )
    for skill_dir in skill_dir_list:
        # retrieve each .docx in a subfolder; "~$..." files are the lock
        # files Word leaves next to an open document, not real documents
        skill_path = os.path.join(sector_path, skill_dir)
        docx_files_list = sorted(
            entry for entry in os.listdir(skill_path)
            if entry.endswith(".docx") and not entry.startswith("~$")
        )

        # scrape data from each .docx file
        for doc_file in docx_files_list:
            # Load the first table of the document body into a DataFrame
            doc = docx2python(os.path.join(skill_path, doc_file))
            table = doc.body[0]
            df = clean_df( pd.DataFrame(table) )

            skill_id += 1

            skill_dict = {
                "id": skill_id,
                "category": clean_unicode( df.at[0, 1] ),
                "name": clean_unicode( df.at[1, 1] ),
                "description": clean_unicode( df.at[2, 1] ),
                "proficiencies": []
            }

            proficiency_id = 0

            # Columns 1 to 6 hold the proficiency levels; stop early if the
            # table has fewer columns instead of raising a KeyError
            for i in range(1, min(7, df.shape[1])):
                # Ensure the skill code e.g. ICT-XX-XXXX-X.X is not empty
                if df.at[4, i] == "":
                    continue

                # Store each existing skill into skill dict
                knowledge_list = [ clean_unicode(knowledge) for knowledge in df.at[6, i] ]
                abilities_list = [ clean_unicode(ability) for ability in df.at[7, i] ]

                # Unique ID for this proficiency (unique within its skill)
                proficiency_id += 1
                skill_dict["proficiencies"].append(
                    {
                        "id": proficiency_id,
                        "skill_id": skill_id,
                        "level": df.at[3, i],
                        "description": clean_unicode( df.at[5, i] ),
                        "knowledge": knowledge_list,
                        "abilities": abilities_list
                    }
                )

            scraped_data.append( skill_dict )

### Write output to JSON

In [6]:
# with open('skills_info.json', 'w') as outfile:
#     outfile.write( json.dumps( scraped_data, indent=4) )

In [None]:
# Serialise every scraped skill record to disk as indented JSON,
# streaming straight into the file handle.
with open('skills_info_with_id.json', 'w') as json_file:
    json.dump( scraped_data, json_file, indent=4 )