# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Bootstrap the session objects used by the rest of the notebook.
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; the catalog reads and the S3 sink below go through it.
glueContext = GlueContext(sc)
# Spark session handle exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job object bound to this context.
job = Job(glueContext)

In [2]:
import datetime
import pyspark.sql.functions as F
import pytz
from awsglue.dynamicframe import DynamicFrame




In [5]:
# Run date: the current instant converted to US Pacific time. `today`
# (YYYYMMDD) is the suffix used for the daily table names and the S3 output path.
date_format = '%m/%d/%Y'  # not referenced by any cell shown in this notebook
pacific_tz = pytz.timezone('US/Pacific')
date = datetime.datetime.now(tz=pytz.utc).astimezone(pacific_tz)
today = date.strftime('%Y%m%d')




In [27]:
# Load today's cleaned table from the 'glassdoorcleaned' catalog database and
# convert the DynamicFrame to a Spark DataFrame.
# To reprocess a specific day, replace the table name with a fixed one such as
# 'clean_20230704'.
clean_table_name = f'clean_{today}'
df = (
    glueContext.create_dynamic_frame
    .from_catalog(database='glassdoorcleaned', table_name=clean_table_name)
    .toDF()
)




In [28]:
# Job Location
# Derive 'jobcity' and 'jobstate' from 'joblocation'. The literal 'Remote' is
# carried through to both columns; otherwise the city is the piece before the
# first comma and the state is the piece after the last comma. Both are trimmed.
is_remote = F.col('joblocation') == 'Remote'
location_parts = F.split(F.col('joblocation'), pattern=',')

jobcity_expr = F.when(is_remote, 'Remote').otherwise(location_parts[0])
jobstate_expr = F.when(is_remote, 'Remote').otherwise(
    F.element_at(location_parts, -1)
)

df = (
    df
    .withColumn('jobcity', F.trim(jobcity_expr))
    .withColumn('jobstate', F.trim(jobstate_expr))
)




In [29]:
# Company Type and Company Name
# companytype: 'Other' is kept verbatim; any other value keeps only the text
#              after the last ' - ' separator.
# companyname: keep only the first line of the scraped value.
# Both results are whitespace-trimmed.
raw_company_type = F.col('companytype')
company_type_expr = F.when(raw_company_type == 'Other', raw_company_type).otherwise(
    F.element_at(F.split(raw_company_type, pattern=' - '), -1)
)
company_name_expr = F.split(F.col('companyname'), pattern='\n')[0]

df = (
    df
    .withColumn('companytype', F.trim(company_type_expr))
    .withColumn('companyname', F.trim(company_name_expr))
)




In [30]:
# Salary
# Normalise the free-text 'jobsalary' column and derive annualised 'minsalary',
# 'maxsalary' and 'averagesalary' columns from it.
HOURS_PER_YEAR = 2080  # multiplier applied to hourly figures (40 h x 52 weeks)


def _annualised(bound):
    """Return one salary bound as an annual amount, as a string column.

    Args:
        bound: Column holding a trimmed bound such as '85K' or '25.00'.

    Returns:
        Column of strings. A value ending in 'K' is read as thousands, so
        '85K' becomes 85000 and '85.5K' becomes 85500. Any other value is
        treated as an hourly rate and multiplied by HOURS_PER_YEAR. Values
        that cannot be read as a number become null.
    """
    # Multiply instead of textually appending '000', so decimal amounts such
    # as '85.5K' are scaled correctly rather than becoming '85.5000'.
    in_thousands = F.regexp_replace(bound, r'K$', '').cast('double') * 1000
    hourly = bound.cast('double') * HOURS_PER_YEAR
    # Cast back to string: the ApplyMapping step later in the notebook declares
    # minsalary / maxsalary with a 'string' source type.
    return F.when(bound.endswith('K'), in_thousands).otherwise(hourly).cast('string')


# Resolved lazily, i.e. against 'jobsalary' as it stands after the clean-up steps.
salary_bounds = F.split(F.col('jobsalary'), pattern=' - ')

df = (
    df
    # Strip the boilerplate Glassdoor wraps around the figures. The regex
    # literals are raw strings so the backslashes reach the regex engine
    # without relying on Python's deprecated invalid-escape handling, and the
    # '.' in 'est.' is escaped so it only matches a literal dot.
    .withColumn('jobsalary',
                F.regexp_replace(F.col('jobsalary'), 'Employer Provided Salary:', ''))
    .withColumn('jobsalary',
                F.regexp_replace(F.col('jobsalary'), r' \(Glassdoor est\.\)', ''))
    # Hourly postings: keep only the part before ' Per Hour'.
    .withColumn('jobsalary',
                F.when(F.col('jobsalary').contains('Hour'),
                       F.split(F.col('jobsalary'), pattern=' Per Hour')[0])
                .otherwise(F.col('jobsalary')))
    .withColumn('jobsalary', F.regexp_replace(F.col('jobsalary'), r'\$', ''))
    # Lower bound = first piece of the range, upper bound = last piece
    # (identical when the posting lists a single figure).
    .withColumn('minsalary', F.trim(salary_bounds[0]))
    .withColumn('maxsalary', F.trim(F.element_at(salary_bounds, -1)))
    .withColumn('minsalary', _annualised(F.col('minsalary')))
    .withColumn('maxsalary', _annualised(F.col('maxsalary')))
    .withColumn('averagesalary', (F.col('minsalary') + F.col('maxsalary')) / 2)
)




In [31]:
# Company Age
# Age = current calendar year (clock of the machine running the notebook)
# minus 'companyyearfounded'; F.trim leaves the result as a string column.
current_year = datetime.datetime.now().year
company_age_expr = current_year - F.col('companyyearfounded')

df = df.withColumn('companyage', F.trim(company_age_expr))




In [32]:
# Education levels
# Classify each posting by the highest education keyword found anywhere in the
# lower-cased job description. Tiers are checked from highest to lowest and the
# first match wins; descriptions with no keyword (or no description at all)
# fall through to 'Other / Unknown'.
# NOTE(review): these are plain substring matches, so e.g. "bachelor's" does
# not hit 'bachelors' and "scrum master" hits 'master' - confirm this is intended.
phds = {'phd','doctorate', 'postdoc'}
masters  = {'msc', 'master'}
undergraduate = {'bachelors', 'undergraduate', 'associates'}

description_lower = F.lower(F.col('jobdescription'))

education_expr = (
    F.when(description_lower.rlike('|'.join(phds)), 'PhD or higher')
    .when(description_lower.rlike('|'.join(masters)), 'Masters')
    .when(description_lower.rlike('|'.join(undergraduate)), 'Undergraduate')
    .otherwise('Other / Unknown')
)

df = df.withColumn('educationlevel', education_expr)




In [33]:
# Create columns for skills demanded by job
# One 0/1 column per skill, set when the lower-cased job description mentions it.
#
# A bare substring test produces false positives ('scala' inside "scalable",
# 'java' inside "javascript", 'aws' inside "laws"), so by default a skill only
# counts when it is not embedded in a longer run of letters. Digits and
# punctuation are still allowed next to it ("python3", "aws-based").
masterlist = {'python', 'sql', 'scala', 'aws', 'gcp', 'azure', 'stream', 'batch', 'java', 'spark','dbt', 'airflow'}

# Skills that deliberately use a looser pattern than the default.
skill_pattern_overrides = {
    'sql': r'sql',                                 # also counts dialect names such as "mysql" / "postgresql"
    'stream': r'(?<![a-z])stream',                 # "stream", "streams", "streaming"
    'batch': r'(?<![a-z])batch',                   # "batch", "batches", "batching"
    'spark': r'(?<![a-z])(?:py)?spark(?![a-z])',   # "spark" and "pyspark"
}

description_lower = F.lower(F.col('jobdescription'))

# Sorted only to make the order in which columns are added deterministic.
for skill in sorted(masterlist):
    pattern = skill_pattern_overrides.get(skill, rf'(?<![a-z]){skill}(?![a-z])')
    df = df.withColumn(skill, F.when(description_lower.rlike(pattern), 1).otherwise(0))




In [34]:
# Job Seniority
# Classify each posting as Senior / Mid / Entry from keywords in the lower-cased
# job title. Tiers are checked in that order and the first match wins; titles
# with no keyword fall through to 'Other / Unknown'.
#
# Keywords must stand alone rather than sit inside a longer run of letters.
# Without that, the entry keyword 'i' matches every title containing the letter
# "i" (e.g. "engineer"), and 'iv' / 'ii' / 'sr' / 'mid' match inside ordinary words.
experienced = ('lead', 'leader', 'principal', 'senior', 'sr', 'iv', 'manage[a-z]*')
mid = ('mid', 'midlevel', 'ii', 'iii')
entry = ('entry', 'associate', 'assc', 'i', 'junior', 'jr')


def _standalone_keyword_regex(keywords):
    """Build a regex matching any keyword that is not part of a longer word.

    Args:
        keywords: iterable of lower-case regex fragments.

    Returns:
        Pattern string for Column.rlike; a match needs a non-letter (or the
        string boundary) on both sides of the keyword.
    """
    return r'(?<![a-z])(?:' + '|'.join(keywords) + r')(?![a-z])'


title_lower = F.lower(F.col('jobtitle'))

experience_expr = (
    F.when(title_lower.rlike(_standalone_keyword_regex(experienced)), 'Senior')
    .when(title_lower.rlike(_standalone_keyword_regex(mid)), 'Mid')
    .when(title_lower.rlike(_standalone_keyword_regex(entry)), 'Entry')
    .otherwise('Other / Unknown')
)

df = df.withColumn('experiencelevel', experience_expr)




In [35]:
# Select cols
# Keep only the columns that feed the curated model, in their final order.
company_columns = [
    'date',
    'companyname',
    'companyrating',
    'companyrevenue',
    'companysector',
    'companyindustry',
    'companysize',
    'companytype',
    'companyyearfounded',
    'easyapply',
]
job_columns = [
    'jobcity',
    'jobstate',
    'minsalary',
    'maxsalary',
    'averagesalary',
    'companyage',
    'educationlevel',
]
skill_columns = [
    'stream',
    'sql',
    'gcp',
    'scala',
    'dbt',
    'java',
    'azure',
    'aws',
    'batch',
    'spark',
    'python',
    'airflow',
]

df_transformed = df.select(
    *company_columns,
    *job_columns,
    *skill_columns,
    'experiencelevel',
)




In [36]:
# Read dimensional tables in.
def _read_curated_table(table_name):
    """Load one table of the 'glassdoorcurated' catalog database as a Spark DataFrame."""
    dynamic_frame = glueContext.create_dynamic_frame.from_catalog(
        database='glassdoorcurated',
        table_name=table_name,
    )
    return dynamic_frame.toDF()


dimCities = _read_curated_table('curated_dimcities')
dimStates = _read_curated_table('curated_dimstates')
dimSize = _read_curated_table('curated_dimsize')
dimSector = _read_curated_table('curated_dimsector')
dimIndustry = _read_curated_table('curated_dimindustry')
dimRevenue = _read_curated_table('curated_dimrevenue')
dimType = _read_curated_table('curated_dimtype')
dimEducation = _read_curated_table('curated_dimeducation')
dimExperience = _read_curated_table('curated_dimexperience')




In [37]:
# Get list of valid values for categorical cols containing nulls
def _distinct_values(dim_df, column_name):
    """Collect the distinct values of one dimension-table column into a Python set.

    Args:
        dim_df: Spark DataFrame of a dimension table.
        column_name: name of the column to read.

    Returns:
        Set of the column's distinct values, collected to the driver.
    """
    return {row[0] for row in dim_df.select(column_name).distinct().collect()}


cities = _distinct_values(dimCities, 'cityname')
states = _distinct_values(dimStates, 'stateabbrev')
sizes = _distinct_values(dimSize, 'size')
sectors = _distinct_values(dimSector, 'sector')
industries = _distinct_values(dimIndustry, 'industry')
revenues = _distinct_values(dimRevenue, 'revenue')
types = _distinct_values(dimType, 'type')

# Apply valid categorical values check, fill nulls with Other / Unknown if necessary
# Fact column -> values allowed by its dimension table. ('companytype' was
# listed twice in the original literal; the duplicate key is removed.)
validvalueshashmap = {
    'jobcity': cities,
    'jobstate': states,
    'companytype': types,
    'companyrevenue': revenues,
    'companysector': sectors,
    'companysize': sizes,
    'companyindustry': industries
}
# Anything that is null or not present in the dimension becomes 'Other / Unknown'.
for column_name, allowed_values in validvalueshashmap.items():
    df_transformed = df_transformed.withColumn(
        column_name,
        F.when(F.col(column_name).isin(allowed_values), F.col(column_name))
        .otherwise('Other / Unknown')
    )

# Fill company name nulls
df_transformed = df_transformed.fillna('Other / Unknown', subset=['companyname'])

# NOTE: 'date' is intentionally left as a string here; an earlier, disabled
# experiment reformatted it via to_date(date, "MM-dd-yyyy") / date_format.




In [38]:
# Convert model to STAR schema
# Replace each categorical value in the fact rows by the surrogate key of the
# matching dimension row. Every dimension is broadcast and left-joined, so fact
# rows without a match are kept.
cols = [
    'date', 'companyname', 'companyrating',
    'revenuekey', 'sectorkey', 'industrykey', 'sizekey', 'typekey',
    'companyyearfounded', 'easyapply',
    'citykey', 'statekey',
    'minsalary', 'maxsalary', 'averagesalary',
    'companyage', 'educationkey',
    'stream', 'sql', 'gcp', 'scala', 'dbt', 'java',
    'azure', 'aws', 'batch', 'spark', 'python', 'airflow',
    'experiencekey',
]

# (dimension frame, fact column, dimension column), in join order.
dimension_joins = [
    (dimCities, 'jobcity', 'cityname'),
    (dimStates, 'jobstate', 'stateabbrev'),
    (dimSize, 'companysize', 'size'),
    (dimSector, 'companysector', 'sector'),
    (dimIndustry, 'companyindustry', 'industry'),
    (dimRevenue, 'companyrevenue', 'revenue'),
    (dimType, 'companytype', 'type'),
    (dimEducation, 'educationlevel', 'education'),
    (dimExperience, 'experiencelevel', 'experience'),
]

df_fact = df_transformed
for dim_df, fact_column, dim_column in dimension_joins:
    df_fact = df_fact.join(
        F.broadcast(dim_df),
        df_transformed[fact_column] == dim_df[dim_column],
        'left',
    )
df_fact = df_fact.select(cols)




In [39]:
# Convert spark dataframe to glue dynamicframe
# The fact table goes back to a DynamicFrame (named 'convert') because the
# ApplyMapping transform and the Glue sink below operate on DynamicFrames.
dyf_fact = DynamicFrame.fromDF(df_fact, glueContext, 'convert')




In [41]:
# Change types, apply mapping
# Column names are unchanged; only the types are cast. Each entry below is
# (column, source type, target type) and is expanded into the 4-tuples
# (source name, source type, target name, target type) that ApplyMapping takes.
column_types = [
    ('date', 'string', 'string'),
    ('companyname', 'string', 'string'),
    ('companyrating', 'double', 'double'),
    ('revenuekey', 'long', 'int'),
    ('sectorkey', 'long', 'int'),
    ('industrykey', 'long', 'int'),
    ('sizekey', 'long', 'int'),
    ('typekey', 'long', 'int'),
    ('companyyearfounded', 'string', 'int'),
    ('easyapply', 'string', 'string'),
    ('citykey', 'long', 'int'),
    ('statekey', 'long', 'int'),
    ('minsalary', 'string', 'double'),
    ('maxsalary', 'string', 'double'),
    ('averagesalary', 'double', 'double'),
    ('companyage', 'string', 'int'),
    ('educationkey', 'long', 'int'),
    ('stream', 'int', 'int'),
    ('sql', 'int', 'int'),
    ('gcp', 'int', 'int'),
    ('scala', 'int', 'int'),
    ('dbt', 'int', 'int'),
    ('java', 'int', 'int'),
    ('azure', 'int', 'int'),
    ('aws', 'int', 'int'),
    ('batch', 'int', 'int'),
    ('spark', 'int', 'int'),
    ('python', 'int', 'int'),
    ('airflow', 'int', 'int'),
    ('experiencekey', 'long', 'int'),
]
mapping = [
    (column, source_type, column, target_type)
    for column, source_type, target_type in column_types
]

dyf_fact = ApplyMapping.apply(
    frame=dyf_fact,
    mappings=mapping,
    transformation_ctx='dyfapplymapping',
)




In [43]:
# Add curated data to S3 curated bucket and Glue Data Catalog
# Write the fact table as snappy-compressed glueparquet under a per-day prefix
# and register it as a per-day table in the 'glassdoorcurated' database.
curated_path = f's3://kc-glassdoor-data-curated/curated/curated_{today}'
curated_table = f"curated_{today}"

s3output = glueContext.getSink(
    connection_type="s3",
    path=curated_path,
    compression="snappy",
    partitionKeys=[],
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3output",
)
s3output.setCatalogInfo(catalogDatabase="glassdoorcurated", catalogTableName=curated_table)
s3output.setFormat("glueparquet")

# coalesce(1) collapses the frame to a single partition before the write.
s3output.writeFrame(dyf_fact.coalesce(1))

<awsglue.dynamicframe.DynamicFrame object at 0x7f9d5a764490>
