### <font color="#1F618D"> New York City Job Postings - Data Engineering Challenge </font>
### <font color="#F5B041"> Part-2: Data Preprocessing & Feature Engineering </font>

In [2]:
# findspark.init() is called before the pyspark import in the next cell
# (presumably so the local Spark installation is importable — confirm).
import findspark

findspark.init()

# Load the jupyter_black extension (presumably auto-formats code cells — confirm).
import jupyter_black

jupyter_black.load()

In [3]:
from pyspark.sql import SparkSession

In [1]:
import sys

# Add the sibling utils directory (relative to the notebook's working directory) to
# the import path — presumably needed by the helper notebook run in the next cell;
# TODO confirm.
sys.path.append("../utils")

In [2]:
# Run the shared helper notebook. The helpers used below (standardize_annual_salary,
# get_degree_list_udf, build_regex, keywords, preprocess_text_column, ...) are not
# defined in this notebook, so they are expected to come from here; col / lit /
# concat_ws are not imported here either, so presumably they do too — confirm.
%run ../utils/prep_utils.ipynb

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Obtain the Spark session for this notebook under its application name.
session_builder = SparkSession.builder.appName("NYCJobsFeatureEngineering")
spark = session_builder.getOrCreate()

In [4]:
# Load the raw postings CSV: the first row is the header, column types are inferred,
# and '"' is the escape character.
raw_csv_options = {"header": True, "inferSchema": True, "escape": '"'}
df = spark.read.csv("../../dataset/raw_data/nyc-jobs.csv", **raw_csv_options)

### <font color="#1F618D"> 1. Feature Extraction - </font> <font color="#F5B041"> Annual Salary </font>

In [6]:
# Standardise salaries via the shared helper. The next cell reads AnnualSalaryFrom /
# AnnualSalaryTo, so this call is presumably what adds them — confirm in prep_utils.
df = standardize_annual_salary(df)

In [7]:
# Derive the mid-point of the annual salary band as a new feature.
salary_band_total = col("AnnualSalaryFrom") + col("AnnualSalaryTo")
df = df.withColumn("AverageAnnualSalary", salary_band_total / 2)

In [8]:
salary_preview_columns = ["AnnualSalaryFrom", "AnnualSalaryTo", "AverageAnnualSalary"]
df.select(*salary_preview_columns).show(5)

+----------------+--------------+-------------------+
|AnnualSalaryFrom|AnnualSalaryTo|AverageAnnualSalary|
+----------------+--------------+-------------------+
|         42405.0|       65485.0|            53945.0|
|         60740.0|      162014.0|           111377.0|
|        51907.68|      54580.32|            53244.0|
|        51907.68|      54580.32|            53244.0|
|         72800.0|       72800.0|            72800.0|
+----------------+--------------+-------------------+
only showing top 5 rows



### <font color="#1F618D"> 2. Feature Extraction - </font>  <font color="#F5B041"> Degree List & Highest Degree </font> 

In [9]:
# Pull every degree keyword (masters, pg, diploma, baccalaureate, high school, ...)
# out of the minimum-qualification text.
degree_pattern = lit(build_regex(keywords))
df = df.withColumn(
    "degrees",
    get_degree_list_udf(df["Minimum Qual Requirements"], degree_pattern),
)

In [10]:
# Reduce each posting's list of degrees to the single highest one.
highest_degree = get_highest_degree_udf(col("degrees"))
df = df.withColumn("HighestDegree", highest_degree)

In [11]:
degree_preview = df.select("degrees", "HighestDegree")
degree_preview.show(n=4, truncate=False)

+-------------------------------------+-------------+
|degrees                              |HighestDegree|
+-------------------------------------+-------------+
|[baccalaureate]                      |baccalaureate|
|[baccalaureate, high school, diploma]|baccalaureate|
|[high school]                        |high school  |
|[high school]                        |high school  |
+-------------------------------------+-------------+
only showing top 4 rows



### <font color="#1F618D"> 3. Feature Extraction - </font>  <font color="#F5B041"> Skills as ngrams</font> 

In [49]:
# Remove junk characters from the free-text skills column.
# Use the exact header spelling ("Preferred Skills") so the lookup does not depend on
# Spark's default case-insensitive column resolution (spark.sql.caseSensitive=false).
cleaned_df = preprocess_text_column(
    df, input_column="Preferred Skills", output_column="cleaned_text"
)

In [50]:
# Split the cleaned text into tokens and discard stop words.
filtered_df = tokenize_and_remove_stopwords(
    cleaned_df,
    input_column="cleaned_text",
    output_column="filtered_words",
)

In [51]:
# Lemmatize the remaining tokens (the helper applies a UDF).
lemmatized_df = apply_lemmatization(
    filtered_df,
    input_column="filtered_words",
    output_column="lemmatized_words",
)

In [52]:
# Build the n-gram feature from the lemmatized tokens
# (n=1 — presumably single-token "unigrams"; confirm in prep_utils).
ngram_df = extract_ngrams(
    lemmatized_df,
    input_column="lemmatized_words",
    output_column="ngrams",
    n=1,
)

### <font color="#1F618D"> 4. Feature Selection & </font>  <font color="#F5B041"> Data Serialization </font>

In [54]:
# Keep only what the KPI analysis needs.

# Source columns that the KPI analysis does not use.
unused_source_columns = [
    "Civil Service Title",
    "Title Code No",
    "Level",
    "Full-Time/Part-Time indicator",
    "Salary Range From",
    "Salary Range To",
    "Salary Frequency",
    "Work Location",
    "Division/Work Unit",
    "Job Description",
    "Minimum Qual Requirements",
    "Preferred Skills",
    "Additional Information",
    "To Apply",
    "Hours/Shift",
    "Work Location 1",
    "Recruitment Contact",
    "Residency Requirement",
    "Post Until",
    "Posting Updated",
    "Process Date",
]

# Intermediate outputs of the skills text pipeline above
# ("words" is presumably created inside the tokenizer helper — confirm).
text_pipeline_intermediates = [
    "cleaned_text",
    "words",
    "filtered_words",
    "lemmatized_words",
]

cols_to_drop = unused_source_columns + text_pipeline_intermediates

ngram_df = ngram_df.drop(*cols_to_drop)

In [55]:
import os

# Define the target directory path
target_directory = "../../dataset/processed_data/"

# Create the directory (and any missing parents). exist_ok=True makes this a no-op
# when it already exists and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(target_directory, exist_ok=True)

In [56]:
# Normalise column names: spaces and hyphens become underscores, then lower-case.
separators_to_underscore = str.maketrans(" -", "__")
columns_with_valid_names = [
    column_name.translate(separators_to_underscore).lower()
    for column_name in ngram_df.columns
]
ngram_df_renamed = ngram_df.toDF(*columns_with_valid_names)

In [17]:
# Flatten the array columns into comma-separated strings so they can be persisted.
for array_column in ("ngrams", "degrees"):
    ngram_df_renamed = ngram_df_renamed.withColumn(
        array_column, concat_ws(",", col(array_column))
    )

In [18]:
# Cast the posting date column to a date type.
# Address the column by its renamed, lower-case name: withColumn keeps the name
# exactly as passed, so "Posting_Date" would reintroduce a mixed-case column among
# the normalised ones (and would not resolve under spark.sql.caseSensitive=true).
ngram_df_renamed = ngram_df_renamed.withColumn(
    "posting_date", col("posting_date").cast("date")
)

In [19]:
# Persist the processed data as Parquet from a single partition, replacing any
# previous output at this path.
processed_parquet_path = (
    "../../dataset/processed_data/nyc_job_postings_processed_data.parquet"
)
ngram_df_renamed.coalesce(1).write.mode("overwrite").parquet(processed_parquet_path)

In [20]:
# Also persist a tab-separated text copy (with a header row) from a single
# partition, replacing any previous output at this path.
processed_csv_path = "../../dataset/processed_data/nyc_job_postings_processed_data.csv"
ngram_df_renamed.coalesce(1).write.mode("overwrite").csv(
    processed_csv_path, sep="\t", header=True
)

In [21]:
# List the output directory to check that both datasets were written.
!ls -l ../../dataset/processed_data/

total 0
drwxr-xr-x 1 root root 512 Aug 31 15:42 nyc_job_postings_processed_data.csv
drwxr-xr-x 1 root root 512 Aug 31 15:41 nyc_job_postings_processed_data.parquet
