In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors, DenseVector
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.window import Window
import math

# Obtain the Spark session for this notebook: getOrCreate() hands back the
# existing session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [0]:
# Load the profile data set, stored as Parquet, into a DataFrame.
profiles = spark.read.format('parquet').load('/dbfs/linkedin_people_train_data')

#### Job Titles and Locations

In [0]:
# Keep only the columns needed for the job-title / location analysis.
# `company_name` is read from the nested `current_company` struct and
# `job_title` is the `title` of the element at index 0 of `experience`.
jobs = profiles.select(
    'name',
    'id',
    'city',
    'country_code',
    f.col('current_company').getField('name').alias('company_name'),
    f.col('experience').getItem(0).getField('title').alias('job_title'),
    'position',
)

In [0]:
# Preview five rows. The cap is applied with limit() because display() is not
# documented to accept a row count as a positional argument.
jobs.limit(5).display()

#### Clustering job titles into meta job titles

In [0]:
# Create a DataFrame with the specified centroids: one single-word "meta job"
# label per row, stored under the same column name as the job titles so the
# fitted pipeline below can embed both.
centroids_data = [
    ('Leadership',), ('Product',), ('Engineering',), ('DataScience',), ('Operations',),
    ('Marketing',), ('Sales',), ('Design',), ('Support',), ('Finance',),
    ('Resources',), ('Research',), ('Healthcare',), ('Education',), ('Security',),
    ('Logistics',), ('Legal',), ('Quality',), ('Management',), ('Content',)
]

centroids_df = spark.createDataFrame(centroids_data, ['processed_title'])

# Preprocess job titles: take `job_title`, fall back to `position` when it is
# null, and lower-case the result.
job_titles_df = jobs.select(
    f.lower(f.coalesce(f.col('job_title'), f.col('position'))).alias('processed_title')
)

# Rows where both `job_title` and `position` were null carry no title at all.
job_titles_df = job_titles_df.dropna()

tokenizer = Tokenizer(inputCol="processed_title", outputCol="tokened_title")
# A fixed seed keeps the learned embeddings, and therefore the title -> centroid
# assignments derived from them, repeatable between runs.
w2v = Word2Vec(
    inputCol="tokened_title",
    outputCol="vector",
    vectorSize=200,
    minCount=1,
    seed=42,
)

# Build the pipeline
pipeline = Pipeline(stages=[tokenizer, w2v])

# Train the pipeline model (on the job titles only; the centroids are not part
# of the training data)
model_vectorize = pipeline.fit(job_titles_df)

# Create embeddings for job titles and centroids
# NOTE(review): a centroid label whose token never occurs in the training titles
# presumably gets an all-zero vector, which cosine_similarity maps to None --
# confirm that every label above is covered by the vocabulary.
jobs_with_vectors = model_vectorize.transform(job_titles_df)
centroids_with_vectors = model_vectorize.transform(centroids_df)

jobs_temp = jobs_with_vectors.withColumnRenamed('vector', 'job_vector')
jobs_temp = jobs_temp.withColumnRenamed('processed_title', 'job_title')

centroids_temp = centroids_with_vectors.withColumnRenamed('processed_title', 'meta_job')
centroids_temp = centroids_temp.withColumnRenamed('vector', 'centroid_vector')

# Pair every job title with every centroid. crossJoin states the cartesian
# product explicitly instead of relying on a join with no condition.
# NOTE: both sides still carry a `tokened_title` column, so that name is
# ambiguous in `joined`.
joined = jobs_temp.crossJoin(centroids_temp)

In [0]:
# Define a function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector; any object exposing ``dot(other)``, or None.
        v2: Second vector; same requirements as ``v1``.

    Returns:
        ``v1 . v2 / (|v1| * |v2|)`` as a float, or None when either argument
        is None or either vector has zero magnitude.
    """
    if any(vec is None for vec in (v1, v2)):
        return None

    # The inner product is taken first so that incompatible inputs fail here,
    # before any magnitude is examined.
    inner = float(v1.dot(v2))
    magnitude_1 = math.sqrt(v1.dot(v1))
    magnitude_2 = math.sqrt(v2.dot(v2))

    # A zero-length vector has no direction; bail out rather than divide by zero.
    if 0 in (magnitude_1, magnitude_2):
        return None

    return inner / (magnitude_1 * magnitude_2)

# Register the function as a UDF. The declared return type is DoubleType so the
# similarity stays numeric. Declared as a string, the value would be sorted
# lexicographically by the descending ranking that picks the closest centroid,
# so e.g. "9.5E-4" would outrank "0.8" and "-0.9" would outrank "-0.1".
cosine_similarity_udf = f.udf(cosine_similarity, DoubleType())

# Add a new column to compute cosine similarity for every (job, centroid) pair;
# it is null wherever cosine_similarity returns None.
joined = joined.withColumn(
    "cosine_similarity",
    cosine_similarity_udf(f.col("job_vector"), f.col("centroid_vector"))
)

In [0]:
# Show the result: the job/centroid pairs together with the cosine_similarity
# column added in the previous cell.
joined.display()

In [0]:
# Order the centroids of each job title from most to least similar. A
# descending sort places nulls last, so a pair without a similarity only ranks
# first when the title has no scored centroid at all. `meta_job` is a secondary
# key so that equal similarities are resolved the same way on every run;
# with a single sort key row_number() may pick any of the tied rows.
window_spec = Window.partitionBy("job_title").orderBy(
    f.col("cosine_similarity").desc(),
    f.col("meta_job").asc(),
)

# Rank centroids for each job and select the closest one
ranked_df = joined.withColumn("rank", f.row_number().over(window_spec))

# Filter for the closest centroid (exactly one row per distinct job title)
closest_centroids = ranked_df.filter(f.col("rank") == 1)

# Select relevant columns
result_df = closest_centroids.select(
    f.col("job_title"),
    f.col("meta_job").alias("closest_centroid"),
    f.col("cosine_similarity")
)

In [0]:
# Display the result: one row per job title with its closest centroid and the
# corresponding cosine similarity.
result_df.display()

In [0]:
# Add a `state` column: split `city` on ", " and keep the element at index 1.
# NOTE(review): assumes `city` looks like "<city>, <state>, ..." -- confirm
# against the data.
profiles_with_state = profiles.withColumn("state", f.split("city", ", ").getItem(1))

In [0]:
# Show the results: the distinct, non-null values of `state`.
states_df = profiles_with_state.select("state").distinct().na.drop()
states_df.display()

In [0]:
# Rebuild `profiles_with_state` from `profiles`: `state` is the element at
# index 1 of `city` split on ', '.
profiles_with_state = profiles.withColumn('state', f.split('city', ', ').getItem(1))

# Second name bound to the same DataFrame; nothing is counted in this cell.
profiles_with_count = profiles_with_state

In [0]:
# Persist the job title -> closest centroid table as Parquet, replacing any
# output already present at the destination.
# NOTE(review): the destination is a hard-coded, user-specific workspace path.
result_df.write.parquet(
    "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/classified_jobs.parquet",
    mode="overwrite",
)