In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.sql.types import StringType

# pandas previews: never elide columns, and allow wide rows before wrapping.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Attach to the already-running Spark session, or create one if none exists.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Load the profiles table that carries the score column.
scored_profiles_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet"
profiles_with_scores = spark.read.parquet(scored_profiles_path)

In [0]:
# Bucket `profile_score` into five ordinal classes:
#   score < 5 -> 0, < 10 -> 1, < 15 -> 2, < 20 -> 3, everything else -> 4.
# NOTE(review): a NULL score satisfies none of the `when` branches and therefore
# falls into the `otherwise` bucket (4) — confirm that this is intended.
_score_col = f.col('profile_score')
_label_expr = (
    f.when(_score_col < 5, 0)
    .when(_score_col < 10, 1)
    .when(_score_col < 15, 2)
    .when(_score_col < 20, 3)
    .otherwise(4)
)
profiles_with_scores = profiles_with_scores.withColumn('label', _label_expr)

## Preprocess good-profile data

In [0]:





# Keep only profiles in the two highest label buckets (3 and 4), project the
# listed columns, and discard any row that has a NULL in one of them.
_is_top_label = col('label').isin([3, 4])
_kept_columns = ['id', 'city', 'education', 'name', 'position', 'about']
good_profiles_df = (
    profiles_with_scores
    .filter(_is_top_label)
    .select(_kept_columns)
    .dropna()
)



def strip_and_choose_first(str_lst):
    """Return the first item of a stringified list such as ``"[a, b, c]"``.

    Leading/trailing square brackets are stripped, the remainder is split on
    ``", "`` and the first piece is returned.

    Args:
        str_lst: String rendering of a list, or ``None``.

    Returns:
        The first item as a string. ``""`` when the input is ``None`` or an
        empty list rendering (``"[]"``).
    """
    # A Spark UDF receives None for NULL column values; calling .strip() on it
    # would raise AttributeError and fail the whole job.
    if str_lst is None:
        return ""
    return str_lst.strip("[]").split(", ")[0]


# Python body of the education UDF: condenses the stringified education
# columns into a single readable phrase.
def process_education(degree, field, title):
    """Build ``"<degree> in <field> from <title>"`` from stringified lists.

    Each argument is passed through ``strip_and_choose_first`` so that only the
    first entry of each list rendering is used.

    Args:
        degree: Stringified list of degrees.
        field: Stringified list of fields of study.
        title: Stringified list of school titles.

    Returns:
        The combined description string.
    """
    first_degree, first_field, first_school = (
        strip_and_choose_first(raw_value) for raw_value in (degree, field, title)
    )
    return f"{first_degree} in {first_field} from {first_school}"

def process_df(df, output_path="/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet"):
    """Build the ``input_prompt`` training frame and persist it as parquet.

    Steps:
      1. Keep rows whose ``education`` is non-null and non-empty.
      2. Add ``degree``, ``field`` and ``school`` columns holding the string
         rendering of ``education.degree`` / ``education.field`` /
         ``education.title``.
      3. Add ``processed_education`` via the ``process_education`` UDF.
      4. Add ``input_prompt``: ``city``, ``processed_education``, ``name`` and
         ``position`` joined with ``", "``.

    Side effects: displays the result, prints its row count, and overwrites
    the parquet dataset at ``output_path``.

    Args:
        df: Spark DataFrame with ``education``, ``city``, ``name`` and
            ``position`` columns.
        output_path: Destination of the parquet output. Defaults to the path
            this notebook has always written to.

    Returns:
        The processed Spark DataFrame.
    """
    process_education_udf = udf(process_education, StringType())

    # Use size() for the emptiness test instead of comparing the column to
    # `lit([])`: building a literal from a Python list is not supported by every
    # PySpark version, and the comparison would be against an untyped array.
    # Assumes `education` is an array column (it is indexed by field below) —
    # TODO confirm against the input schema.
    has_education = col("education").isNotNull() & (f.size(col("education")) > 0)
    with_education_df = df.filter(has_education)

    # String renderings of the per-entry degree / field / school title.
    with_parts_df = (
        with_education_df
        .withColumn('degree', col('education').getField('degree').cast('string'))
        .withColumn('field', col('education').getField('field').cast('string'))
        .withColumn('school', col('education').getField('title').cast('string'))
    )

    # Named so it does not shadow the notebook-level `good_profiles_df`.
    with_education_text_df = with_parts_df.withColumn(
        "processed_education",
        process_education_udf(col('degree'), col('field'), col('school')),
    )

    processed_df = with_education_text_df.withColumn(
        "input_prompt",
        concat_ws(
            ", ",
            col("city"),
            col("processed_education"),
            col("name"),
            col("position"),
        ),
    )

    processed_df.display(limit=10)
    print(processed_df.count())
    processed_df.write.mode("overwrite").parquet(output_path)
    return processed_df

In [0]:
# Preview the scored profiles as stored on disk. The result is deliberately NOT
# assigned back to `profiles_with_scores`: rebinding that name here would discard
# the `label` column an earlier cell added to the in-memory DataFrame.
spark.read.parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet").display()

In [0]:
import pyspark.sql.functions as f

# Profiles dataset used as the source of job information.
profiles = spark.read.parquet('/dbfs/linkedin_people_train_data')

# One row per profile: identity/location columns, the current company's name,
# the title of the first `experience` entry, and the `position` column.
jobs = profiles.select(
    'name',
    'id',
    'city',
    'country_code',
    f.col('current_company').getField('name').alias('company_name'),
    f.col('experience')[0].getField('title').alias('job_title'),
    'position',
)

In [0]:
from pyspark.sql.functions import col
def process_education(degree, field, title):
    """Combine the first degree, field and school title into one phrase.

    NOTE(review): this re-declares `process_education` with the same logic as
    the definition in an earlier cell of this notebook; consider keeping only
    one copy.

    Args:
        degree: Stringified list of degrees.
        field: Stringified list of fields of study.
        title: Stringified list of school titles.

    Returns:
        ``"<degree> in <field> from <title>"`` built from the first entry of
        each argument (as extracted by ``strip_and_choose_first``).
    """
    parts = [strip_and_choose_first(value) for value in (degree, field, title)]
    return f"{parts[0]} in {parts[1]} from {parts[2]}"


profiles_with_scores = spark.read.parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet")

# The frame was just re-read from disk, so the `label` column that an earlier cell
# adds in memory may be missing, yet the `good_profiles_df` filter below needs it.
# Re-derive it when absent, using the same cut-offs as that earlier labelling cell.
if 'label' not in profiles_with_scores.columns:
    _score_col = f.col('profile_score')
    profiles_with_scores = profiles_with_scores.withColumn(
        'label',
        f.when(_score_col < 5, 0)
        .when(_score_col < 10, 1)
        .when(_score_col < 15, 2)
        .when(_score_col < 20, 3)
        .otherwise(4),
    )

# Lower-cased title per profile: `job_title` if present, else `position`, else ''.
# The alias must wrap the WHOLE expression; previously it was attached to the
# inner fallback only, so the selected column was never named `processed_title`.
job_titles_df = jobs.select(
    f.lower(f.coalesce(f.col('job_title'), f.col('position'), f.lit(''))).alias('processed_title'),
    'id',
)

profiles_with_title = profiles_with_scores.join(job_titles_df, on='id')

# Emptiness is tested with size() rather than a comparison against `lit([])`,
# which not every PySpark version can build. Assumes `education` is an array
# column (it is indexed by field below) — TODO confirm against the schema.
education_is_empty = col("education").isNull() | (f.size(col("education")) == 0)
education_is_present = col("education").isNotNull() & (f.size(col("education")) > 0)

# The UDF was previously only defined inside `process_df`, so it has to be
# created here for this cell to run on its own.
process_education_udf = udf(process_education, StringType())

# Profiles WITH education: string renderings of degree / field / school title.
filtered_df = (
    profiles_with_title
    .filter(education_is_present)
    .withColumn('degree', col('education').getField('degree').cast('string'))
    .withColumn('field', col('education').getField('field').cast('string'))
    .withColumn('school', col('education').getField('title').cast('string'))
)
edu_filtered_df = filtered_df.withColumn(
    "processed_education",
    process_education_udf(col('degree'), col('field'), col('school')),
)

# Profiles WITHOUT education come from the same joined frame (not from
# `good_profiles_df`, which has a different schema and has already dropped NULLs).
# They get the same helper columns as NULL strings so both sides of the union
# have identical schemas, plus an empty education text.
no_edu_df = (
    profiles_with_title
    .filter(education_is_empty)
    .withColumn('degree', f.lit(None).cast('string'))
    .withColumn('field', f.lit(None).cast('string'))
    .withColumn('school', f.lit(None).cast('string'))
    .withColumn("processed_education", f.lit(''))
)

# Match columns by name rather than by position.
df = edu_filtered_df.unionByName(no_edu_df)

# Top-label profiles (buckets 3 and 4) with no NULL in any selected column.
good_profiles_df = df.filter(col('label').isin([3,4])).select(['id', 'city', 'education', 'name', 'position', 'about', 'recommendations']).dropna()
df.display()
good_profiles_df.display()



# Title embedding stages: tokenize `processed_title`, then Word2Vec (200 dims).
tokenizer_title = Tokenizer(inputCol="processed_title", outputCol="tokened_title")
w2v_title = Word2Vec(inputCol="tokened_title", outputCol="vector_title", vectorSize=200, minCount=1)

# Education embedding stages: tokenize `processed_education`, then Word2Vec (200 dims).
tokenizer_edu = Tokenizer(inputCol="processed_education", outputCol="tokened_edu")
w2v_edu = Word2Vec(inputCol="tokened_edu", outputCol="vector_edu", vectorSize=200, minCount=1)

# Build the pipelines.
# The title and education stages are kept in separate pipelines: `job_titles_df`
# only has `processed_title` and `id`, so a single four-stage pipeline fails as
# soon as it reaches the education tokenizer when fit on / applied to that frame.
pipeline = Pipeline(stages=[tokenizer_title, w2v_title])
pipeline_edu = Pipeline(stages=[tokenizer_edu, w2v_edu])

# Train the pipeline models.
model_vectorize = pipeline.fit(job_titles_df)
# `df` is the frame built earlier in this cell that carries `processed_education`.
model_vectorize_edu = pipeline_edu.fit(df)

# Create embeddings for job titles and centroids.
jobs_with_vectors = model_vectorize.transform(job_titles_df)
# NOTE(review): `centroids_df` is not defined in the visible part of this notebook;
# it must already exist and contain a `processed_title` column — confirm.
centroids_with_vectors = model_vectorize.transform(centroids_df)

# The Word2Vec output column is `vector_title`. Renaming a non-existent `vector`
# column is a silent no-op and would leave `job_vector` / `centroid_vector` undefined.
jobs_temp = jobs_with_vectors.withColumnRenamed('vector_title', 'job_vector')
jobs_temp = jobs_temp.withColumnRenamed('processed_title', 'job_title')

centroids_temp = centroids_with_vectors.withColumnRenamed('processed_title', 'meta_job')
centroids_temp = centroids_temp.withColumnRenamed('vector_title', 'centroid_vector')

# Every job paired with every centroid; explicit crossJoin states that intent.
# Columns that exist on both sides (e.g. `tokened_title`) appear twice in the result.
joined = jobs_temp.crossJoin(centroids_temp)