In [0]:
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
import sparknlp
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, StopWordsCleaner, WordEmbeddingsModel, SentenceEmbeddings, BertEmbeddings, Word2VecModel
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as f

In [0]:
# Read the scored user-profile dataset from its parquet location.
PROFILES_PARQUET_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet"
profiles_with_scores = spark.read.parquet(PROFILES_PARQUET_PATH)

In [0]:
# Replace missing free-text values with the "No info" placeholder so both text
# columns are always non-null. `position` is cast to string first, so a value
# that is null after the cast is also replaced by the placeholder.
no_info = f.lit("No info")
profiles_with_scores = (
    profiles_with_scores
    .withColumn("about", f.coalesce(f.col("about"), no_info))
    .withColumn("position", f.coalesce(f.col("position").cast("string"), no_info))
)

In [0]:
import matplotlib.pyplot as plt

# Bring the score column to the driver as a pandas DataFrame for plotting.
# NOTE: despite the variable name, no sampling is applied -- every row is collected.
sample = profiles_with_scores.select('profile_score').toPandas()

# Draw the score distribution using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(sample['profile_score'], bins=50, edgecolor='k', alpha=0.7, color='goldenrod')
ax.set_title('Histogram of Profile Scores Before Optimization')
ax.set_xlabel('Profile Score')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [0]:
# Preview at most 100 rows of the cleaned profiles.
profiles_preview = profiles_with_scores.limit(100)
display(profiles_preview)

In [0]:
# 1. Embed the free-text `about` column with Spark NLP.
# The stages run in list order; each one reads the column(s) written by the
# stages before it.
# NOTE: `Tokenizer` here is the Spark NLP annotator -- the `sparknlp.annotator`
# import comes after (and therefore shadows) the `pyspark.ml.feature` one.
about_stages = [
    # Raw text -> document annotation.
    DocumentAssembler()
        .setInputCol("about")
        .setOutputCol("about_document"),
    # Document -> tokens.
    Tokenizer()
        .setInputCols(["about_document"])
        .setOutputCol("about_token"),
    # Tokens -> tokens with stop words removed.
    StopWordsCleaner()
        .setInputCols(["about_token"])
        .setOutputCol("about_clean_tokens"),
    # Pretrained BERT model applied to the cleaned tokens.
    BertEmbeddings.pretrained("small_bert_L2_128")
        .setInputCols(["about_document", "about_clean_tokens"])
        .setOutputCol("about_embeddings_bert"),
    # Token-level embeddings -> sentence-level embeddings.
    SentenceEmbeddings()
        .setInputCols(["about_document", "about_embeddings_bert"])
        .setOutputCol("about_embeddings"),
]

nlp_pipeline_about = Pipeline(stages=about_stages)

# Fit the pipeline, then annotate the profiles with the `about_*` columns.
nlp_model_about = nlp_pipeline_about.fit(profiles_with_scores)
processed_data1 = nlp_model_about.transform(profiles_with_scores)
display(processed_data1.limit(100))

In [0]:
# Fail fast if the input column is missing. (A bare `.columns` expression in the
# middle of a cell neither displays nor verifies anything, so make the check explicit.)
assert "position" in profiles_with_scores.columns, \
    "expected a 'position' column in profiles_with_scores"

# 1. Preprocess `position` using Spark NLP:
#    document -> tokens -> stop-word filtering -> BERT token embeddings -> sentence embeddings.
# NOTE: `Tokenizer` is the Spark NLP annotator here; the `sparknlp.annotator`
# import shadows the earlier `pyspark.ml.feature` import of the same name.
document_assembler = (
    DocumentAssembler()
    .setInputCol("position")
    .setOutputCol("position_document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["position_document"])
    .setOutputCol("position_token")
)

stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["position_token"])
    .setOutputCol("position_clean_tokens")
)

embeddings = (
    BertEmbeddings.pretrained("small_bert_L2_128")
    .setInputCols(["position_document", "position_clean_tokens"])
    .setOutputCol("position_embeddings_bert")
)

sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["position_document", "position_embeddings_bert"])
    .setOutputCol("position_embeddings")
)

# Define the NLP pipeline
nlp_pipeline_position = Pipeline(
    stages=[document_assembler, tokenizer, stopwords_cleaner, embeddings, sentence_embeddings]
)

# Apply NLP Pipeline to the original data
nlp_model_position = nlp_pipeline_position.fit(profiles_with_scores)
processed_data2 = nlp_model_position.transform(profiles_with_scores)

# Verify that the pipeline produced the columns inspected below and used downstream.
expected_position_cols = {"position_document", "position_embeddings"}
missing_position_cols = expected_position_cols - set(processed_data2.columns)
assert not missing_position_cols, \
    f"pipeline output is missing columns: {sorted(missing_position_cols)}"

# Show the first few rows to check the content
processed_data2.select('id', 'position_document').show(5)

# Display the processed result
display(processed_data2.limit(100))


In [0]:
# Keep only the join key plus the columns that the `about` NLP pipeline added.
# The raw `about` text is deliberately NOT selected here: processed_data2 is
# derived from the same profiles_with_scores frame and already carries it, so
# selecting it again would leave two columns named `about` after the join and
# make any later reference to `about` ambiguous.
processed_data1_selected = processed_data1.select(
    'id',  # join key
    'about_document',
    'about_token',
    'about_clean_tokens',
    'about_embeddings_bert',
    'about_embeddings'
)

# Inner join on `id`: every column of processed_data2 is kept and the
# `about_*` annotation columns are appended.
processed_data = processed_data2.join(
    processed_data1_selected,
    on="id",
    how="inner"
)

# Display the result
display(processed_data.limit(100))


In [0]:
# Peek at the first few join keys of each NLP-processed DataFrame.
for processed in (processed_data1, processed_data2):
    processed.select('id').show(5)

In [0]:
# 3. Numerical Features
def _array_len_or_zero(column_name):
    """Return a Column holding the length of an array column, with 0 for null arrays.

    The null test is applied to the column itself rather than to `size(...)`:
    with default Spark settings `size(NULL)` evaluates to -1 (not NULL), so
    testing `size(col).isNull()` never matches and null arrays would be
    counted as -1. Checking the source column is correct under both the
    legacy and the ANSI null-handling settings.
    """
    source = f.col(column_name)
    return f.when(source.isNull(), f.lit(0)).otherwise(f.size(source))


def _value_or_zero(column_name):
    """Return a Column holding the value of `column_name`, with nulls replaced by 0."""
    return f.coalesce(f.col(column_name), f.lit(0))


processed_data = (
    processed_data
    .withColumn("num_education", _array_len_or_zero("education"))
    .withColumn("num_experience", _array_len_or_zero("experience"))
    .withColumn("num_languages", _array_len_or_zero("languages"))
    .withColumn("total_followers", _value_or_zero("followers"))
    .withColumn("num_recommendations", _value_or_zero("recommendations_count"))
)

display(processed_data.limit(100))

In [0]:
def to_dense_vector(embeddings_array, dim=128):
    """Convert one embedding array into a Spark ML dense vector.

    Args:
        embeddings_array: Sequence of floats holding one embedding, or
            None / an empty sequence when no embedding is available for the row.
        dim: Length of the all-zero fallback vector returned for a missing or
            empty embedding. The default of 128 is presumed to match the
            output width of the `small_bert_L2_128` model used in this
            notebook -- confirm if the embedding model is changed.

    Returns:
        A dense vector with the embedding values, or a zero vector of length
        `dim` when the input is None or empty.
    """
    # Without this guard a None input makes the UDF fail, and an empty array
    # yields a zero-length vector, so assembled feature vectors would not all
    # have the same length.
    if embeddings_array is None or len(embeddings_array) == 0:
        return Vectors.dense([0.0] * dim)
    return Vectors.dense(embeddings_array)

# Register a UDF to convert arrays to dense vectors (uses the default `dim`).
to_dense_udf = udf(to_dense_vector, VectorUDT())

# Take the embedding array of the first annotation in each sentence-embedding
# column and convert it to a dense vector column.
about_first_embedding = f.expr("about_embeddings.embeddings[0]")
position_first_embedding = f.expr("position_embeddings.embeddings[0]")

processed_data = (
    processed_data
    .withColumn("about_embeddings_dense", to_dense_udf(about_first_embedding))
    .withColumn("position_embeddings_dense", to_dense_udf(position_first_embedding))
)

# Concatenate both text embeddings and the numeric counts into one `features` vector.
feature_columns = [
    "about_embeddings_dense",
    "position_embeddings_dense",
    "num_education",
    "num_experience",
    "num_languages",
    "total_followers",
    "num_recommendations",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the key, the assembled features and the target score.
final_data = assembler.transform(processed_data).select('id', "features", "profile_score")

display(final_data)

In [0]:
# Persist the key, feature vector and score as parquet, replacing any previous
# output at the same path.
OUTPUT_PARQUET_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
export_columns = ['id', 'features', 'profile_score']
final_data.select(*export_columns).write.mode("overwrite").parquet(OUTPUT_PARQUET_PATH)