In [0]:
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, StopWordsCleaner, WordEmbeddingsModel, SentenceEmbeddings
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, DenseVector
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as f

In [0]:
# Load the user_profiles_with_scores Parquet dataset into a Spark DataFrame.
profiles_with_scores = spark.read.parquet(
    "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet"
)

In [0]:
# Split the profiles 80% / 20% into train and test sets, using a fixed seed.
train_df, test_df = profiles_with_scores.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [0]:
from sparknlp.pretrained import ResourceDownloader

# List the publicly available pretrained models matching the given annotator filter.
# showPublicModels prints the listing itself and returns None, so wrapping the call
# in print() only appended a stray "None" line to the cell output.
# NOTE(review): the filter is matched against annotator names; "word_embedding" may
# need to be a class name such as "WordEmbeddingsModel" — confirm against the output.
ResourceDownloader.showPublicModels("word_embedding")

In [0]:
# Print the version string reported by the installed Spark NLP package.
import sparknlp
print(sparknlp.version())

In [0]:
# 1. Preprocess `about` using Spark NLP
#
# Stage flow (column names):
#   about -> document -> token -> clean_tokens -> embeddings -> about_embeddings

# Wrap the raw `about` text column as a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("about")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Drop stop words from the token annotations.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("clean_tokens")
)

# Pretrained word embeddings; no model name is passed, so the library default is used.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "clean_tokens"])
    .setOutputCol("embeddings")
)

# Combine the word embeddings into a sentence-level embedding column.
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("about_embeddings")
)

nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        stopwords_cleaner,
        embeddings,
        sentence_embeddings,
    ]
)

# Fit the pipeline on the training split, then annotate that same split.
nlp_model = nlp_pipeline.fit(train_df)
processed_data = nlp_model.transform(train_df)

In [0]:
# 2. Numerical Features
#
# Count the entries of the `education`, `experience` and `languages` collections and
# expose `followers` / `recommendations_count` under the feature names used downstream.
#
# f.size() returns -1 for a NULL collection under Spark's default settings (and NULL
# when ANSI mode is on). f.greatest(..., 0) clamps both cases to 0, because greatest()
# skips NULL arguments, so a missing list counts as "zero entries" rather than -1.
processed_data = (
    processed_data
    .withColumn("num_education", f.greatest(f.size(f.col("education")), f.lit(0)))
    .withColumn("num_experience", f.greatest(f.size(f.col("experience")), f.lit(0)))
    .withColumn("num_languages", f.greatest(f.size(f.col("languages")), f.lit(0)))
    .withColumn("total_followers", f.col("followers"))
    .withColumn("recommendations", f.col("recommendations_count"))
)

In [0]:
# 3. Categorical Features - TODO: decide whether to include them in the feature vector
# Index and encode categorical columns
# indexer = StringIndexer(inputCol="country_code", outputCol="country_code_index")
# encoder = OneHotEncoder(inputCol="country_code_index", outputCol="country_code_vec")
# cat_pipeline = Pipeline(stages=[indexer, encoder])

# Fit and transform categorical pipeline
# cat_model = cat_pipeline.fit(processed_data)
# processed_data = cat_model.transform(processed_data)

In [0]:
# 4. Assemble features
# assembler = VectorAssembler(inputCols=[
#     "about_embeddings", "num_education", "num_experience", "num_languages",
#     "total_followers", "recommendations", "country_code_vec"
# ], outputCol="features")

In [0]:
# from pyspark.sql.functions import udf
# from pyspark.ml.linalg import VectorUDT, DenseVector
# from pyspark.ml.feature import VectorAssembler
# import pyspark.sql.functions as f

# # UDF to extract embeddings and convert to DenseVector
# def extract_embeddings(embeddings):
#     return DenseVector(embeddings[0].embeddings) if embeddings else DenseVector([])

# extract_embeddings_udf = udf(extract_embeddings, VectorUDT())

# # Apply the UDF to extract embeddings
# processed_data = processed_data.withColumn("about_embeddings_vector", extract_embeddings_udf(f.col("about_embeddings")))

# # Assemble features
# assembler = VectorAssembler(
#     inputCols=[
#         "about_embeddings_vector", "num_education", "num_experience", "num_languages",
#         "total_followers", "recommendations"
#     ],
#     outputCol="features"
# )

# final_data = assembler.transform(processed_data)

# # Select relevant columns
# final_data = final_data.select("features", "filled_percent")

# # Save processed data
# # final_data.write.parquet("processed_profile_data.parquet")
# display(final_data.limit(100))

In [0]:
def to_dense_vector(embeddings_array):
    """Convert a sequence of floats into a dense ML vector.

    Args:
        embeddings_array: Sequence of numeric embedding values, or None.

    Returns:
        The result of ``Vectors.dense(embeddings_array)``, or None when the input is
        None or empty. Returning None (a NULL cell when used as a UDF) lets a
        downstream ``handleInvalid="skip"`` stage drop the row, instead of failing on
        a malformed vector or emitting a zero-length vector that would not match the
        width of the other rows.
    """
    if embeddings_array is None or len(embeddings_array) == 0:
        return None
    return Vectors.dense(embeddings_array)

# Expose to_dense_vector as a Spark UDF whose result type is an ML vector.
to_dense_udf = udf(to_dense_vector, VectorUDT())

# Take the first sentence embedding of `about_embeddings` and store it as a vector column.
processed_data = processed_data.withColumn(
    "about_embeddings_dense",
    to_dense_udf(f.expr("about_embeddings.embeddings[0]")),
)

# Combine the text embedding with the numeric columns into a single `features` vector.
# handleInvalid="skip" filters out rows containing invalid (e.g. NULL) input values.
assembler = VectorAssembler(
    inputCols=[
        "about_embeddings_dense",
        "num_education",
        "num_experience",
        "num_languages",
        "total_followers",
        "recommendations",
    ],
    outputCol="features",
    handleInvalid="skip",
)

# Keep only the feature vector and the `filled_percent` column.
final_data = assembler.transform(processed_data).select("features", "filled_percent")

# Preview the first 100 rows.
display(final_data.limit(100))

In [0]:
# Persist the assembled features as Parquet, replacing any existing output at this path.
final_data.write.parquet(
    "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet",
    mode="overwrite",
)