In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re
import shutil
import os
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.functions import vector_to_array
import matplotlib.pyplot as plt
from pyspark.ml.classification import MultilayerPerceptronClassificationModel


# pandas display: no cap on the number of columns shown, and a 1000-character
# wide console so wide frames are not wrapped.
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Reuse the active Spark session if one exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Profile records (used further down for the experience-gap analysis).
profiles = spark.read.format('parquet').load('/dbfs/linkedin_people_train_data')

# Separate frame that already carries the processed vector fed to the model.
processed_data = spark.read.format('parquet').load(
    "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
)

In [0]:
# Bucket profile_score into five ordinal classes: [.., 5) -> 0, [5, 10) -> 1,
# [10, 15) -> 2, [15, 20) -> 3 and everything else -> 4.
# A null score satisfies none of the comparisons, so it also ends up in class 4.
profile_score_col = f.col('profile_score')
label_expr = (
    f.when(profile_score_col < 5, 0)
    .when(profile_score_col < 10, 1)
    .when(profile_score_col < 15, 2)
    .when(profile_score_col < 20, 3)
    .otherwise(4)
)
processed_data = processed_data.withColumn('label', label_expr)

In [0]:
# Both evaluators read the same label/prediction columns; only the metric differs.
evaluator_columns = {"labelCol": "label", "predictionCol": "prediction"}

evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", **evaluator_columns)
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **evaluator_columns)

In [0]:
# 70% / 30% train/test split drawn with a fixed sampler seed.
split_weights = [0.7, 0.3]
train_df, test_df = processed_data.randomSplit(split_weights, seed=42)

In [0]:
# A row is kept only if it has no null in any column and its feature vector
# has exactly 133 entries. The same predicate is applied to both splits.
has_expected_feature_count = f.size(vector_to_array(f.col('features'))) == 133

# Validate the training data
train_df = train_df.na.drop().filter(has_expected_feature_count)

# Validate the test data
test_df = test_df.na.drop().filter(has_expected_feature_count)

In [0]:
# Location of the previously saved multilayer-perceptron model.
model_path = "dbfs:/Workspace/Users/lihi.kaspi@campus.technion.ac.il/mlp_model"

# Restore the fitted model from storage (no training happens in this notebook).
mlp_model = MultilayerPerceptronClassificationModel.load(path=model_path)

In [0]:
# Score the validated test split with the loaded model.
mlp_predictions = mlp_model.transform(test_df)

# Overall accuracy of the predictions against the 'label' column.
accuracy = evaluator_accuracy.evaluate(mlp_predictions)
print(f"Accuracy: {accuracy}")

# F1 as reported by the evaluator's "f1" metric.
f1_score = evaluator_f1.evaluate(mlp_predictions)
print(f"F1-Score: {f1_score}")

In [0]:
import matplotlib.pyplot as plt
import numpy as np

# Bring the predicted and true class ids to the driver as a pandas frame.
sample = mlp_predictions.select('prediction', 'label').toPandas()

# Human-readable name for every numeric class id, in class order.
category_mapping = {0: 'bad', 1: 'below average', 2: 'average', 3: 'above average', 4: 'good'}
categories = list(category_mapping.values())

# Translate both columns from class ids to category names.
sample['predicted_category'] = sample['prediction'].map(category_mapping)
sample['actual_category'] = sample['label'].map(category_mapping)

# Frequency of each category, ordered like the mapping; categories that never
# occur are reported as 0 instead of being dropped.
actual_counts = sample['actual_category'].value_counts().reindex(categories, fill_value=0)
pred_counts = sample['predicted_category'].value_counts().reindex(categories, fill_value=0)

# Grouped bar chart: one pair of bars (real, predicted) per category.
x = np.arange(len(categories))  # centre position of each pair
bar_width = 0.4

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x - bar_width/2, actual_counts.values, width=bar_width, label='real', color='#a2d5f2', edgecolor='black')
ax.bar(x + bar_width/2, pred_counts.values, width=bar_width, label='Predicted', color='#f2aac7', edgecolor='black')

# Axis labels, title, legend and a dashed horizontal grid.
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=15)
ax.set_xlabel('Profile Score')
ax.set_ylabel('Frequency')
ax.set_title('Comparison of real vs. Predicted Profile Scores')
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [0]:
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Convert predictions and labels to pandas DataFrame
sample = mlp_predictions.select('prediction', 'label').toPandas()

# Define category mapping (class id -> display name), in class order
category_mapping = {0: 'bad', 1: 'below average', 2: 'average', 3: 'above average', 4: 'good'}
class_ids = list(category_mapping)
class_names = [category_mapping[class_id] for class_id in class_ids]

# Spark ML normally writes 'prediction' as a double while 'label' is integral.
# Casting both to int gives the report and the confusion matrix one key type,
# so no '0.0'-style class keys can appear.
y_true = sample['label'].astype(int)
y_pred = sample['prediction'].astype(int)

# Compute classification report.
# Passing `labels` pins the full class set even when a class is absent from the
# test split or never predicted; `target_names` makes the report rows carry the
# category names directly; `zero_division=0` reports 0 (without warnings) for
# classes that have no predictions / no samples.
category_report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    output_dict=True,
    zero_division=0,
)

# Keep only the per-class rows. They are selected by name instead of by
# position, so the result does not depend on how many summary rows
# (accuracy / micro avg / macro avg / weighted avg) the report appends.
category_df = pd.DataFrame(category_report).T.loc[class_names]

# Compute per-category accuracy (TP / Total Samples in Class).
# The confusion matrix is re-indexed to the full class set on both axes so it
# is always square and its diagonal lines up with `class_ids`; without this a
# class that is never predicted shifts or shortens the diagonal.
confusion = pd.crosstab(y_true, y_pred).reindex(
    index=class_ids, columns=class_ids, fill_value=0
)
class_totals = confusion.sum(axis=1)
# Classes with no samples yield NaN rather than a division by zero.
per_class_accuracy = np.diag(confusion.to_numpy()) / class_totals.where(class_totals > 0)
category_df["accuracy"] = per_class_accuracy.to_numpy()

# Display results
print(category_df[['precision', 'recall', 'f1-score', 'accuracy', 'support']])


In [0]:
# Preview five rows of the profile fields of interest.
preview_columns = ['name', 'about', 'current_company', 'experience', 'education', 'followers', 'position']
display(profiles.select(*preview_columns).limit(5))

In [0]:
from pyspark.sql.functions import col, explode, lit, sequence, to_date, date_format, array_except, collect_list
from pyspark.sql.types import ArrayType, StringType

# Goal of this cell: for every distinct `experience` array, list the calendar
# months between its earliest start and latest end that no entry covers.

# Step 1: Explode the experience array (one row per experience entry)
df_exploded = profiles.withColumn("experience_entry", explode(col("experience")))

# Step 2: Extract and parse start_date and end_date
# Anything that is not an explicit end date ("Present", or a null end_date)
# is replaced by this fixed cut-off date. It is built as a real date: pushing
# the ISO string through the "MMM yyyy" parser cannot parse it and would turn
# every on-going position into a null end date.
present_cutoff = to_date(lit("2023-09-01"))

df_parsed = df_exploded.select(
    col("experience_entry.start_date").alias("start_date"),
    col("experience_entry.end_date").alias("end_date"),
    col("experience").alias("original_experience")
).withColumn(
    "start_date", to_date(col("start_date"), "MMM yyyy")
).withColumn(
    "end_date",
    # `f.when` is used because the bare name `when` is only imported by a
    # later cell, which breaks a top-to-bottom run on a fresh kernel.
    f.when(col("end_date") != "Present", to_date(col("end_date"), "MMM yyyy"))
    .otherwise(present_cutoff)
).filter(
    # Entries with an unparseable date or an inverted range are skipped:
    # `sequence` raises on start > end and yields null on null boundaries.
    col("start_date").isNotNull()
    & col("end_date").isNotNull()
    & (col("start_date") <= col("end_date"))
)

# Step 3: Compute the global earliest and latest dates per experience array
global_dates = df_parsed.groupBy("original_experience").agg(
    f.min("start_date").alias("global_start"),
    f.max("end_date").alias("global_end")
)

# Step 4: Generate a full month-by-month timeline per experience array.
# `full_months` stays an ARRAY of 'yyyy-MM' strings (one row per experience
# array) so the set difference in step 6 is taken over the whole timeline.
one_month = f.expr("INTERVAL 1 MONTH")
global_dates = global_dates.withColumn(
    "full_timeline", sequence(col("global_start"), col("global_end"), one_month)
).withColumn(
    "full_months", f.expr("transform(full_timeline, d -> date_format(d, 'yyyy-MM'))")
)

# Step 5: Generate covered months for each experience entry
covered_months = df_parsed.withColumn(
    "covered_range", sequence(col("start_date"), col("end_date"), one_month)
).withColumn(
    "covered_month", explode(col("covered_range"))
).withColumn(
    "covered_months", date_format(col("covered_month"), "yyyy-MM")
).select(
    col("original_experience"),
    col("covered_months")
)

# Step 6: Identify missing months per experience array
# Aggregate the covered months per experience array
covered_months_agg = covered_months.groupBy("original_experience").agg(
    collect_list("covered_months").alias("all_covered_months")
)

# Join with the full timeline and identify gaps: months of the timeline that
# do not appear among the covered months (timeline order is preserved).
result = global_dates.join(covered_months_agg, "original_experience").withColumn(
    "missing_months",
    array_except(col("full_months"), col("all_covered_months"))
)

# Show missing months per experience array
result.select("original_experience", "missing_months").display()


In [0]:
from pyspark.sql.functions import size, when

# Boolean flag: True when the row's missing_months array has at least one
# element, False in every other case.
has_gap = size(col("missing_months")) > 0
result = result.withColumn("has_missing_months", when(has_gap, True).otherwise(False))

# Show the flag next to the experience it belongs to
flag_view = result.select("original_experience", "has_missing_months")
flag_view.display()


In [0]:
from pyspark.sql.functions import col, expr, lag, lead
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

# Helper that collapses individual missing months into contiguous ranges
def convert_to_ranges(df, column="missing_months"):
    """Turn an array column of month strings into contiguous date ranges.

    Args:
        df: DataFrame with an ``original_experience`` column and the array
            column named by ``column``.
        column: Name of the array column to expand. Its elements are parsed
            as ``<element>-01`` with the ``yyyy-MM-dd`` pattern, so they are
            expected to look like ``yyyy-MM``.

    Returns:
        DataFrame with columns ``original_experience``, ``range_start`` and
        ``range_end``: one row per run of consecutive months.
    """
    # Months are ordered chronologically within each experience.
    per_experience = Window.partitionBy("original_experience").orderBy("missing_month")

    months = (
        df.select(
            col("original_experience"),
            explode(col(column)).alias("missing_month"),
        )
        # Anchor every month on its first day so it becomes a real date.
        .withColumn("missing_month", to_date(expr("concat(missing_month, '-01')"), "yyyy-MM-dd"))
        .withColumn("prev_month", lag("missing_month").over(per_experience))
        # True when this month does not directly follow the previous one;
        # null for the first month of a partition (no previous month).
        .withColumn("gap", expr("missing_month - INTERVAL 1 MONTH != prev_month"))
        # Running count of gaps: rows sharing a value form one contiguous run.
        .withColumn(
            "range_group",
            expr("SUM(CASE WHEN gap THEN 1 ELSE 0 END) OVER (PARTITION BY original_experience ORDER BY missing_month)"),
        )
    )

    # First and last month of every run.
    return (
        months.groupBy("original_experience", "range_group")
        .agg(
            expr("MIN(missing_month)").alias("range_start"),
            expr("MAX(missing_month)").alias("range_end"),
        )
        .select("original_experience", "range_start", "range_end")
    )

# Collapse the per-row missing months of `result` into contiguous ranges
missing_month_ranges = convert_to_ranges(df=result, column="missing_months")

# Render the ranges in the notebook
missing_month_ranges.display()

In [0]:
from pyspark.sql.functions import collect_list, struct

# One (range_start, range_end) struct per range, gathered into a single
# array per experience.
range_pair = struct("range_start", "range_end")
combined_ranges = (
    missing_month_ranges
    .groupBy("original_experience")
    .agg(collect_list(range_pair).alias("missing_ranges"))
)

# Render the result in the notebook
combined_ranges.display()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr, lag, struct, collect_list, to_date, when
from pyspark.sql.window import Window

# Step 1: Explode missing_months into individual rows.
# The source is `result`, the frame that actually carries `missing_months` and
# `original_experience`; `original_experience` is an alias introduced when
# `result` was derived, so it is not a column of the raw `profiles` frame.
# explode_outer keeps experiences whose array is empty (as a single null
# month) so they are still present in the final output.
exploded = result.select(
    col("original_experience"),
    f.explode_outer(col("missing_months")).alias("missing_month")
).withColumn(
    # A null month stays null: concat() with a null argument returns null.
    "missing_month", to_date(expr("concat(missing_month, '-01')"), "yyyy-MM-dd")
)

# Step 2: Identify gaps between consecutive months
window = Window.partitionBy("original_experience").orderBy("missing_month")
exploded = exploded.withColumn("prev_month", lag("missing_month").over(window))
# True when a month does not directly follow the previous one; null when
# either side is null (first row of a partition, or a null month).
exploded = exploded.withColumn("gap", expr("missing_month - INTERVAL 1 MONTH != prev_month"))

# Step 3: Assign groups for consecutive months (running count of gaps)
exploded = exploded.withColumn(
    "range_group",
    expr("SUM(CASE WHEN gap THEN 1 ELSE 0 END) OVER (PARTITION BY original_experience ORDER BY missing_month)")
)

# Step 4: Aggregate ranges (MIN/MAX skip null months)
ranges = exploded.groupBy("original_experience", "range_group").agg(
    expr("MIN(missing_month)").alias("range_start"),
    expr("MAX(missing_month)").alias("range_end")
)

# Step 5: Exclude null ranges and group them into an array
is_valid_range = col("range_start").isNotNull() & col("range_end").isNotNull()
cleaned_ranges = ranges.filter(is_valid_range)

# The aggregation runs over `ranges` (not the filtered frame) so experiences
# without any valid range keep a row. Their struct is null, and collect_list
# skips nulls, which leaves them with an empty array.
combined_ranges = ranges.groupBy("original_experience").agg(
    collect_list(
        when(is_valid_range, struct("range_start", "range_end"))
    ).alias("missing_ranges")
)

# Step 6: Rows with no valid ranges already carry an empty array from step 5,
# so no null replacement is needed here.
final_result = combined_ranges

# Show the final result
final_result.display()