## Profile Score Calculation

In [0]:
import re
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Obtain the Spark session used by every cell below (reuses the active one if present).
spark = SparkSession.builder.getOrCreate()

In [0]:
# Input locations of the two raw parquet datasets.
PROFILES_PATH = '/dbfs/linkedin_people_train_data'
COMPANIES_PATH = '/dbfs/linkedin_train_data'

# Load the people profiles and the company pages as Spark DataFrames.
profiles = spark.read.parquet(PROFILES_PATH)
companies = spark.read.parquet(COMPANIES_PATH)

# TODO: add scraped data

### User Profile Score

In [0]:
# Profile columns that should be filled in to understand the user well.
# NOTE: the name `important_cols` is rebound further down for the companies table.
important_cols = [
    'about',
    'city',
    'country_code',
    'current_company',
    'education',
    'experience',
    'followers',
    'id',
    'languages',
    'name',
    'position',
    'posts',
    'recommendations_count',
]

In [0]:
# Percentage (0-100, rounded to two decimals) of the important columns that are
# filled for each profile. A column counts as filled when it is not null and its
# string form is not exactly "[]" (presumably how an empty array renders).
profile_filled_flags = [
    f.when(
        f.col(name).isNotNull() & (~f.col(name).cast("string").rlike("^\\[\\]$")),
        1,
    ).otherwise(0)
    for name in important_cols
]
profile_filled_total = sum(profile_filled_flags)

profiles_df = profiles.withColumn(
    "filled_percent",
    f.round(profile_filled_total / len(important_cols) * 100, 2),
)

In [0]:
# Keywords that indicate management / board positions.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python ('exec' 'vice president' used to become 'execvice president').
important_titles = [
    'president', 'ceo', 'coo', 'cfo', 'cto', 'cmo', 'cdo', 'cso', 'cio', 'cpo', 'cro',
    'vp', 'svp', 'rvp', 'evp', 'avp',
    'chief', 'executive', 'exec', 'vice president',
    'director', 'partner', 'dean', 'senior', 'head', 'principal', 'provost',
    'treasurer', 'chair', 'chairman', 'chairwoman', 'chancellor', 'board',
]

In [0]:
# Count, per profile, how many words of the headline (`position`) and of the first
# listed job title (`experience[0].title`) are management/board keywords.
#
# The text is lower-cased and split on runs of non-alphanumeric characters, and a
# token only counts when it equals a keyword exactly. The pattern is anchored on
# purpose: an unanchored match treats any token merely *containing* a keyword as a
# hit (e.g. "doctor" contains "cto", "coordinator" contains "coo").
# Multi-word keywords cannot match a single token; their individual words must be
# listed separately to be counted.
_title_alternation = '|'.join(map(re.escape, important_titles))
regex_pattern = f"^({_title_alternation})$"


def _keyword_count(text_expr):
    """Return a Column with the number of keyword tokens in a text expression.

    Args:
        text_expr: Spark SQL expression (as a string) evaluating to a string value,
            e.g. ``'position'`` or ``'experience[0].title'``.

    Returns:
        An integer Column: the number of tokens fully matching ``regex_pattern``.
        For a null input the result is whatever ``size`` yields for a null array
        (-1 or NULL depending on Spark configuration -- TODO confirm for this cluster).
    """
    return f.size(
        f.expr(
            f"filter(split(lower({text_expr}), '[^a-z0-9]+'), "
            f"word -> word RLIKE '{regex_pattern}')"
        )
    )


# keyword count in the 'position' field
profiles_df = profiles_df.withColumn('position_count', _keyword_count('position'))

# keyword count in the first listed job title
profiles_df = profiles_df.withColumn('title_count', _keyword_count('experience[0].title'))

In [0]:
# Shift both keyword counts so that their minimum is 1: anything below 1 becomes 1
# and every other count is incremented by one.
# NOTE: the columns are overwritten in place, so re-running this cell shifts the
# counts again.
for count_col in ('position_count', 'title_count'):
    raw_count = f.col(count_col)
    profiles_df = profiles_df.withColumn(
        count_col,
        f.when(raw_count < 1, 1).otherwise(raw_count + 1),
    )

# Mean of the two shifted counts.
shifted_total = f.col('position_count') + f.col('title_count')
profiles_df = profiles_df.withColumn('avg_important_count', shifted_total / 2)

In [0]:
# Derive the score columns in sequence:
#   followers_score     = followers / avg_important_count + 1
#   log_followers_score = ln(followers_score)
#   profile_score       = log_followers_score * filled_percent / 10
followers_per_keyword = f.col('followers') / f.col('avg_important_count')

profiles_df = (
    profiles_df
    .withColumn('followers_score', followers_per_keyword + 1)
    .withColumn('log_followers_score', f.log(f.col('followers_score')))
    .withColumn(
        'profile_score',
        f.col('log_followers_score') * f.col('filled_percent') / 10,
    )
)

In [0]:
# Persist the scored profiles as parquet, overwriting any previous output at this path.
USER_SCORES_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet"
scores_writer = profiles_df.write.mode("overwrite")
scores_writer.parquet(USER_SCORES_PATH)

### Company Profile Score

In [0]:
# Render a preview of the companies table to inspect its contents.
companies.display()

In [0]:
# Print the column names and types of the companies table.
companies.printSchema()

In [0]:
# Company columns that should be filled in for a complete company page.
# NOTE: this rebinds `important_cols`, which held the profile columns until here.
important_cols = [
    'about',
    'company_size',
    'country_code',
    'employees_in_linkedin',
    'followers',
    'locations',
    'headquarters',
    'id',
    'industries',
    'name',
    'organization_type',
    'slogan',
    'sphere',
    'type',
    'website',
]

In [0]:
# Percentage (0-100, rounded to two decimals) of the important columns that are
# filled for each company. A column counts as filled when it is not null and its
# string form is not exactly "[]" (presumably how an empty array renders).
company_filled_flags = [
    f.when(
        f.col(name).isNotNull() & (~f.col(name).cast("string").rlike("^\\[\\]$")),
        1,
    ).otherwise(0)
    for name in important_cols
]
company_filled_total = sum(company_filled_flags)

companies_df = companies.withColumn(
    "filled_percent",
    f.round(company_filled_total / len(important_cols) * 100, 2),
)

In [0]:
# TODO: add a popularity measure for every city the company has offices in, to use instead of the important-words count

## Score Distribution

In [0]:
# Histogram of profile scores, drawn from a 10% sample of the profiles.
# - The sample is seeded so the figure is reproducible across runs.
# - Rows whose score is null/NaN are dropped first, because a non-finite value
#   makes the histogram range undefined.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

sample = (
    profiles_df
    .select('profile_score')
    .dropna(subset=['profile_score'])
    .sample(withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sample['profile_score'], bins=30, edgecolor='k', alpha=0.7)
ax.set_title('Histogram of Profile Scores')
ax.set_xlabel('Profile Score')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()