# Profile Pro: A LinkedIn Profile Optimizer
## Final Project - Data Collection Lab (0940290)
### Lihi Kaspi (214676140), Harel Oved (326042389) & Lior Zaphir (326482213)

In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re

# Pandas display settings: show every column and allow wide console output.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# Reuse the active Spark session if there is one, otherwise start a new one.
_session_builder = SparkSession.builder
spark = _session_builder.getOrCreate()

## Relevant Data and Preprocessing

## i'm moving all the code cells that create a parquet file to different notebooks so we don't have to skip cells when running this notebook

In [0]:
# Original datasets: the companies table and the people-profiles table.
companies, profiles = (
    spark.read.parquet(dataset_path)
    for dataset_path in (
        '/dbfs/linkedin_train_data',
        '/dbfs/linkedin_people_train_data',
    )
)

In [0]:
# Profiles together with their "good profile" score.
# The code that produces this file can be found in "Profile Score Calculation".
_PROFILE_SCORES_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet"
profiles_with_scores = spark.read.parquet(_PROFILE_SCORES_PATH)

In [0]:
# Data frame holding the processed vector that goes into the model.
_PROCESSED_DATA_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
processed_data = spark.read.parquet(_PROCESSED_DATA_PATH)

### Data Pre-Processing

## i moved this code to the "Data_Preprocessing" notebook

### Scraped Data Preprocessing

#### Job Titles and Locations

In [0]:
# Per-profile projection: identity and location columns, the name of the current
# company, the title of the first 'experience' entry, and the 'position' column.
_company_name = f.col('current_company').getField('name').alias('company_name')
_first_experience_title = f.col('experience').getItem(0).getField('title').alias('job_title')
jobs = profiles.select(
    'name',
    'id',
    'city',
    'country_code',
    _company_name,
    _first_experience_title,
    'position',
)

In [0]:
# Preview the projected job data.
# NOTE(review): the 5 is presumably meant as a row limit -- confirm that display()
# accepts a row count; if it does not, display(jobs.limit(5)) expresses that intent.
jobs.display(5)

#### Clustering job titles into meta job titles

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, when, split
from pyspark.sql.types import StringType
from pyspark.ml.feature import Word2Vec, Tokenizer

# Meta job titles ("centroids"). Every job title is paired with each of these
# labels so that the most similar label can be picked later on.
# NOTE(review): each label is embedded as a single token, so a label such as
# 'DataScience' presumably only gets a non-zero vector if that exact token
# (after tokenization) also occurs in the job titles -- verify.
centroids_data = [
    ('Leadership',), ('Product',), ('Engineering',), ('DataScience',), ('Operations',),
    ('Marketing',), ('Sales',), ('Design',), ('Support',), ('Finance',),
    ('Resources',), ('Research',), ('Healthcare',), ('Education',), ('Security',),
    ('Logistics',), ('Legal',), ('Quality',), ('Management',), ('Content',)
]

centroids_df = spark.createDataFrame(centroids_data, ['processed_title'])

# Preprocess job titles: take the lower-cased 'job_title', falling back to the
# lower-cased 'position' when the title is null; rows where both are null are dropped.
job_titles_df = jobs.select(
    lower(f.coalesce(col('job_title'), col('position'))).alias('processed_title')
).dropna()

tokenizer = Tokenizer(inputCol="processed_title", outputCol="tokened_title")
# A fixed seed makes the learned embeddings repeatable between runs.
w2v = Word2Vec(
    inputCol="tokened_title",
    outputCol="vector",
    vectorSize=200,
    minCount=1,
    seed=42,
)

# Build the pipeline and train it on the job titles
pipeline = Pipeline(stages=[tokenizer, w2v])
model_vectorize = pipeline.fit(job_titles_df)

# Create embeddings for job titles and centroids with the same fitted model
jobs_with_vectors = model_vectorize.transform(job_titles_df)
centroids_with_vectors = model_vectorize.transform(centroids_df)

# Both transformed frames carry 'processed_title', 'tokened_title' and 'vector'.
# Every column is renamed per side so the joined frame has no duplicate names.
jobs_temp = (
    jobs_with_vectors
    .withColumnRenamed('vector', 'job_vector')
    .withColumnRenamed('processed_title', 'job_title')
    .withColumnRenamed('tokened_title', 'job_tokens')
)

centroids_temp = (
    centroids_with_vectors
    .withColumnRenamed('processed_title', 'meta_job')
    .withColumnRenamed('vector', 'centroid_vector')
    .withColumnRenamed('tokened_title', 'centroid_tokens')
)

# Pair every job title with every centroid. The cross join is spelled out
# explicitly, and the small centroid table is broadcast to the executors.
joined = jobs_temp.crossJoin(f.broadcast(centroids_temp))
display(joined.limit(10))

In [0]:
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import DenseVector
import math

# Define a function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors, or None when it is undefined.

    Args:
        v1: First vector; must provide a ``dot`` method, or be None.
        v2: Second vector; must provide a ``dot`` method, or be None.

    Returns:
        The dot product of ``v1`` and ``v2`` divided by the product of their
        magnitudes, or None if either vector is None or has zero magnitude.
    """
    if v1 is None or v2 is None:
        return None

    numerator = float(v1.dot(v2))
    # A vector's magnitude is the square root of its dot product with itself.
    magnitude_1 = math.sqrt(v1.dot(v1))
    magnitude_2 = math.sqrt(v2.dot(v2))

    # A zero-length vector has no direction, so the similarity is undefined.
    if 0 in (magnitude_1, magnitude_2):
        return None

    return numerator / (magnitude_1 * magnitude_2)

# Register the function as a UDF.
# cosine_similarity returns a float (or None), so the UDF is declared as
# DoubleType. Declared as a string, the column would be compared
# lexicographically when it is sorted, not by numeric value.
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Add a new column holding the similarity between each job vector and centroid vector
joined = joined.withColumn(
    "cosine_similarity",
    cosine_similarity_udf(col("job_vector"), col("centroid_vector"))
)

# Show the result
joined.display()


In [0]:
# Within each job title, order the candidate centroids from the highest
# cosine similarity to the lowest.
window_spec = (
    Window
    .partitionBy("job_title")
    .orderBy(f.desc("cosine_similarity"))
)

# Number the centroids of every job title according to that ordering
ranked_df = joined.withColumn("rank", f.row_number().over(window_spec))

# Keep only the top-ranked (closest) centroid per job title
closest_centroids = ranked_df.where(f.col("rank") == 1)

# Keep the title, its closest centroid and the similarity between them
result_df = closest_centroids.select(
    "job_title",
    f.col("meta_job").alias("closest_centroid"),
    "cosine_similarity",
)

# Display the result
result_df.display()

In [0]:
# Split 'city' on ", " and keep the second piece as the state.
_city_parts = split(col("city"), ", ")
profiles_with_state = profiles.withColumn("state", _city_parts.getItem(1))

# Distinct, non-null states found in the profiles
_state_column = profiles_with_state.select("state")
states_df = _state_column.dropDuplicates().dropna()
states_df.display()

#### Scraping

In [0]:
%pip install selenium
%pip install beautifulsoup4

## This notebook is way too big, so maybe move the scraping process to a new notebook and save the data in a parquet file to read from this notebook?
Yes, we should divide the parts into different files; it will be good to display them separately in git

exactly

In [0]:
# Add a 'state' column (the second ", "-separated piece of 'city') and expose the
# same data frame under both names.
profiles_with_count = profiles_with_state = profiles.withColumn(
    'state',
    split(col('city'), ', ').getItem(1),
)

## Good Profiles Model

### I want to predict a numeric score and not a binary label -- it will be better for the final stage of suggesting improvements
### Maybe predict categories of score (example below)

### Training the Model

possible models:
- Decision Tree Regressor
- Random Forest Regressor
- Gradient-Boosted Trees Regressor



### Evaluating the model

when checking accuracy - accepted score should be between (real_score-5, real_score+5)

## Profile Optimization

### 'about' Section Optimization

In [0]:
# take: about (if not null), position, job title, recommendations
# --> return: a sentence or two describing the person and job (in a new column called 'new_about')
# if all null: return message 'could not generate a short bio -- add more information to your profile' (put null in 'new_about' and add message in a new column called 'about_message')

### Improvements and Suggestions

score ranking:
- excellent score - 90+ and no suggestions
- high score - 90+ and at least one suggestion
- medium high score - 60-90
- medium score - 40-60
- medium low score - 20-40
- low score - below 20

In [0]:
# Headline message for each score rank, keyed by the rank's name.
score_messages = {
    "excellent score": "Your profile is excellent, keep it up!",
    "high score": "Your profile is very strong, Check the suggestions to make it excellent",
    "medium high score": "Your profile is good, Try to follow the suggestions to make it even better",
    "medium score": "Your profile could use a few improvements, Try to follow the suggestions to make it even better",
    "medium low score": "Your profile needs to improve, Try to follow the suggestion to make it better",
    "low score": "Your profile is weak, Try to follow the suggestion to make it better",
}

In [0]:
# Suggestion text for each profile shortcoming, keyed by a short identifier.
# The two "about" entries end with ": " because a generated bio is appended to them.
# NOTE(review): 'comapnies' (twice) and 'with you account' look like typos in the
# user-facing text; they are reproduced unchanged here.
missing_field_messages = {
    "no_experience": "Add previous/current comapnies you worked in",
    "no_education": "List your degrees and schools you graduated from",
    "no_about": "Add a short bio about yourself, here is a suggestion: ",
    "suggested_about": "Try out this about section: ",
    "no_company": "Add the company you currently work in",
    "no_languages": "List all the languages you know and the level of knowledge",
    "no_position": "Add the position you are currently in",
    "no_posts": "Try to be more active with you account",
    "no_recommendations": "Ask a colleague to write a few words about you",
    "missing_experience": "There is a gap in your resume, Don't forget to add all of the previous comapnies you worked in",
    "low_followers": "Ask your colleagues and friends to follow you on LinkedIn!",
}

In [0]:
# placeholder name for the predictions: predicted_df (has all the previous columns + score predictions)

# Bucket the numeric score into a named rank. The conditions are tried in order,
# so each branch only sees scores that the branches above it did not match.
# Scores of 90 and above are 'high score' while 'filled_percent' is below 100 and
# 'excellent score' otherwise.
# NOTE(review): a null score fails every '<' comparison and therefore falls
# through to the last two branches -- confirm scores are never null.
predicted_df = predicted_df.withColumn(
    'score_rank',
    f.when(f.col('score') < 20, 'low score')
    .when(f.col('score') < 40, 'medium low score')
    .when(f.col('score') < 60, 'medium score')
    .when(f.col('score') < 90, 'medium high score')
    .when(f.col('filled_percent') < 100, 'high score')
    .otherwise('excellent score')
)

# A Python dict cannot be indexed by a Column, so the dictionary is converted to
# a Spark map literal (key, value, key, value, ...) and looked up per row by rank.
score_message_map = f.create_map(
    *[f.lit(part) for rank_and_message in score_messages.items() for part in rank_and_message]
)

predicted_df = predicted_df.withColumn(
    'score_message',
    score_message_map[f.col('score_rank')]
)

In [0]:
# find if there are gaps in the experience array (name new column: 'gap_in_experience')
# TODO: Binary or explicit time period? 

In [0]:
def _is_missing_array(column_name):
    """Return a boolean Column that is true when the array column is null or empty."""
    array_column = f.col(column_name)
    return array_column.isNull() | (f.size(array_column) == 0)


def _with_generated_about(message_key):
    """Return a Column with the message for `message_key` followed by 'new_about'."""
    # The message must be wrapped in a literal and joined with concat: '+' between a
    # Python string and a Column is arithmetic addition, not string concatenation.
    return f.concat(f.lit(missing_field_messages[message_key]), f.col('new_about'))


# One candidate per check: the suggestion text when the check applies, null otherwise.
candidate_suggestions = f.array(
    f.when(
        _is_missing_array('education'),
        missing_field_messages['no_education']),
    f.when(
        # 'current_company' is read as a struct with a 'name' field when the jobs
        # frame is built, so a missing company is detected through that field.
        # NOTE(review): assumes predicted_df keeps the same 'current_company' schema.
        f.col('current_company').getField('name').isNull(),
        missing_field_messages['no_company']),
    f.when(
        _is_missing_array('languages'),
        missing_field_messages['no_languages']),
    f.when(
        _is_missing_array('posts'),
        missing_field_messages['no_posts']),
    f.when(
        f.col('recommendations_count') == 0,
        missing_field_messages['no_recommendations']),
    f.when(
        f.col('about').isNull() & f.col('new_about').isNotNull(),
        _with_generated_about('no_about')),
    f.when(
        # The comparison is parenthesised because '&' binds tighter than '<'.
        f.col('about').isNotNull() & f.col('new_about').isNotNull() & (f.col('score') < 90),
        _with_generated_about('suggested_about')),
    f.when(
        f.col('about_message').isNotNull(),
        f.col('about_message')),
    f.when(
        f.col('position').isNull(),
        missing_field_messages['no_position']),
    f.when(
        f.col('followers') < 20,
        missing_field_messages['low_followers']),
    f.when(
        _is_missing_array('experience'),
        missing_field_messages['no_experience']),
    f.when(
        f.col('gap_in_experience').isNotNull(),  # TODO: adapt to binary or time period
        missing_field_messages['missing_experience'])
)

# Checks that did not apply evaluate to null; drop them so 'suggestions' only
# holds real messages (and is empty when nothing needs improving).
predicted_df = predicted_df.withColumn(
    'suggestions',
    f.filter(candidate_suggestions, lambda suggestion: suggestion.isNotNull())
)

In [0]:
# Final output: who the profile belongs to, its rank and message, and the suggestions.
_output_columns = ['name', 'id', 'url', 'score_rank', 'score_message', 'suggestions']
optemized_df = predicted_df.select(*_output_columns)
display(optemized_df)