# Profile Pro: A LinkedIn Profile Optimizer

In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re
import shutil
import os
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.functions import vector_to_array
import matplotlib.pyplot as plt
from pyspark.ml.classification import MultilayerPerceptronClassificationModel
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Show every column and use a wide console so pandas previews of the
# profile tables are not truncated.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# Obtain the Spark session used by every cell below (getOrCreate returns the
# already-running session when there is one instead of starting a new one).
spark = SparkSession.builder.getOrCreate()

## Relevant Data

In [0]:
# Profile records; joined back onto the model predictions by `id` further down.
profiles_path = '/dbfs/linkedin_people_train_data'
profiles = spark.read.parquet(profiles_path)

# Pre-processed frame holding the feature vector ('features') that is fed
# into the model, together with 'profile_score'.
processed_data_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
processed_data = spark.read.parquet(processed_data_path)

In [0]:
# Bucket profile_score into five ordinal classes in steps of 5:
#   score < 5 -> 0, < 10 -> 1, < 15 -> 2, < 20 -> 3, everything else -> 4.
# "Everything else" also covers a NULL score, because no `when` branch
# matches a NULL comparison.
profile_score = f.col('profile_score')
label_expr = f.when(profile_score < 5, 0)
for bucket, upper_bound in enumerate((10, 15, 20), start=1):
    label_expr = label_expr.when(profile_score < upper_bound, bucket)

processed_data = processed_data.withColumn('label', label_expr.otherwise(4))

## Good Profiles Score Model

### Training the Model

In [0]:
# Two evaluators over the same label/prediction columns, differing only in
# the metric they report (overall accuracy and F1).
evaluator_accuracy, evaluator_f1 = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("accuracy", "f1")
)

In [0]:
# 70% train / 30% test, with a fixed seed so the split is repeatable.
split_weights = [0.7, 0.3]
train_df, test_df = processed_data.randomSplit(split_weights, seed=42)

In [0]:
def _keep_valid_rows(df):
    """Return `df` without rows containing nulls or a feature vector whose
    length is not 133 (the input size the network below expects)."""
    feature_length = f.size(vector_to_array(f.col('features')))
    return df.na.drop().filter(feature_length == 133)


# Apply the same validation to both splits.
train_df = _keep_valid_rows(train_df)
test_df = _keep_valid_rows(test_df)

In [0]:
# Network shape: 133 inputs (the feature-vector length enforced on both
# splits), two hidden layers of 64 and 32 units, and 5 outputs (labels 0-4).
layers = [133, 64, 32, 5]

# Configure the multilayer perceptron; the fixed seed keeps training repeatable.
mlp = MultilayerPerceptronClassifier(
    layers=layers,
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    blockSize=128,
    seed=42,
)

# Fit on the training split, then score the held-out test split.
mlp_model = mlp.fit(train_df)
mlp_predictions = mlp_model.transform(test_df)

# Compute both metrics on the test predictions (accuracy first, then F1).
accuracy, f1_score = (
    evaluator.evaluate(mlp_predictions)
    for evaluator in (evaluator_accuracy, evaluator_f1)
)

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1_score}")

Accuracy: 0.8622098843801024
F1-Score: 0.860501719512912


In [0]:
# Persist the trained model. `write().overwrite()` already replaces any model
# stored at the target path, so no manual cleanup is required. An
# os.path.exists / shutil.rmtree guard would not help here: `model_path` is a
# `dbfs:/` URI, which the local-filesystem functions in `os`/`shutil` do not
# resolve, so such a check never matches the stored model.
model_path = 'dbfs:/Workspace/Users/lihi.kaspi@campus.technion.ac.il/mlp_model'
mlp_model.write().overwrite().save(model_path)

## Profile Optimization

### 'about' Section Optimization

In [0]:
def generate_about(input_prompt, model, tokenizer):
    """Generate an 'about' text for a profile from a descriptive prompt.

    Args:
        input_prompt: Prompt string describing the profile.
        model: Object exposing ``generate(input_ids, ...)``; presumably a
            Hugging Face text-generation model -- TODO confirm.
        tokenizer: Callable that encodes the prompt into a mapping with an
            ``"input_ids"`` entry and exposes ``decode(...)``.

    Returns:
        The decoded text of the first sequence returned by ``model.generate``.
    """
    # Encode the prompt, asking the tokenizer to truncate at max_length=512.
    encoded_prompt = tokenizer(
        input_prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Generation settings forwarded unchanged to the model.
    generation_options = {
        "max_length": 150,
        "num_beams": 5,
        "early_stopping": True,
    }
    candidates = model.generate(encoded_prompt["input_ids"], **generation_options)

    # Only the first returned sequence is decoded and returned.
    first_candidate = candidates[0]
    return tokenizer.decode(first_candidate, skip_special_tokens=True)

# Example usage: build a prompt from a few profile fields and print the
# generated 'about' section.
# NOTE(review): `model` and `tokenizer` are not defined in the visible cells --
# presumably they are loaded elsewhere; confirm before running this cell.
input_prompt = "City: New York, Education: Master's in Data Science from Columbia University, Name: Jane Doe, Position: Data Scientist"
about_section = generate_about(input_prompt, model, tokenizer)
print("Generated About Section:", about_section)


### Improvements and Suggestions

In [0]:
# Reload the saved MLP model from storage and score the test split with it,
# rebinding `mlp_model` / `mlp_predictions` for the cells below.
model_path = 'dbfs:/Workspace/Users/lihi.kaspi@campus.technion.ac.il/mlp_model'
mlp_model = MultilayerPerceptronClassificationModel.load(model_path)
mlp_predictions = mlp_model.transform(test_df)

In [0]:
# Attach each prediction to its full profile record. The right join keeps
# exactly the rows that have a prediction (i.e. the scored test profiles).
prediction_by_id = mlp_predictions.select('id', 'prediction')
predicted_df = profiles.join(prediction_by_id, on='id', how='right')

In [0]:
# NOTE: this name-keyed mapping is superseded. The following cell rebinds
# `score_messages` to an integer-keyed dict, and the score_message lookup
# further down indexes it by integer (score_messages[100], score_messages[0], ...).
score_messages = {
    'excellent score': 'Your profile is excellent, keep it up!',
    'high score': 'Your profile is very strong, Check the suggestions to make it excellent',
    'medium high score': 'Your profile is good, Try to follow the suggestions to make it even better',
    'medium score': 'Your profile could use a few improvements, Try to follow the suggestions to make it even better',
    'medium low score': 'Your profile needs to improve, Try to follow the suggestion to make it better',
    'low score': 'Your profile is weak, Try to follow the suggestion to make it better',
}

In [0]:
# User-facing message for each score rank. Keys 0-4 are the predicted label
# classes; 100 is the special rank assigned below to profiles predicted as
# class 4 that also have an empty suggestions list.
score_messages = {
    100: 'Your profile is excellent, keep it up!', # good profiles with no suggestions
    4: 'Your profile is very strong, Check the suggestions to make it excellent',
    3: 'Your profile is good, Try to follow the suggestions to make it even better',
    2: 'Your profile could use a few improvements, Try to follow the suggestions to make it even better',
    1: 'Your profile needs to improve, Try to follow the suggestion to make it better',
    0: 'Your profile is weak, Try to follow the suggestion to make it better',
}

In [0]:
# User-facing suggestion text, keyed by the profile deficiency it addresses.
# These strings are shown verbatim in the `suggestions` column.
missing_field_messages = {
    'no_experience': 'Add previous/current companies you worked in',
    'no_education': 'List your degrees and schools you graduated from',
    'no_about': 'Add a short bio about yourself, here is a suggestion: ', # for profiles with no about section at all
    'suggested_about': 'Try out this about section: ', # for bad profiles only
    'no_company': 'Add the company you currently work in',
    'no_languages': 'List all the languages you know and the level of knowledge',
    'no_position': 'Add the position you are currently in',
    'no_posts': 'Tell your friends about projects you currently work on',
    'no_recommendations': 'Ask a colleague to write a few words about you',
    # 'missing_experience': 'There is a gap in your resume, Don\'t forget to add all of the previous companies you worked in',
    'low_followers': 'Ask your colleagues and friends to follow you on LinkedIn!'
    }

In [0]:
def _is_empty_array(column_name):
    """Return a boolean Column that is true when the array column is NULL or empty.

    `size()` alone is not enough: for a NULL array it yields -1 or NULL
    (depending on Spark configuration), so `size(col) == 0` never matches a
    profile whose section is missing entirely.
    """
    column = f.col(column_name)
    return column.isNull() | (f.size(column) == 0)


# (condition, message key) pairs, in the order the suggestions are listed.
# NULL counts are treated as 0 so that a missing value still yields the
# suggestion. NOTE(review): this assumes NULL means "absent" in the profiles
# data -- confirm against the dataset schema.
suggestion_rules = [
    (_is_empty_array('education'), 'no_education'),
    (f.col('current_company').isNull(), 'no_company'),
    (_is_empty_array('languages'), 'no_languages'),
    (_is_empty_array('posts'), 'no_posts'),
    (f.coalesce(f.col('recommendations_count'), f.lit(0)) == 0, 'no_recommendations'),
    (f.col('position').isNull(), 'no_position'),
    (f.coalesce(f.col('followers'), f.lit(0)) < 20, 'low_followers'),
    (_is_empty_array('experience'), 'no_experience'),
]

# Each rule contributes its message when the condition holds and NULL
# otherwise, so the resulting array still contains NULL placeholders that
# have to be stripped afterwards.
predicted_df = predicted_df.withColumn(
    'suggestions',
    f.array(*[
        f.when(condition, missing_field_messages[message_key])
        for condition, message_key in suggestion_rules
    ]),
)

In [0]:
# Strip the NULL placeholders left by rules that did not fire: subtracting a
# one-element [NULL] array from the suggestions array removes them.
null_placeholder = f.array(f.lit(None))
cleaned_suggestions = f.array_except("suggestions", null_placeholder)
predicted_df = predicted_df.withColumn("suggestions", cleaned_suggestions)

display(predicted_df.select('suggestions'))

suggestions
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
"List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
"List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
"List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Add previous/current comapnies you worked in)"
"List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
"List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"


In [0]:
# Rank each profile: a class-4 prediction with no outstanding suggestions is
# promoted to the special rank 100; every other profile keeps its predicted
# class as its rank.
has_no_suggestions = f.size(f.col('suggestions')) == 0
predicted_df = predicted_df.withColumn(
    'score_rank',
    f.when((f.col('prediction') == 4) & has_no_suggestions, 100)
     .otherwise(f.col('prediction')),
)

# Map the rank to its user-facing message with Column expressions rather than
# by splicing the message text into a SQL CASE string: literals passed through
# f.lit need no quoting, so a message containing an apostrophe cannot break
# the expression. Ranks without an entry fall back to 'Unknown'.
score_message_col = f.lit('Unknown')
for rank, message in score_messages.items():
    score_message_col = (
        f.when(f.col('score_rank') == rank, f.lit(message))
         .otherwise(score_message_col)
    )

predicted_df = predicted_df.withColumn('score_message', score_message_col)

In [0]:
# Preview the per-profile feedback: who, the overall message, and the
# concrete suggestions.
feedback_columns = ['name', 'score_message', 'suggestions']
display(predicted_df.select(*feedback_columns))

name,score_message,suggestions
Мила Жихарева,"Your profile is weak, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
عالین ولف,"Your profile is weak, Try to follow the suggestion to make it better","List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
عبدالرحمان لحماري,"Your profile is weak, Try to follow the suggestion to make it better","List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
赵云,"Your profile is weak, Try to follow the suggestion to make it better","List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
小黃魚 Huang,"Your profile is weak, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
王少云,"Your profile needs to improve, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
汪楠佳,"Your profile is weak, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Add previous/current comapnies you worked in)"
刘箫,"Your profile is weak, Try to follow the suggestion to make it better","List(List your degrees and schools you graduated from, List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"
邢雨川,"Your profile is weak, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!, Add previous/current comapnies you worked in)"
권미희,"Your profile is weak, Try to follow the suggestion to make it better","List(List all the languages you know and the level of knowledge, Tell your friends about projects you currently work on, Ask your colleagues and friends to follow you on LinkedIn!)"


In [0]:
# Final output: identifying fields plus the rank, message and suggestions.
output_columns = ['name', 'id', 'url', 'score_rank', 'score_message', 'suggestions']
optemized_df = predicted_df.select(*output_columns)
display(optemized_df)