# Profile Pro: A LinkedIn Profile Optimizer
## Final Project - Data Collection Lab (0940290)
### Lihi Kaspi (214676140), Harel Oved (326042389) & Lior Zaphir (326482213)

In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re

# Pandas display settings used when previewing converted DataFrames in the notebook.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Obtain the Spark session (an existing one is reused, otherwise one is created).
spark = SparkSession.builder.getOrCreate()

## Relevant Data

In [0]:
# Original datasets, read from Parquet.
companies = spark.read.format('parquet').load('/dbfs/linkedin_train_data')
profiles = spark.read.format('parquet').load('/dbfs/linkedin_people_train_data')

In [0]:
# Pre-processed DataFrame holding the feature vector that is fed to the models.
processed_data = spark.read.format("parquet").load(
    "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
)

In [0]:
# Bucket profile_score into five classes (0-4): thresholds at 5, 10, 15 and 20;
# rows matching none of the thresholds take the `otherwise` value 4.
label_column = (
    f.when(f.col('profile_score') < 5, 0)
    .when(f.col('profile_score') < 10, 1)
    .when(f.col('profile_score') < 15, 2)
    .when(f.col('profile_score') < 20, 3)
    .otherwise(4)
)
processed_data = processed_data.withColumn('label', label_column)

## Good Profiles Model

### Training the Model

#### numeric models

In [0]:
# 70/30 random split with a fixed seed.
train_df, test_df = processed_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [0]:
from pyspark.ml.functions import vector_to_array

# Validation rule for both splits: drop rows containing nulls, then keep only rows
# whose 'features' vector has exactly 133 entries.

# Training data
train_df = train_df.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

# Test data
test_df = test_df.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor: predicts 'profile_score' from the 'features' vector.
rf = (
    RandomForestRegressor()
    .setFeaturesCol("features")
    .setLabelCol("profile_score")
)

# Train on the training split
rf_model = rf.fit(train_df)

# Score the test split and show the result
rf_predictions = rf_model.transform(test_df)
display(rf_predictions)

In [0]:
# A prediction is 'accurate' when it lies within 3 points of the real score (bounds inclusive).
# NOTE(review): the GBT evaluation and the markdown note in this notebook use a +/-5 window,
# so the two accuracies are not directly comparable - confirm which tolerance is intended.
rf_predictions = rf_predictions.withColumn(
    'accurate',
    f.col('prediction').between(
        f.col('profile_score') - 3,
        f.col('profile_score') + 3,
    )
)

# Share of test rows flagged as accurate
len_df = rf_predictions.count()
count_accurate = rf_predictions.filter(f.col('accurate') == 1).count()
rf_accuracy = count_accurate / len_df
print(rf_accuracy)

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor: predicts 'profile_score' from the 'features' vector.
gbt = (
    GBTRegressor()
    .setFeaturesCol("features")
    .setLabelCol("profile_score")
)

# Train using only the two columns the model needs
gbt_training_columns = train_df.select('features', 'profile_score')
gbt_model = gbt.fit(gbt_training_columns)

# Score the test split and show the result
gbt_predictions = gbt_model.transform(test_df)
display(gbt_predictions)

In [0]:
# A prediction is 'accurate' when it lies within 5 points of the real score (bounds inclusive).
gbt_predictions = gbt_predictions.withColumn(
    'accurate',
    f.col('prediction').between(
        f.col('profile_score') - 5,
        f.col('profile_score') + 5,
    )
)

# Share of test rows flagged as accurate
len_df = gbt_predictions.count()
count_accurate = gbt_predictions.filter(f.col('accurate') == 1).count()
gbt_accuracy = count_accurate / len_df
print(gbt_accuracy)

#### multiclass classifiers

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the 'prediction' column against 'label';
# used by the MLP cells in this section.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [0]:
# Three-class variant: thresholds at 7 and 17; everything else takes the `otherwise` value 2.
label_column = (
    f.when(f.col('profile_score') < 7, 0)
    .when(f.col('profile_score') < 17, 1)
    .otherwise(2)
)
multi_processed_data_3 = processed_data.withColumn('label', label_column)

# 70/30 random split with a fixed seed
train_multi_df_3, test_multi_df_3 = multi_processed_data_3.randomSplit(
    weights=[0.7, 0.3], seed=42
)

In [0]:
from pyspark.ml.functions import vector_to_array

# Validate the 3-class split: drop rows containing nulls and keep only rows whose
# 'features' vector has exactly 133 entries.
# The split cell produces train_multi_df_3 / test_multi_df_3, so those are the inputs here
# (the previous code read train_multi_df / test_multi_df before they were ever defined).
# The cleaned frames are bound to train_multi_df / test_multi_df, the names the
# 3-class MLP cell trains and evaluates on.

# Validate the training data
train_multi_df = train_multi_df_3.na.drop()
train_multi_df = train_multi_df.filter(f.size(vector_to_array(f.col('features'))) == 133)

# Validate the test data
test_multi_df = test_multi_df_3.na.drop()
test_multi_df = test_multi_df.filter(f.size(vector_to_array(f.col('features'))) == 133)

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: 133 inputs (feature count) -> hidden layers of 64 and 32 -> 3 outputs (classes)
layers = [133, 64, 32, 3]

# Configure the MLP classifier
mlp = (
    MultilayerPerceptronClassifier()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setMaxIter(100)
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(42)
)

# Fit on the training split
mlp_model = mlp.fit(train_multi_df)

# Predict on the test split
mlp_predictions = mlp_model.transform(test_multi_df)

# Report accuracy
accuracy = evaluator.evaluate(mlp_predictions)
print(f"MLP Accuracy: {accuracy}")

In [0]:
# Four-class variant: thresholds at 5, 12 and 20; everything else takes the `otherwise` value 3.
label_column = (
    f.when(f.col('profile_score') < 5, 0)
    .when(f.col('profile_score') < 12, 1)
    .when(f.col('profile_score') < 20, 2)
    .otherwise(3)
)
multi_processed_data_4 = processed_data.withColumn('label', label_column)

# 70/30 random split with a fixed seed
train_multi_df_4, test_multi_df_4 = multi_processed_data_4.randomSplit(
    weights=[0.7, 0.3], seed=42
)

In [0]:
from pyspark.ml.functions import vector_to_array

# Validation rule for both splits: drop rows containing nulls, then keep only rows
# whose 'features' vector has exactly 133 entries.

# Training data
train_multi_df_4 = train_multi_df_4.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

# Test data
test_multi_df_4 = test_multi_df_4.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: 133 inputs (feature count) -> hidden layers of 64 and 32 -> 4 outputs (classes)
layers = [133, 64, 32, 4]

# Configure the MLP classifier
mlp = (
    MultilayerPerceptronClassifier()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setMaxIter(100)
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(42)
)

# Fit on the training split
mlp_model = mlp.fit(train_multi_df_4)

# Predict on the test split
mlp_predictions = mlp_model.transform(test_multi_df_4)

# Report accuracy
accuracy = evaluator.evaluate(mlp_predictions)
print(f"MLP Accuracy: {accuracy}")

In [0]:
# Five-class variant: thresholds at 5, 10, 15 and 20; everything else takes the `otherwise` value 4.
label_column = (
    f.when(f.col('profile_score') < 5, 0)
    .when(f.col('profile_score') < 10, 1)
    .when(f.col('profile_score') < 15, 2)
    .when(f.col('profile_score') < 20, 3)
    .otherwise(4)
)
multi_processed_data_5 = processed_data.withColumn('label', label_column)

# 70/30 random split with a fixed seed
train_multi_df_5, test_multi_df_5 = multi_processed_data_5.randomSplit(
    weights=[0.7, 0.3], seed=42
)

In [0]:
from pyspark.ml.functions import vector_to_array

# Validation rule for both splits: drop rows containing nulls, then keep only rows
# whose 'features' vector has exactly 133 entries.

# Training data
train_multi_df_5 = train_multi_df_5.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

# Test data
test_multi_df_5 = test_multi_df_5.dropna().filter(
    f.size(vector_to_array(f.col('features'))) == 133
)

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: 133 inputs (feature count) -> hidden layers of 64 and 32 -> 5 outputs (classes)
layers = [133, 64, 32, 5]

# Configure the MLP classifier
mlp = (
    MultilayerPerceptronClassifier()
    .setFeaturesCol("features")
    .setLabelCol("label")
    .setMaxIter(100)
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(42)
)

# Fit on the training split
mlp_model = mlp.fit(train_multi_df_5)

# Predict on the test split
mlp_predictions = mlp_model.transform(test_multi_df_5)

# Report accuracy
accuracy = evaluator.evaluate(mlp_predictions)
print(f"MLP Accuracy: {accuracy}")

In [0]:
import matplotlib.pyplot as plt

# Bring the 'prediction' column of the latest MLP run to the driver as a pandas DataFrame.
# NOTE(review): the plotted values are the MLP's predictions, while the title and x-label
# say "Profile Score" - confirm the labelling is intended.
sample = mlp_predictions.select('prediction').toPandas()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sample['prediction'], bins=30, edgecolor='k', alpha=0.7)
ax.set_title('Histogram of Profile Scores')
ax.set_xlabel('Profile Score')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

When checking accuracy, a predicted score is accepted if it lies within 5 points of the real score, i.e. between (real_score-5, real_score+5).

## Profile Optimization

### 'about' Section Optimization

In [0]:
# Plan for this section:
# take: about (if not null), position, job title, recommendations
# --> return: a sentence or two describing the person and job (in a new column called 'new_about')
# if all are null: return the message 'could not generate a short bio -- add more information to your profile'
# (put null in 'new_about' and add the message in a new column called 'about_message')

# NOTE(review): placeholder - this is an empty string, not a DataFrame yet.
good_profiles_df = ''







In [0]:
def generate_about(input_prompt, model, tokenizer):
    """Generate text for *input_prompt* with the given model/tokenizer pair.

    Args:
        input_prompt: Prompt text handed to ``tokenizer``.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Callable that encodes the prompt into a mapping containing
            ``"input_ids"`` and that exposes ``decode(...)``.

    Returns:
        The result of ``tokenizer.decode`` applied to the first generated
        sequence, with special tokens skipped.
    """
    # Encode the prompt, truncated to at most 512 tokens
    encoded = tokenizer(
        input_prompt,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Beam-search generation settings
    generation_options = {
        "max_length": 150,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **generation_options)

    # Only the first returned sequence is decoded
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example usage
# NOTE(review): `model` and `tokenizer` are not defined anywhere in the visible notebook;
# they must be loaded before this cell can run - TODO confirm where they come from.
input_prompt = "City: New York, Education: Master's in Data Science from Columbia University, Name: Jane Doe, Position: Data Scientist"
about_section = generate_about(input_prompt, model, tokenizer)
print("Generated About Section:", about_section)


### Improvements and Suggestions

score ranking:
- excellent score - 90+ and no suggestions
- high score - 90+ and at least one suggestion
- medium high score - 60-90
- medium score - 40-60
- medium low score - 20-40
- low score - below 20

In [0]:
# Message shown to the user for each score rank (keys match the 'score_rank' values).
score_messages = dict([
    ("excellent score", "Your profile is excellent, keep it up!"),
    ("high score", "Your profile is very strong, Check the suggestions to make it excellent"),
    ("medium high score", "Your profile is good, Try to follow the suggestions to make it even better"),
    ("medium score", "Your profile could use a few improvements, Try to follow the suggestions to make it even better"),
    ("medium low score", "Your profile needs to improve, Try to follow the suggestion to make it better"),
    ("low score", "Your profile is weak, Try to follow the suggestion to make it better"),
])

In [0]:
# Suggestion text for each missing or weak profile field.
# NOTE(review): some messages contain typos ("comapnies"); they are runtime strings and are
# left untouched here - fix them deliberately if desired.
missing_field_messages = dict([
    ("no_experience", "Add previous/current comapnies you worked in"),
    ("no_education", "List your degrees and schools you graduated from"),
    ("no_about", "Add a short bio about yourself, here is a suggestion: "),
    ("suggested_about", "Try out this about section: "),
    ("no_company", "Add the company you currently work in"),
    ("no_languages", "List all the languages you know and the level of knowledge"),
    ("no_position", "Add the position you are currently in"),
    ("no_posts", "Tell your friends about projects you currently work on"),
    ("no_recommendations", "Ask a colleague to write a few words about you"),
    ("missing_experience", "There is a gap in your resume, Don't forget to add all of the previous comapnies you worked in"),
    ("low_followers", "Ask your colleagues and friends to follow you on LinkedIn!"),
])

In [0]:
# placeholder name for the predictions: predicted_df (has all the previous columns + score predictions)

# Rank each profile by its score. Scores of 90 and above are split by 'filled_percent':
# below 100 -> 'high score', otherwise 'excellent score'.
# NOTE(review): the markdown above defines high vs. excellent by whether there are suggestions;
# 'filled_percent' appears to be used as a proxy for that - confirm.
predicted_df = predicted_df.withColumn(
  'score_rank', 
  f.when(f.col('score') < 20, 'low score'
  ).when(f.col('score') < 40, 'medium low score'
  ).when(f.col('score') < 60, 'medium score'
  ).when(f.col('score') < 90, 'medium high score'
  ).when(f.col('filled_percent') < 100, 'high score'
  ).otherwise('excellent score')
)

# Translate the rank into its message per row.
# A Python dict lookup (score_messages.get(<Column>)) runs once on the driver with the Column
# object itself as the key, so it can never yield a per-row value. The dict is therefore turned
# into a Spark map literal (key1, value1, key2, value2, ...) and indexed by the rank column.
score_message_map = f.create_map(
  *[f.lit(item) for rank_and_message in score_messages.items() for item in rank_and_message]
)
predicted_df = predicted_df.withColumn(
  'score_message',
  score_message_map[f.col('score_rank')]
)

In [0]:
# find if there are gaps in the experience array (name new column: 'gap_in_experience')
# TODO: Binary or explicit time period? 

In [0]:
# Build the per-profile list of suggestions.
# Each f.when(...) below is one rule: it yields its message when the condition holds and null
# otherwise. The rules are collected into an array and the nulls are filtered out, so
# 'suggestions' contains only the messages that actually apply to the profile.
# Fixes relative to the previous version:
#   * message + new_about is built with f.concat; `str + Column` is arithmetic addition in Spark,
#     not string concatenation.
#   * the `score < 90` comparison is parenthesised; `&` binds tighter than `<`, so without the
#     parentheses the whole conjunction was compared to 90.
#   * the separate empty-array initialisation was dropped because it was immediately overwritten.
predicted_df = predicted_df.withColumn(
  'suggestions',
  f.filter(
    f.array(
      f.when(
        f.size(f.col('education')) == 0, 
        missing_field_messages.get('no_education')),
      f.when(
        f.size(f.col('current_company')) == 0, 
        missing_field_messages.get('no_company')),
      f.when(
        f.size(f.col('languages')) == 0, 
        missing_field_messages.get('no_languages')),
      f.when(
        f.size(f.col('posts')) == 0, 
        missing_field_messages.get('no_posts')),
      f.when(
        f.col('recommendations_count') == 0, 
        missing_field_messages.get('no_recommendations')),
      # no 'about' at all: propose the generated one
      f.when(
        f.col('about').isNull() & f.col('new_about').isNotNull(), 
        f.concat(f.lit(missing_field_messages.get('no_about')), f.col('new_about'))),
      # 'about' exists but the score is below 90: offer the generated one as an alternative
      f.when(
        f.col('about').isNotNull() & f.col('new_about').isNotNull() & (f.col('score') < 90), 
        f.concat(f.lit(missing_field_messages.get('suggested_about')), f.col('new_about'))),
      f.when(
        f.col('about_message').isNotNull(), 
        f.col('about_message')),
      f.when(
        f.col('position').isNull(),
        missing_field_messages.get('no_position')),
      f.when(
        f.col('followers') < 20,
        missing_field_messages.get('low_followers')),
      f.when(
        f.size(f.col('experience')) == 0, 
        missing_field_messages.get('no_experience')), 
      f.when(
        f.col('gap_in_experience').isNotNull(), # TODO: adapt to binary or time period
        missing_field_messages.get('missing_experience'))
    ),
    lambda suggestion: suggestion.isNotNull()
  )
)

In [0]:
# Keep only the identifying columns plus the rank, message and suggestions, then show the result.
optemized_df = predicted_df.select(
    ['name', 'id', 'url', 'score_rank', 'score_message', 'suggestions']
)
display(optemized_df)

### Re-Evaluating the Profiles Post Optimization

In [0]:
# need to see how following the suggestions improves the score - present the result by increasing number of suggestions