# Profile Pro: A LinkedIn Profile Optimizer
## Final Project - Data Collection Lab (0940290)
### Lihi Kaspi (214676140), Harel Oved (326042389) & Lior Zaphir (326482213)

In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re

# Render every column and allow wide rows when pandas frames are displayed.
for _option_name, _option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(_option_name, _option_value)

# Reuse the active Spark session if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

## Relevant Data

In [0]:
# Original datasets, stored as parquet: one frame read into `companies`, one into `profiles`.
COMPANIES_PATH = '/dbfs/linkedin_train_data'
PROFILES_PATH = '/dbfs/linkedin_people_train_data'

companies = spark.read.parquet(COMPANIES_PATH)
profiles = spark.read.parquet(PROFILES_PATH)

In [0]:
# Preview the rows in which every one of the selected columns is populated.
preview_columns = ['city', 'education', 'name', 'position', 'about']
profiles.select(preview_columns).dropna().display()

In [0]:
# Frame that already carries the processed feature vector consumed by the models below.
PROCESSED_DATA_PATH = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/processed_data.parquet"
processed_data = spark.read.parquet(PROCESSED_DATA_PATH)

## Pre-process good profile data

In [0]:
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.sql.types import StringType

# Keep only the columns used to build the prompt and discard rows missing any of them.
good_profiles_df = (
    profiles
    .select('city', 'education', 'name', 'position', 'about')
    .na.drop()
)



def strip_and_choose_first(str_lst):
    """Return the first element of a stringified list such as ``"[a, b, c]"``.

    The surrounding square brackets are stripped and the text is split on
    ``", "``; only the first piece is returned.

    Args:
        str_lst: String rendering of a list, or ``None`` (a null column value
            reaches a Python UDF as ``None``).

    Returns:
        The first element as a string. An empty list (``"[]"``) and ``None``
        both yield ``""``.
    """
    # A null column value would otherwise raise AttributeError on .strip().
    if str_lst is None:
        return ""
    return str_lst.strip("[]").split(", ")[0]


# UDF body: condense the stringified education columns into one readable phrase.
def process_education(degree, field, title):
    """Build ``"<degree> in <field> from <title>"`` from the first entry of each input.

    Args:
        degree: Stringified list of degrees.
        field: Stringified list of study fields.
        title: Stringified list of school titles.

    Returns:
        A single string combining the first degree, field and school title.
    """
    first_degree, first_field, first_title = (
        strip_and_choose_first(raw_value) for raw_value in (degree, field, title)
    )
    return f"{first_degree} in {first_field} from {first_title}"

# Register UDF
process_education_udf = udf(process_education, StringType())

# Keep rows whose education array exists and holds at least one entry.
# The emptiness test uses f.size instead of comparing against f.lit([]): a Python
# list literal is not accepted by every Spark version and is untyped where it is.
filtered_df = good_profiles_df.filter(
    col("education").isNotNull() & (f.size(col("education")) > 0)
)

# Extract each field across the education entries and render it as a string
# (e.g. "[a, b]"), which is the format process_education expects.
filtered_df = filtered_df.withColumn('degree', col('education').getField('degree').cast('string'))
filtered_df = filtered_df.withColumn('field', col('education').getField('field').cast('string'))
filtered_df = filtered_df.withColumn('school', col('education').getField('title').cast('string'))

# Summarise the first education entry of every profile as one phrase.
good_profiles_df = filtered_df.withColumn("processed_education",
                                          process_education_udf(col('degree'), col('field'), col('school')))

# Build the model prompt: city, education phrase, name and position joined by ", ".
processed_df = good_profiles_df.withColumn(
                                    "input_prompt",
                                    concat_ws(
                                        ", ",
                                        col("city"),
                                        col("processed_education"),
                                        col("name"),
                                        col("position"),
                                    )
                            )
processed_df.display()


### Tokenization and preparation of input column

In [0]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

# Stage 1: wrap the raw prompt string in a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("input_prompt")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Chain both stages, then fit and apply them to the prompt frame.
nlp_pipeline = Pipeline(stages=[document_assembler, tokenizer])
fitted_nlp_pipeline = nlp_pipeline.fit(processed_df)
tokenized_df = fitted_nlp_pipeline.transform(processed_df)

# Inspect the prompt next to its tokens.
tokenized_df.select("input_prompt", "token").display()

# Persist prompt, target text and tokens, replacing any previous output.
training_columns = ["input_prompt", "about", "token"]
(
    tokenized_df
    .select(*training_columns)
    .write
    .mode("overwrite")
    .parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet")
)

In [0]:
# Parquet output of the tokenization step.
file_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet"

# Load it back and print its schema.
df = spark.read.parquet(file_path)
df.printSchema()

# Re-export the same rows as JSON, then read the JSON copy back and show it.
json_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/tokenized_data.json"
df.write.mode("overwrite").json(json_path)
df = spark.read.json(json_path)
df.display()

In [0]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
import torch
import json

# Custom PyTorch dataset
class CustomDataset(Dataset):
    """Prompt / "about" pairs for causal language-model fine-tuning.

    Every record must hold the target text under ``"about"`` and the prompt in
    one of these forms (checked in this order):

    * ``"result"``: a flat list of token strings,
    * ``"token"``: a list of annotation dicts, each with a ``"result"`` string
      (the layout of the ``token`` column saved by the tokenization cell),
    * ``"input_prompt"``: the raw prompt string.

    Args:
        json_file: Path to a JSON-lines file, or to a directory of ``*.json``
            part files (what Spark's ``DataFrame.write.json`` produces).
        tokenizer: Callable tokenizer invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors="pt"``; it must return
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, json_file, tokenizer, max_length=512):
        self.data = self._load_records(json_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _load_records(json_path):
        """Read JSON-lines records from a single file or from a directory of part files."""
        import os  # local import: the notebook does not import os at the top

        if os.path.isdir(json_path):
            # Spark writes a directory; marker files (_SUCCESS, *.crc) are skipped.
            part_files = sorted(
                os.path.join(json_path, name)
                for name in os.listdir(json_path)
                if name.endswith(".json")
            )
        else:
            part_files = [json_path]

        records = []
        for part_file in part_files:
            with open(part_file, "r", encoding="utf-8") as handle:
                records.extend(json.loads(line) for line in handle if line.strip())
        return records

    @staticmethod
    def _prompt_text(item):
        """Return the prompt of one record as a single space-joined string."""
        if "result" in item:
            pieces = item["result"]
        elif "token" in item:
            pieces = [annotation["result"] for annotation in item["token"]]
        else:
            return item["input_prompt"]
        return " ".join(pieces)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``."""
        item = self.data[idx]
        prompt = self._prompt_text(item)
        target_text = item["about"]

        # A causal LM learns to continue its own input, so prompt and target are
        # encoded as ONE sequence (prompt, newline, target, end-of-sequence marker).
        eos = getattr(self.tokenizer, "eos_token", None) or ""
        encoded = self.tokenizer(
            f"{prompt}\n{target_text}{eos}",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Labels mirror the inputs (the model shifts them internally); padded
        # positions are set to -100 so the loss ignores them.
        labels = encoded["input_ids"].clone()
        labels[encoded["attention_mask"] == 0] = -100
        encoded["labels"] = labels
        return {key: torch.squeeze(val) for key, val in encoded.items()}

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token, but the dataset pads every sample to
# max_length; reuse the end-of-sequence token so padding does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Initialize dataset and dataloader
dataset = CustomDataset("/dbfs/Workspace/Users/lihi.kaspi@campus.technion.ac.il/tokenized_data.json", tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


### Training

In [0]:
def _run_training_epoch(model, dataloader, optimizer):
    """Run one optimisation pass over every batch yielded by ``dataloader``."""
    for batch in dataloader:
        optimizer.zero_grad()             # clear gradients left from the previous step
        loss = model(**batch).loss        # forward pass; the model returns its own loss
        loss.backward()                   # back-propagate
        optimizer.step()                  # apply the parameter update


# AdamW over all model parameters with a fixed learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Three passes over the training data, with the model switched to training mode.
model.train()
for epoch in range(3):
    _run_training_epoch(model, dataloader, optimizer)
    print(f"Epoch {epoch + 1} completed.")


### Save Model

In [0]:
# Write the model first and the tokenizer second into the same output directory.
output_dir = "path/to/fine_tuned_gpt2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print("Model saved successfully!")


### Inference

In [0]:
def generate_about(input_prompt, model, tokenizer, max_new_tokens=150):
    """Generate an "about" text that continues ``input_prompt``.

    Args:
        input_prompt: Prompt string; it is truncated to 512 tokens.
        model: Causal language model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; must provide ``eos_token_id``
            and ``decode``.
        max_new_tokens: Upper bound on the number of generated tokens. The
            prompt does not count towards it, so a long prompt cannot exhaust
            the generation budget.

    Returns:
        The generated continuation only (the prompt is not echoed), with
        special tokens removed and surrounding whitespace stripped.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    # Tokenize the input prompt
    inputs = tokenizer(input_prompt, return_tensors="pt", max_length=512, truncation=True)
    prompt_length = inputs["input_ids"].shape[-1]

    # After the training cell the model is still in train mode; generation
    # should run with dropout disabled.
    model.eval()

    # Generate output text
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no dedicated pad token
    )

    # The returned sequence starts with the prompt tokens; decode only what follows.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Example usage
# The prompt follows the layout of the training prompts (city, education phrase,
# name, position joined by ", "), which carry no "City:" / "Education:" labels.
input_prompt = "New York, Master's in Data Science from Columbia University, Jane Doe, Data Scientist"
about_section = generate_about(input_prompt, model, tokenizer)
print("Generated About Section:", about_section)


## Good Profiles Model

### I want to predict a numeric score rather than a binary label -- this will be better for the final stage of suggesting improvements
### Maybe predict categories of score (example below)

### Training the Model

possible models:
- Decision Tree Regressor
- Random Forest Regressor
- Gradient-Boosted Trees Regressor



#### numeric models

In [0]:
# 70% / 30% train-test split with a fixed seed.
split_weights = [0.7, 0.3]
train_df, test_df = processed_data.randomSplit(weights=split_weights, seed=42)

In [0]:
def _drop_incomplete_rows(df):
    """Drop rows containing any null, then re-check the features and label columns."""
    without_nulls = df.na.drop()
    has_features = col("features").isNotNull()
    has_label = col("profile_score").isNotNull()
    return without_nulls.filter(has_features & has_label)


# Apply the same validation to the training and the test split.
train_df = _drop_incomplete_rows(train_df)
test_df = _drop_incomplete_rows(test_df)

In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.functions import vector_to_array

# Number of entries every feature vector is expected to have.
EXPECTED_NUM_FEATURES = 133

# Filter out rows with inconsistent feature vector lengths.
# `features` is an ML vector (the regressor requires one) and f.size only accepts
# arrays or maps, so the vector is converted to an array before measuring it.
train_df_filtered = (
    train_df.dropna(subset=['features', 'profile_score'])
            .filter(f.size(vector_to_array(f.col('features'))) == EXPECTED_NUM_FEATURES)
)

# Initialize the model
rf = RandomForestRegressor(featuresCol="features", labelCol="profile_score")

# Fit the model on the length-checked rows (previously the filtered frame was
# built but never used).
rf_model = rf.fit(train_df_filtered)

# Make predictions
rf_predictions = rf_model.transform(test_df)
display(rf_predictions)

In [0]:
# A prediction counts as accurate when it lies within 5 points of the true score
# (both bounds inclusive). Built with Column.between: the previous SQL string used
# a chained comparison and a malformed cast, neither of which Spark SQL accepts.
rf_predictions = rf_predictions.withColumn(
    'accurate',
    f.col('prediction').between(f.col('profile_score') - 5, f.col('profile_score') + 5).cast('int')
)
count_accurate = rf_predictions.where(f.col('accurate') == 1).count()
len_df = rf_predictions.count()
rf_accuracy = count_accurate / len_df
print(rf_accuracy)

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor predicting profile_score from the feature vector.
gbt = GBTRegressor(labelCol="profile_score", featuresCol="features")

# Train on the training split.
gbt_model = gbt.fit(train_df)

# Score the held-out split and show the result.
gbt_predictions = gbt_model.transform(test_df)
display(gbt_predictions)

In [0]:
# A prediction counts as accurate when it lies within 5 points of the true score
# (both bounds inclusive). Built with Column.between: the previous SQL string used
# a chained comparison and a malformed cast, neither of which Spark SQL accepts.
gbt_predictions = gbt_predictions.withColumn(
    'accurate',
    f.col('prediction').between(f.col('profile_score') - 5, f.col('profile_score') + 5).cast('int')
)
count_accurate = gbt_predictions.where(f.col('accurate') == 1).count()
len_df = gbt_predictions.count()
gbt_accuracy = count_accurate / len_df
print(gbt_accuracy)

#### multiclass classifiers

In [0]:
# TODO: turn the numeric score into multiclass labels; for now this is an empty placeholder.
score_labels = list()

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator for the multiclass models. The label column is the class
# label the classifier is trained on ("score_labels"), not the raw numeric
# profile_score, which would never equal a predicted class index.
evaluator = MulticlassClassificationEvaluator(labelCol="score_labels", predictionCol="prediction", metricName="accuracy")

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Define the layers of the neural network
# Input layer = number of features, hidden layers = user-defined, output layer = number of classes
layers = [133, 64, 32, 5]


def add_score_labels(df):
    """Append a ``score_labels`` column holding the class index of ``profile_score``.

    The five classes (0.0 to 4.0) use the score-ranking thresholds applied later
    in the notebook: below 20, 20-40, 40-60, 60-90 and 90 or above. Five classes
    match the size of the network's output layer.
    """
    score = f.col("profile_score")
    return df.withColumn(
        "score_labels",
        f.when(score < 20, 0.0)
         .when(score < 40, 1.0)
         .when(score < 60, 2.0)
         .when(score < 90, 3.0)
         .otherwise(4.0),
    )


# Build the classification frames from the existing train/test split.
training_data = add_score_labels(train_df)
test_data = add_score_labels(test_df)

# Initialize MLP Classifier
mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="score_labels", maxIter=100, layers=layers, blockSize=128, seed=1234)

# Train the model
mlp_model = mlp.fit(training_data)

# Make predictions
mlp_predictions = mlp_model.transform(test_data)

# Evaluate against the class labels. labelCol is overridden for this call so the
# result does not depend on how the shared evaluator was configured.
accuracy = evaluator.evaluate(mlp_predictions, {evaluator.labelCol: "score_labels"})
print(f"MLP Accuracy: {accuracy}")

### Evaluating the model

When checking accuracy, a predicted score is accepted if it lies between real_score - 5 and real_score + 5.

## Profile Optimization

### 'about' Section Optimization

In [0]:
# Planned behaviour (not implemented yet):
#   input:  about (if not null), position, job title, recommendations
#   output: a sentence or two describing the person and job, in a new column called 'new_about'
#   if every input is null: put null in 'new_about' and store the message
#   'could not generate a short bio -- add more information to your profile'
#   in a new column called 'about_message'

# NOTE(review): placeholder -- this rebinds good_profiles_df, which the pre-processing
# cell assigned a DataFrame, to an empty string; confirm nothing later expects the DataFrame.
good_profiles_df = ''







### Improvements and Suggestions

score ranking:
- excellent score - 90+ and no suggestions
- high score - 90+ and at least one suggestion
- medium high score - 60-90
- medium score - 40-60
- medium low score - 20-40
- low score - below 20

In [0]:
# Headline message for each score rank, keyed by the rank name.
score_messages = dict([
    ('excellent score', 'Your profile is excellent, keep it up!'),
    ('high score', 'Your profile is very strong, Check the suggestions to make it excellent'),
    ('medium high score', 'Your profile is good, Try to follow the suggestions to make it even better'),
    ('medium score', 'Your profile could use a few improvements, Try to follow the suggestions to make it even better'),
    ('medium low score', 'Your profile needs to improve, Try to follow the suggestion to make it better'),
    ('low score', 'Your profile is weak, Try to follow the suggestion to make it better'),
])

In [0]:
# Suggestion text for each kind of missing or weak profile field, keyed by issue name.
missing_field_messages = dict(
    no_experience='Add previous/current comapnies you worked in',
    no_education='List your degrees and schools you graduated from',
    no_about='Add a short bio about yourself, here is a suggestion: ',
    suggested_about='Try out this about section: ',
    no_company='Add the company you currently work in',
    no_languages='List all the languages you know and the level of knowledge',
    no_position='Add the position you are currently in',
    no_posts='Try to be more active with you account',
    no_recommendations='Ask a colleague to write a few words about you',
    missing_experience="There is a gap in your resume, Don't forget to add all of the previous comapnies you worked in",
    low_followers='Ask your colleagues and friends to follow you on LinkedIn!',
)

In [0]:
# placeholder name for the predictions: predicted_df (has all the previous columns + score predictions)

# Rank each profile by its score. A null score gets a null rank; without the
# first branch every `<` test on null is null and the row would fall through to
# the 'high score' / 'excellent score' branches.
# NOTE(review): the score-ranking list above separates 'high' from 'excellent' by
# "has suggestions", while this uses filled_percent < 100 -- confirm they agree.
predicted_df = predicted_df.withColumn(
  'score_rank',
  f.when(f.col('score').isNull(), f.lit(None).cast('string')
  ).when(f.col('score') < 20, 'low score'
  ).when(f.col('score') < 40, 'medium low score'
  ).when(f.col('score') < 60, 'medium score'
  ).when(f.col('score') < 90, 'medium high score'
  ).when(f.col('filled_percent') < 100, 'high score'
  ).otherwise('excellent score')
)

# A Python dict cannot be looked up with a Column, so the rank -> message mapping
# is turned into a Spark map literal and indexed by the rank column.
score_message_map = f.create_map(
  *[f.lit(part) for rank_and_message in score_messages.items() for part in rank_and_message]
)

predicted_df = predicted_df.withColumn(
  'score_message',
  score_message_map[f.col('score_rank')]
)

In [0]:
# find if there are gaps in the experience array (name new column: 'gap_in_experience')
# TODO: Binary or explicit time period? 

In [0]:
def _is_missing(column_name):
    """Condition that is true when an array column is null or has no elements."""
    column = f.col(column_name)
    return column.isNull() | (f.size(column) == 0)


def _with_new_about(message_key):
    """Concatenate a fixed message with the generated 'new_about' text.

    Python's ``+`` between a str and a Column is numeric addition, so the
    strings are joined with f.concat instead.
    """
    return f.concat(f.lit(missing_field_messages[message_key]), f.col('new_about'))


has_new_about = f.col('new_about').isNotNull()

# One candidate per check; a candidate is null when its check does not apply.
# NOTE(review): f.size needs array/map columns -- confirm that 'current_company',
# 'languages' and 'posts' are arrays in predicted_df.
suggestion_candidates = [
    f.when(_is_missing('education'), missing_field_messages['no_education']),
    f.when(_is_missing('current_company'), missing_field_messages['no_company']),
    f.when(_is_missing('languages'), missing_field_messages['no_languages']),
    f.when(_is_missing('posts'), missing_field_messages['no_posts']),
    f.when(f.col('recommendations_count') == 0, missing_field_messages['no_recommendations']),
    f.when(f.col('about').isNull() & has_new_about, _with_new_about('no_about')),
    # The comparison is parenthesised: `&` binds tighter than `<` in Python.
    f.when(
        f.col('about').isNotNull() & has_new_about & (f.col('score') < 90),
        _with_new_about('suggested_about')),
    f.when(f.col('about_message').isNotNull(), f.col('about_message')),
    f.when(f.col('position').isNull(), missing_field_messages['no_position']),
    f.when(f.col('followers') < 20, missing_field_messages['low_followers']),
    f.when(_is_missing('experience'), missing_field_messages['no_experience']),
    f.when(
        f.col('gap_in_experience').isNotNull(),  # TODO: adapt to binary or time period
        missing_field_messages['missing_experience']),
]

# Keep only the suggestions that apply, so the array holds no null entries.
predicted_df = predicted_df.withColumn(
  'suggestions',
  f.filter(f.array(*suggestion_candidates), lambda suggestion: suggestion.isNotNull())
)

In [0]:
# Final output: identifying columns plus the rank, its message and the suggestions.
output_columns = ['name', 'id', 'url', 'score_rank', 'score_message', 'suggestions']
optemized_df = predicted_df.select(*output_columns)
display(optemized_df)