In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re

# Never elide columns and allow wide rows when pandas frames are printed.
for _opt_name, _opt_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(_opt_name, _opt_value)

# Attach to the active Spark session, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Read the Parquet dataset at /dbfs/linkedin_people_train_data into a Spark DataFrame.
# NOTE(review): `profiles` is not referenced by any other cell visible here — confirm it is still needed.
profiles = spark.read.parquet('/dbfs/linkedin_people_train_data')

In [0]:
# Load the scored profiles DataFrame; later cells read its `profile_score`
# column to derive a class label.
profiles_with_scores = spark.read.parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet")

In [0]:
# Bucket `profile_score` into five ordinal classes stored in `label`:
#   score < 5 -> 0, < 10 -> 1, < 15 -> 2, < 20 -> 3, everything else -> 4.
#
# A NULL score makes every `<` comparison evaluate to NULL, which `when`
# treats as "no match". Without the outer guard such rows would fall through
# to `.otherwise(4)` and be labelled as the top class. The outer `when` has no
# `otherwise`, so rows with a NULL score now receive a NULL label instead.
_score = f.col('profile_score')
profiles_with_scores = profiles_with_scores.withColumn(
    'label',
    f.when(
        _score.isNotNull(),
        f.when(_score < 5, 0)
         .when(_score < 10, 1)
         .when(_score < 15, 2)
         .when(_score < 20, 3)
         .otherwise(4),
    ),
)

## Preprocess good-profile data

In [0]:
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.sql.types import StringType

# Keep the rows labelled 3 or 4, project the columns used downstream, and
# drop every row that has a NULL in any of those columns.
_selected_columns = ['city', 'education', 'name', 'position', 'about']
good_profiles_df = (
    profiles_with_scores
    .filter(col('label').isin([3, 4]))
    .select(_selected_columns)
    .dropna()
)



def strip_and_choose_first(str_lst):
    """Return the first element of a stringified list such as ``"[a, b, c]"``.

    Leading and trailing ``[`` / ``]`` characters are stripped, then everything
    before the first ``", "`` separator is returned. A string without that
    separator is returned whole (minus the brackets).

    Args:
        str_lst (str | None): Text form of a list, or ``None``.

    Returns:
        str: The first element, or ``""`` when the input is ``None`` or holds
        no elements.
    """
    # A Spark UDF receives None for NULL cells; calling .strip() on it would
    # raise AttributeError and fail the whole job.
    if str_lst is None:
        return ""
    # partition() stops at the first separator instead of splitting everything.
    return str_lst.strip("[]").partition(", ")[0]


# Python function wrapped as a Spark UDF further down: summarises the
# education columns of one profile in a single sentence.
def process_education(degree, field, title):
    """Build the text ``"<degree> in <field> from <title>"``.

    Each argument is first reduced with ``strip_and_choose_first``, so only
    the first entry of every stringified list ends up in the sentence.

    Args:
        degree: Stringified list of degrees.
        field: Stringified list of fields of study.
        title: Stringified list of school titles.

    Returns:
        str: The formatted education summary.
    """
    first_degree, first_field, first_school = (
        strip_and_choose_first(raw_value) for raw_value in (degree, field, title)
    )
    return f"{first_degree} in {first_field} from {first_school}"

# Expose `process_education` to Spark as a string-returning UDF
process_education_udf = udf(process_education, StringType())

# Discard rows whose `education` is NULL or equal to an empty list literal
_has_education = col("education").isNotNull() & (col("education") != f.lit([]))
filtered_df = good_profiles_df.filter(_has_education)

# Extract the `degree`, `field` and `title` fields of `education`, cast each
# to a string, and store them as the columns `degree`, `field` and `school`
for _new_column, _education_field in (('degree', 'degree'), ('field', 'field'), ('school', 'title')):
    filtered_df = filtered_df.withColumn(
        _new_column,
        col('education').getField(_education_field).cast('string'),
    )

# Collapse the three extracted strings into one `processed_education` sentence
_education_summary = process_education_udf(col('degree'), col('field'), col('school'))
good_profiles_df = filtered_df.withColumn("processed_education", _education_summary)

# Build the model prompt: city, education summary, name and position joined by ", "
_prompt_parts = [col(part) for part in ("city", "processed_education", "name", "position")]
processed_df = good_profiles_df.withColumn("input_prompt", concat_ws(", ", *_prompt_parts))

# Preview the result, print its row count, then write it out as Parquet
# (overwriting whatever is already at that path)
processed_df.display(limit=10)
print(processed_df.count())
processed_df.write.mode("overwrite").parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet")


### Setting up the dataset class, with stochastic sampling to account for the dataset size

In [0]:
# Initialize the tokenizer and SparkDataset
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader
import torch


#Util function
def get_sampled_dataframe(df, sample_size=1000, seed=None):
    """
    Randomly samples approximately `sample_size` rows from a Spark DataFrame.

    The sample is drawn without replacement using the fraction
    `sample_size / df.count()`. Spark's fraction-based sampling is
    probabilistic, so the number of rows returned is close to, but not
    guaranteed to equal, `sample_size`.

    Args:
        df (DataFrame): The full Spark DataFrame.
        sample_size (int): The approximate number of rows to sample.
        seed (int, optional): Random seed for reproducibility.

    Returns:
        DataFrame: A sampled Spark DataFrame. An empty input is returned
        unchanged, and when `sample_size` is at least the row count the
        fraction is capped at 1.0.
    """
    total_rows = df.count()
    if total_rows == 0:
        # Nothing to sample from; this also avoids dividing by zero below.
        return df
    # Sampling without replacement only accepts fractions up to 1.0, so cap it
    # when more rows are requested than the DataFrame holds.
    fraction = min(1.0, sample_size / total_rows)
    return df.sample(withReplacement=False, fraction=fraction, seed=seed)

class SparkSampledDataset(torch.utils.data.IterableDataset):
    """
    PyTorch iterable dataset that draws a random sample of rows from a Spark
    DataFrame on every pass and yields one tokenized causal-LM example per row.

    Each example is the prompt (`input_prompt` column) immediately followed by
    the target text (`about` column) and an end-of-sequence token, padded to
    `max_length`. `labels` is position-aligned with `input_ids`, which is what
    a decoder-only LM head such as `GPT2LMHeadModel` expects (it shifts the
    labels internally). Prompt and padding positions are set to -100 so that
    only target tokens contribute to the loss.
    """

    # Label value that the language-modeling loss skips.
    IGNORE_INDEX = -100

    def __init__(self, spark_df, tokenizer, max_length=512, sample_size=100, seed=None):
        """
        Args:
            spark_df (DataFrame): The full Spark DataFrame; rows are read
                through their `input_prompt` and `about` fields.
            tokenizer: Hugging Face tokenizer.
            max_length (int): Length every example is truncated/padded to.
            sample_size (int): Approximate number of rows sampled on each pass.
            seed (int, optional): Base random seed. Pass number k (0-based)
                samples with `seed + k`, so a run is reproducible while
                successive epochs still see different rows. `None` leaves the
                sampling unseeded.
        """
        self.spark_df = spark_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sample_size = sample_size
        self.seed = seed
        # Number of passes started so far; used to vary the seed per pass.
        self._passes_started = 0

    def _encode_example(self, prompt_text, target_text):
        """Tokenize one (prompt, target) pair into fixed-length LM tensors."""
        tok = self.tokenizer
        eos_id = tok.eos_token_id
        pad_id = getattr(tok, "pad_token_id", None)
        if pad_id is None:
            pad_id = eos_id

        # The prompt may use at most half of the window so that a long prompt
        # can never crowd the target out entirely.
        prompt_budget = self.max_length // 2
        prompt_ids = list(
            tok(prompt_text, truncation=True, max_length=self.max_length)["input_ids"]
        )[:prompt_budget]

        # The target takes the remaining room; the trailing EOS teaches the
        # model where a generated text should stop.
        target_budget = self.max_length - len(prompt_ids)
        target_ids = (
            list(tok(target_text, truncation=True, max_length=self.max_length)["input_ids"])
            + [eos_id]
        )[:target_budget]

        used = len(prompt_ids) + len(target_ids)
        pad_len = self.max_length - used

        input_ids = prompt_ids + target_ids + [pad_id] * pad_len
        attention_mask = [1] * used + [0] * pad_len
        # Supervise the target tokens only: prompt and padding are ignored.
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + target_ids
            + [self.IGNORE_INDEX] * pad_len
        )

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

    def __iter__(self):
        """
        Samples rows from the Spark DataFrame, collects them to the driver and
        yields one example per sampled row.

        Yields:
            dict: `input_ids`, `attention_mask` and `labels`, each a 1-D
            tensor of length `max_length`.
        """
        # Derive a distinct but deterministic seed for every pass; reusing the
        # same seed would hand every epoch the exact same rows.
        pass_index = self._passes_started
        self._passes_started += 1
        pass_seed = None if self.seed is None else self.seed + pass_index

        sampled_df = get_sampled_dataframe(
            self.spark_df, sample_size=self.sample_size, seed=pass_seed
        )

        for row in sampled_df.collect():
            record = row.asDict()
            yield self._encode_example(record["input_prompt"], record["about"])

# GPT-2 tokenizer; its end-of-sequence token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Read the Parquet training data into a Spark DataFrame
file_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet"
df = spark.read.parquet(file_path)

# Sampling dataset over `df`: examples of length 256, sample size 100, seed 42
dataset = SparkSampledDataset(df, tokenizer, max_length=256, sample_size=100, seed=42)

### Training

In [0]:
# Fine-tune GPT-2 on the examples produced by `dataset`.
# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer, so
# AdamW is imported from torch instead.
from torch.optim import AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Create the DataLoader: groups the examples yielded by `dataset` into batches of 4
dataloader = DataLoader(dataset, batch_size=4)

# ---- Model definition ----
# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define pad_token (GPT-2 doesn't have a native pad_token)
tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ---- Optimizer definition ----
# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers.AdamW (1e-6 and 0.0) so the optimisation behaviour is unchanged;
# torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

EPOCHS = 10

# ---- Training loop ----
model.train()
for epoch in range(EPOCHS):  # Number of epochs
    for i, batch in enumerate(dataloader):
        print(batch['input_ids'].shape, i)
        # The DataLoader yields CPU tensors; they must be on the same device
        # as the model, otherwise the forward pass fails when running on GPU.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        optimizer.zero_grad()     # Reset gradients
        outputs = model(**batch)  # Forward pass (loss is computed from `labels`)
        loss = outputs.loss
        loss.backward()           # Backward pass
        optimizer.step()          # Update model parameters

    print(f"Epoch {epoch + 1} completed.")


### Save model

In [0]:
# Write the trained model and then the tokenizer into the same output directory
_output_dir = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/fine_tuned_gpt2"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(_output_dir)
print("Model saved successfully!")


In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Train and evaluate a Spark ML multilayer-perceptron classifier.
# NOTE(review): `train_multi_df`, `test_multi_df` and `evaluator` are not
# defined in any cell visible in this notebook; on a fresh kernel this cell
# raises NameError unless they are created elsewhere — confirm.

# Define the layers of the neural network
# Input layer = number of features, hidden layers = user-defined, output layer = number of classes
# NOTE(review): the output layer has 3 units, while the `label` column built
# earlier in this notebook takes the values 0-4. Verify which label column
# `train_multi_df` carries and that 133 matches its feature-vector length.
layers = [133, 64, 32, 3]

# Initialize MLP Classifier
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=42
)

# Train the model
mlp_model = mlp.fit(train_multi_df)

# Make predictions
mlp_predictions = mlp_model.transform(test_multi_df)

# Evaluate
# NOTE(review): the value is printed as "Accuracy", but the metric actually
# computed depends on how `evaluator` was configured — confirm it uses
# metricName="accuracy".
accuracy = evaluator.evaluate(mlp_predictions)
print(f"MLP Accuracy: {accuracy}")