In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, Tokenizer, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.window import Window
from datetime import datetime
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Notebook-wide pandas display settings: never hide columns, allow wide rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Load the profiles DataFrame that already carries a score per profile.
profiles_with_scores = (
    spark.read
    .parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/user_profiles_with_scores.parquet")
)

In [0]:
# Bucket the numeric profile score into five ordinal labels:
#   score < 5 -> 0, < 10 -> 1, < 15 -> 2, < 20 -> 3, everything else -> 4.
# A null score is mapped to a null label explicitly. Without this guard every
# comparison evaluates to null, the row falls through to `otherwise(4)` and an
# unscored profile would silently be treated as a top-quality one.
score_col = f.col('profile_score')
profiles_with_scores = profiles_with_scores.withColumn(
    'label',
    f.when(score_col.isNull(), f.lit(None).cast('int'))
     .when(score_col < 5, 0)
     .when(score_col < 10, 1)
     .when(score_col < 15, 2)
     .when(score_col < 20, 3)
     .otherwise(4)
)

## Preprocess good profile data

In [0]:
from pyspark.sql.functions import col, concat_ws, udf
from pyspark.sql.types import StringType

# Keep only the best-rated profiles (labels 3 and 4), restricted to the columns
# used for prompt building; rows with a null in any of these columns are dropped.
good_profiles_df = (
    profiles_with_scores
    .where(col('label').isin([3, 4]))
    .select(['id', 'city', 'education', 'name', 'position', 'about'])
    .dropna()
)



def strip_and_choose_first(str_lst):
    """Return the first element of a stringified list such as ``"[a, b, c]"``.

    Leading/trailing square brackets are stripped and the remainder is split
    on ``", "``; the first piece is returned.

    Args:
        str_lst (str | None): String form of a list, e.g. ``"[BSc, MSc]"``.

    Returns:
        str: The first element, or ``""`` when the input is ``None`` or holds
        no elements (``"[]"``).
    """
    # A null column value reaches a Spark UDF as None; treat it like an empty
    # list instead of failing the whole job with an AttributeError.
    if str_lst is None:
        return ""
    return str_lst.strip("[]").split(", ")[0]


# UDF to process the 'education' field (extract degree and school information)
def process_education(degree, field, title):
    """Build a one-line education summary.

    Each argument is reduced to its first list element with
    ``strip_and_choose_first`` and the three parts are combined as
    ``"<degree> in <field> from <title>"``.

    Args:
        degree: Stringified list of degrees.
        field: Stringified list of study fields.
        title: Stringified list of school titles.

    Returns:
        str: The formatted education description.
    """
    first_degree, first_field, first_title = (
        strip_and_choose_first(raw_value) for raw_value in (degree, field, title)
    )
    return f"{first_degree} in {first_field} from {first_title}"

# Register UDF
process_education_udf = udf(process_education, StringType())

# Keep only rows that have at least one education entry.
# `size(...) > 0` replaces the comparison with `f.lit([])`: an empty Python list
# carries no element type and `lit` only accepts lists on newer PySpark versions.
# (presumably `education` is an array of structs — confirm against the source schema)
has_education = col("education").isNotNull() & (f.size(col("education")) > 0)
filtered_df = good_profiles_df.filter(has_education)

# Pull the degree / field / title values out of `education` and stringify them
# so the UDF can pick the first entry of each.
filtered_df = (
    filtered_df
    .withColumn('degree', col('education').getField('degree').cast('string'))
    .withColumn('field', col('education').getField('field').cast('string'))
    .withColumn('school', col('education').getField('title').cast('string'))
)

# Process the DataFrame
good_profiles_df = filtered_df.withColumn(
    "processed_education",
    process_education_udf(col('degree'), col('field'), col('school')),
)

# Build the comma-separated prompt text from city, education, name and position.
processed_df = good_profiles_df.withColumn(
    "input_prompt",
    concat_ws(
        ", ",
        col("city"),
        col("processed_education"),
        col("name"),
        col("position"),
    ),
)

# Preview, report the row count and persist the training data.
processed_df.display(limit=10)
print(processed_df.count())
processed_df.write.mode("overwrite").parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet")


### Setting up the dataset class, with stochastic sampling to account for the dataset size

In [0]:
# Initialize the tokenizer and SparkDataset
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader
import torch


#Util function
def get_sampled_dataframe(df, sample_size=1000, seed=None):
    """
    Randomly samples rows from a Spark DataFrame.

    The sampling fraction is derived from `sample_size` and the row count, so
    the number of returned rows is approximately (not exactly) `sample_size`.

    Args:
        df (DataFrame): The full Spark DataFrame.
        sample_size (int): The approximate number of rows to sample.
        seed (int, optional): Random seed for reproducibility.

    Returns:
        DataFrame: A sampled Spark DataFrame. An empty input is returned as is.
    """
    total_rows = df.count()
    if total_rows == 0:
        # Nothing to sample from; also avoids a division by zero below.
        return df
    # Spark requires the fraction to lie in [0, 1] when sampling without
    # replacement, so clamp it when more rows are requested than exist.
    fraction = min(1.0, max(0.0, sample_size / total_rows))
    return df.sample(withReplacement=False, fraction=fraction, seed=seed)

class SparkSampledDataset(torch.utils.data.IterableDataset):
    """Iterable PyTorch dataset that draws a fresh random sample from a Spark
    DataFrame on every pass and yields tokenized causal-LM training examples.

    Each row must provide an ``input_prompt`` and an ``about`` column.
    """

    def __init__(self, spark_df, tokenizer, max_length=512, sample_size=100):
        """
        Args:
            spark_df (DataFrame): The full Spark DataFrame.
            tokenizer: Hugging Face tokenizer.
            max_length (int): Maximum sequence length for tokenization.
            sample_size (int): Approximate number of rows to sample per pass.
        """
        self.spark_df = spark_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sample_size = sample_size

    def __iter__(self):
        """
        Samples rows from the Spark DataFrame, collects them to the driver and
        yields one tokenized example per row.

        Yields:
            dict: ``input_ids``, ``attention_mask`` and ``labels`` tensors, each
            of length ``max_length``.
        """
        # Sample a subset of the DataFrame (a new sample on every iteration).
        sampled_df = get_sampled_dataframe(self.spark_df, sample_size=self.sample_size)
        eos_token = self.tokenizer.eos_token or ""

        for row in sampled_df.collect():
            record = row.asDict()
            input_text = f'Here is some of my data:{record["input_prompt"]}. Please craft an about section for me.'
            target_text = record["about"]

            # A causal LM shifts the labels internally, so `labels` must be
            # aligned token-for-token with `input_ids`. Prompt and target are
            # therefore encoded as ONE sequence; tokenizing them separately
            # trains the model on unrelated positions.
            encoded = self.tokenizer(
                f"{input_text} {target_text}{eos_token}",
                truncation=True,
                padding="max_length",
                max_length=self.max_length,
                return_tensors="pt"
            )
            input_ids = encoded["input_ids"].squeeze(0)
            attention_mask = encoded["attention_mask"].squeeze(0)

            # Exclude padding from the loss (-100 is the ignore index). The mask
            # is used rather than the pad id because pad may equal EOS.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            yield {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }

# Location of the prompt/target training data.
file_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet"

# GPT-2 tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Training data as a Spark DataFrame.
df = spark.read.parquet(file_path)

# Dataset that re-samples ~100 rows per pass and pads/truncates to 256 tokens.
dataset = SparkSampledDataset(df, tokenizer, max_length=256, sample_size=100)

### Training

In [0]:
# Create the DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
dataloader = DataLoader(dataset, batch_size=4)
#---- Model definition ----
# Load GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define pad_token (GPT-2 doesn't have a native pad_token)
tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ---- Optimizer definition ----
# Use PyTorch's AdamW: the copy that shipped with transformers is deprecated
# and no longer importable in recent releases. weight_decay=0.0 matches the
# default of the former transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

EPOCHS = 10

# ---- Training loop ----
model.train()
for epoch in range(EPOCHS):  # Number of epochs
    epoch_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        # Tensors must live on the same device as the model; without this the
        # forward pass fails as soon as the model is on the GPU.
        batch = {key: value.to(device) for key, value in batch.items()}

        optimizer.zero_grad()      # Reset gradients
        outputs = model(**batch)   # Forward pass
        loss = outputs.loss        # Compute loss
        loss.backward()            # Backward pass
        optimizer.step()           # Update model parameters

        epoch_loss += loss.item()
        num_batches += 1

    # The dataset is iterable (no len()), so batches are counted manually.
    avg_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1} completed. Average loss: {avg_loss:.4f}")


In [0]:
# Reload the prepared training data and preview up to 10,000 rows of it.
df = spark.read.parquet('/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet')
training_preview = df.limit(10000)
training_preview.display()

In [0]:
# Preview only the prompt / target columns (up to 10,000 rows).
prompt_target_preview = df.select("input_prompt", "about").limit(10000)
prompt_target_preview.display()

In [0]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# ---- Model Definition ----
# The BART objects use their own names (`bart_*`) so that the fine-tuned GPT-2
# `model` / `tokenizer` defined earlier in the notebook are not overwritten
# before they are saved.
model_name = "facebook/bart-base"  # Switch to bart-large for higher capacity
bart_tokenizer = BartTokenizer.from_pretrained(model_name, max_length=64)
bart_model = BartForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model.to(device)

# ---- Dataset Preparation ----
df = spark.read.parquet("/Workspace/Users/lihi.kaspi@campus.technion.ac.il/training_data.parquet")
data = df.limit(10000).collect()
# Convert to Hugging Face Dataset format
bart_dataset = Dataset.from_dict({
    "input_text": [d["input_prompt"] for d in data],
    "output_text": [d["about"] for d in data]
})

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Args:
        examples (dict): Batch with ``input_text`` and ``output_text`` lists.

    Returns:
        The tokenized inputs with an added ``labels`` entry holding the target
        token ids, where padding positions are replaced by -100.
    """
    inputs = bart_tokenizer(
        examples["input_text"], max_length=512, truncation=True, padding="max_length"
    )
    targets = bart_tokenizer(
        examples["output_text"], max_length=512, truncation=True, padding="max_length"
    )
    # -100 is the ignore index of the loss; without it most of the loss would
    # come from predicting padding tokens.
    pad_id = bart_tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

tokenized_dataset = bart_dataset.map(preprocess_function, batched=True)
# Return tensors instead of Python lists. With list columns the default collate
# function groups values per position, which produced transposed
# (seq_len, batch) tensors after stacking.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Create a DataLoader
bart_dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True)

# ---- Optimizer ----
# PyTorch's AdamW replaces the deprecated transformers implementation;
# weight_decay=0.0 matches that implementation's default.
bart_optimizer = torch.optim.AdamW(bart_model.parameters(), lr=5e-5, weight_decay=0.0)

# ---- Training Loop ----
BART_EPOCHS = 5
bart_model.train()

for epoch in range(BART_EPOCHS):
    print(f"Epoch {epoch + 1}/{BART_EPOCHS}")
    epoch_loss = 0

    for batch in tqdm(bart_dataloader, desc=f"Training Epoch {epoch + 1}"):
        # Batches are already (batch, seq_len) tensors; just move them to the
        # same device as the model.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Zero out gradients
        bart_optimizer.zero_grad()

        # Forward pass
        outputs = bart_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        bart_optimizer.step()

        # Track loss
        epoch_loss += loss.item()

    # Log epoch loss
    avg_loss = epoch_loss / len(bart_dataloader)
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

    # Save the model and tokenizer after each epoch
    bart_model.save_pretrained(f"/Workspace/Users/harel.oved@campus.technion.ac.il/bart_fine_tuned_epoch_{epoch + 1}")
    bart_tokenizer.save_pretrained(f"/Workspace/Users/harel.oved@campus.technion.ac.il/bart_fine_tuned_epoch_{epoch + 1}")

print("Training completed!")

### Save model

In [0]:
# Save the fine-tuned GPT-2 weights (state dict only). The inference section
# rebuilds the architecture from "gpt2" and loads these weights back.
import os

model_path = "/Workspace/Users/harel.oved@campus.technion.ac.il/gpt/model.pth"

# Fail fast if `model` is not the GPT-2 model (e.g. it was reassigned by another
# cell): saving different weights here would overwrite the checkpoint and only
# surface later as a state-dict mismatch when loading.
if not isinstance(model, GPT2LMHeadModel):
    raise TypeError(
        f"Expected `model` to be a GPT2LMHeadModel, got {type(model).__name__}; "
        "re-run the GPT-2 training cell before saving."
    )

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)


### Inference

In [0]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch


model_path = "/Workspace/Users/harel.oved@campus.technion.ac.il/gpt/model.pth"
model = GPT2LMHeadModel.from_pretrained("gpt2")
# map_location="cpu" lets weights saved from a GPU run load on a CPU-only
# machine as well; the model is moved to the target device afterwards.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
# Inference only: make sure dropout is disabled.
model.eval()

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define pad_token (GPT-2 doesn't have a native pad_token)
tokenizer.pad_token = tokenizer.eos_token

In [0]:
# Dummy input
# The wrapper text must match the template used when building the training
# examples ('Here is some of my data:<details>. Please craft an about section
# for me.'); a differently worded prompt is out of distribution for the
# fine-tuned model.
profile_details = "New York, New York, United States, Master of Public Health - MPH in null from State University of New York (SUNY) Downstate Medical Center School of Public Health, Taylor A., Chief of Staff - MPH Candidate"
input_prompt = f"Here is some of my data:{profile_details}. Please craft an about section for me."

In [0]:
# Move model to GPU if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # generation should run with dropout disabled

# GPT-2 has no native pad token; reuse the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the input
inputs = tokenizer(
    input_prompt,
    return_tensors="pt",
    padding=True,           # Ensures padding if needed
    truncation=True         # Ensures truncation to the model's max length
).to(device)

# Generate text
output = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Pass attention mask
    # Budget for NEW tokens only. `max_length` also counts the prompt, so a
    # long prompt would leave little or no room for the generated text.
    max_new_tokens=100,
    num_beams=5,          # Beam search for better results
    early_stopping=True,  # Stop generation when reaching the EOS token
    pad_token_id=tokenizer.pad_token_id       # Ensure correct padding
)

# Decode the generated text (the decoded sequence includes the prompt)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text:", generated_text)
