In [None]:
# Standard library
import random
import re
from typing import List, Tuple

# Third-party
# FIXME: the original cell contained the fragment `from langchain_vllm`, which names no
# symbols and is a SyntaxError that prevents this whole cell from running. Nothing in the
# notebook uses a vLLM symbol; restore the intended import here if it is needed.
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import Runnable
from langchain_ollama import OllamaLLM


def generate_applicant_data(model, n: int, p: float, shuffle: bool = True):
    """
    Generates a dataset of data science applicant summaries and their labels.

    The class split is deterministic, not sampled: ``round(n * p)`` summaries are
    generated from the "strong" prompt and the remaining ``n - round(n * p)`` from
    the "typical" prompt. Each group is produced with one ``batch`` call on the
    chain ``prompt | model``.

    Args:
        model: Object piped after each prompt template (``prompt | model``). Whatever
            the resulting chain's ``batch`` returns is stored as the summaries.
        n (int): The total number of applicant summaries to generate (>= 0).
        p (float): The fraction (0.0 to 1.0) of summaries generated as strong applicants.
        shuffle (bool): Whether to shuffle the generated data (X and y) together.

    Returns:
        tuple: A tuple containing two lists:
            - X (list): A list of applicant summaries.
            - y (list): A list of labels (1 for strong, 0 for typical).

    Raises:
        ValueError: If ``n`` is negative or ``p`` is outside [0.0, 1.0].
    """
    if n < 0:
        raise ValueError("n must be non-negative.")
    if not (0.0 <= p <= 1.0):
        raise ValueError("p must be between 0.0 and 1.0.")

    strong_applicant_prompt = PromptTemplate.from_template(
        "You are a data science professional articulating your suitability for a challenging data science role. "
        "In 100 words, outline your key contributions to data-driven projects, focusing on how your "
        "analytical approach led to measurable improvements or novel insights. Briefly mention your "
        "familiarity with diverse methodologies, tools, problem-solving abilities, and your ability to "
        "communicate complex findings to non-technical stakeholders. Feel free to focus on specifics in "
        "how you have turned data into actionable intelligence."
    )

    typical_applicant_prompt = PromptTemplate.from_template(
        "You are an aspiring data scientist outlining your qualifications for an entry-level or junior role. "
        "In 100 words, outline your key qualifications, including relevant coursework, projects, or prior experience. "
        "Highlight your enthusiasm for learning and applying data analysis techniques to real-world problems. "
        "Mention your comfort with standard data tools and a desire to grow your skills within a collaborative team environment."
    )

    # Determine the number of strong and typical applicants.
    # Note: round() uses banker's rounding, so e.g. n * p == 2.5 yields 2 strong applicants.
    num_strong = round(n * p)
    num_typical = n - num_strong

    X = []  # List to store applicant summaries
    y = []  # List to store labels (1 for strong, 0 for typical)

    # (label, name used in the progress message, chain, how many to generate).
    # Strong applicants are generated first, then typical ones.
    generation_plan = (
        (1, "strong", strong_applicant_prompt | model, num_strong),
        (0, "typical", typical_applicant_prompt | model, num_typical),
    )
    for label, kind, chain, count in generation_plan:
        if count <= 0:
            continue
        print(f"Generating {count} {kind} applicants...")
        # Empty dictionaries as input since the prompts have no template variables.
        X.extend(chain.batch([{} for _ in range(count)]))
        y.extend([label] * count)

    # Shuffle summaries and labels together. The emptiness guard matters: unpacking
    # zip(*[]) into two names raises ValueError, which made n == 0 crash before.
    if shuffle and X:
        combined_data = list(zip(X, y))
        random.shuffle(combined_data)
        X = [summary for summary, _ in combined_data]
        y = [label for _, label in combined_data]

    print("Generation complete!")
    return X, y

In [None]:
# Step two directories up; presumably so `import rcpp` and the relative pickle path
# used in the next cell resolve (the recorded output shows the resulting directory).
# NOTE(review): the move is relative, so re-running this cell climbs two more levels.
%cd ../..

/gpfs/home/vhl2022/projects/rcpp


In [7]:

# NOTE(review): `rcpp` is never referenced by name in this cell; presumably it has to be
# importable so pickle can rebuild the stored Trajectory object — confirm.
import rcpp

import pickle
# Security: pickle.load can execute arbitrary code embedded in the file, so only load
# trajectory files that were produced by a trusted run.
with open("./applications/application_tracking_system/figures/expected_loss/trajectory_0.pkl", "rb") as f:
    trajectory = pickle.load(f)
# Bare last expression so the notebook displays the loaded object's repr.
trajectory


Trajectory(lambda_hat=0.0, num_iterations=1, final_risk=0.6000, guaranteed_T=-1, delta_lambda=-1.000000)

In [8]:
# Display the lambda values stored on the loaded trajectory.
trajectory.lambdas

array([0., 0.])

In [11]:
# Display the trajectory's two risk arrays together (shown as a tuple).
trajectory.risks_tm1_t, trajectory.risks_tt

(array([0.6]), array([0.6, 0.6]))

In [16]:
# `num_predict` is the OllamaLLM option that caps the number of generated tokens.
# The previous `max_tokens=250` keyword is not an OllamaLLM option, so the intended
# 250-token cap was not being applied.
ollama_model = OllamaLLM(model="llama3.2", temperature=1.0, num_predict=250)

In [17]:
# Time two small all-typical runs (p=0 means no strong applicants); the returned
# data is discarded into `_`. The recorded output shows both calls failing with
# ConnectError, i.e. the Ollama server was not reachable at that point.
%time _ = generate_applicant_data(ollama_model, 2, 0)

%time _ = generate_applicant_data(ollama_model, 10, 0)

Generating 2 typical applicants...


ConnectError: [Errno 111] Connection refused

Generating 10 typical applicants...


ConnectError: [Errno 111] Connection refused

In [8]:
%%time
# Time generating 100 all-typical summaries (p=0). The cell magic must stay on the
# first line. The returned (X, y) tuple is the cell's displayed value.
generate_applicant_data(ollama_model, 100, 0)

Generating 100 typical applicants...
Generation complete!
CPU times: user 952 ms, sys: 90.4 ms, total: 1.04 s
Wall time: 1min 10s


(['As an aspiring data scientist, I bring a strong foundation in statistical inference, machine learning, and data visualization. My academic background includes coursework in Python programming, R, and SQL, as well as proficiency in data manipulation libraries like Pandas and NumPy. I have completed personal projects that applied machine learning techniques to real-world datasets, including predictive modeling and data exploration. Prior experience working with large datasets has honed my attention to detail and analytical skills. I am eager to collaborate with a team of professionals and continue learning to grow my skills in this exciting field.',
  'As an aspiring data scientist, I possess a strong foundation in statistical modeling, machine learning, and data visualization through coursework in computer science, statistics, and mathematics. Relevant projects include building predictive models for customer churn prediction and sentiment analysis of social media posts. Prior experie

In [54]:
%%time
# Time generating 10 all-typical summaries (p=0). The cell magic must stay on the
# first line. The returned (X, y) tuple is the cell's displayed value.
generate_applicant_data(ollama_model, 10, 0)

Generating 10 typical applicants...
Generation complete!
CPU times: user 251 ms, sys: 20.3 ms, total: 271 ms
Wall time: 11.4 s


(['As an aspiring data scientist, I possess a solid academic foundation in data analysis. Throughout my studies, I completed courses in statistics, machine learning, and data visualization, solidifying my understanding of fundamental concepts. Notable projects include a predictive modeling competition and a data-driven business case study, where I applied techniques to real-world problems. I am eager to learn and grow within a collaborative team environment, leveraging tools like Python, R, Tableau, and SQL. With a strong work ethic and passion for learning, I aim to apply data analysis techniques to drive insights and inform business decisions in a dynamic role.',
  'I possess a solid academic foundation in data science, having completed coursework in statistics, machine learning, data mining, and visualization. I have also developed foundational projects utilizing libraries such as Python, R, and SQL, applying data analysis techniques to real-world problems. My enthusiasm for data-dr

In [None]:
%%time
# Time generating 30 all-typical summaries (p=0). The cell magic must stay on the
# first line. The returned (X, y) tuple is the cell's displayed value.
generate_applicant_data(ollama_model, 30, 0)

Generating 30 typical applicants...
Generation complete!
CPU times: user 689 ms, sys: 55 ms, total: 744 ms
Wall time: 33.9 s


(["I hold a Bachelor's degree in Mathematics and Statistics, with a strong academic foundation in data science courses, including machine learning, statistical modeling, and data visualization. Throughout my undergraduate studies, I worked on numerous individual projects that applied data analysis techniques to real-world problems, such as analyzing customer churn patterns for a telecom company and predicting house prices using regression models. I'm eager to apply my skills in an entry-level role, surrounded by like-minded professionals who share my enthusiasm for learning and problem-solving. I'm excited to collaborate with the team and continue growing my data science skills.",
  'As an aspiring data scientist, I have established a strong academic foundation in statistics, machine learning, and data visualization through coursework such as "Data Mining," "Machine Learning," and "Data Visualization." I completed foundational projects like analyzing consumer behavior using SQL and cre

In [33]:
def create_ats_scorer(eval_model) -> Runnable:
    """
    Build the ATS scoring chain: a "Hiring Manager" prompt piped into ``eval_model``.

    The prompt takes one variable, ``applicant_summary``, and asks for a reply of
    the form ``Score: <0-100>`` followed by ``Justification: <one sentence>``.

    Args:
        eval_model: The object placed after the prompt in the chain (``prompt | eval_model``).

    Returns:
        Runnable: The composed prompt-then-model chain.
    """
    # The template text (including its indentation) is sent to the model verbatim.
    hiring_manager_template = """You are a seasoned Hiring Manager for a Data Scientist role. Rate the applicant's summary (0-100).

        Indicators of a strong candidate: advanced ML (deep learning, NLP, CV), MLOps/deployment, leadership, quantifiable impact, specialized tools (TF, PyTorch, Spark, cloud), PhD/research.

        Score: 100 (top-tier), 80-99 (very strong), 60-79 (good/typical), 40-59 (decent), <40 (needs experience).

        Applicant Summary:
        {applicant_summary}

        Your Output Format:
        Score: [0-100]
        Justification: [Brief one sentence explanation]
        """
    scoring_prompt = PromptTemplate.from_template(hiring_manager_template)
    scorer_chain = scoring_prompt | eval_model
    return scorer_chain


def score_applicant_summary(ats_scorer: Runnable, summary: str) -> Tuple[int, str]:
    """
    Uses the ATS scorer to get a numerical rating and justification for a single summary.

    The scorer's reply is searched for ``Score: <digits>`` and ``Justification: <text>``.
    The score is clamped to 0-100. The justification is everything after the first
    ``Justification:`` marker (it may span several lines), stripped of surrounding whitespace.

    Args:
        ats_scorer (Runnable): The LangChain Runnable chain for the ATS scorer.
        summary (str): The applicant's professional summary.

    Returns:
        Tuple[int, str]: A tuple containing the extracted score (0-100) and the justification.
                         A score that cannot be extracted is reported as 0; a justification
                         that cannot be extracted is reported as
                         "Error parsing score or justification.".
    """
    response = ats_scorer.invoke({"applicant_summary": summary})

    # A chain may return a plain string or a message-like object that carries its text
    # in `.content` (e.g. when the scorer is built on a chat model). Normalise to text
    # so the regex searches below never receive a non-string.
    if isinstance(response, str):
        response_text = response
    else:
        response_text = str(getattr(response, "content", response))

    # Use regex to find the score. It's robust to variations in spacing.
    score_match = re.search(r"Score:\s*(\d+)", response_text)
    justification_match = re.search(r"Justification:\s*(.*)", response_text, re.DOTALL)

    score = 0
    justification = "Error parsing score or justification."

    if score_match:
        try:
            # Ensure score is within 0-100
            score = max(0, min(100, int(score_match.group(1))))
        except ValueError:
            # Only reachable for absurdly long digit runs that int() refuses to parse;
            # treat them like an unparseable score.
            score = 0

    if justification_match:
        justification = justification_match.group(1).strip()

    return score, justification


def run_ats_on_batch(
    model,
    applicant_summaries: List[str],
    labels: List[int] # For comparison, not used by ATS for scoring
) -> List[Tuple[int, str]]:
    """
    Score each applicant summary in turn with an ATS scorer built from ``model``.

    A progress header (with the applicant's true label) and the resulting score and
    justification are printed for every summary.

    Args:
        model: Passed to ``create_ats_scorer`` to build the scoring chain.
        applicant_summaries (List[str]): The summaries to score, in order.
        labels (List[int]): True labels (1 for strong, anything else is printed as
            typical), looked up by position; only used in the printed header.

    Returns:
        List[Tuple[int, str]]: One (score, justification) tuple per summary, in input order.
    """
    scorer_chain = create_ats_scorer(model)
    scored = []
    for position, text in enumerate(applicant_summaries):
        true_label = 'Strong' if labels[position] == 1 else 'Typical'
        print(f"\nProcessing Applicant {position+1} (True Label: {true_label}):")
        rating, reason = score_applicant_summary(scorer_chain, text)
        scored.append((rating, reason))
        print(f"Score: {rating}, Justification: {reason}")
    print("Batch processing complete!")
    return scored

In [34]:
### Example Usage

# `num_predict` is the OllamaLLM option that caps generated tokens; the previous
# `max_tokens=250` keyword is not an OllamaLLM option, so no cap was being applied.
ollama_model = OllamaLLM(model="llama3.2", temperature=1.0, num_predict=250)

# Generate 10 applicants, 20% of them strong (p=0.2 -> 2 strong, 8 typical), and shuffle the data
applicant_summaries, labels = generate_applicant_data(ollama_model, n=10, p=0.2, shuffle=True)

print("\n--- Generated Applicants ---")
for number, (summary, label) in enumerate(zip(applicant_summaries, labels), start=1):
    print(f"\nApplicant {number} (Label: {'Strong' if label == 1 else 'Typical'}):")
    print(summary)
    print("-" * 30)

Generating 2 strong applicants...
Generating 8 typical applicants...
Generation complete!

--- Generated Applicants ---

Applicant 1 (Label: Typical):
With a strong academic foundation in data science, I possess a solid grasp of statistical modeling, machine learning algorithms, and data visualization. Coursework included data mining, programming in Python and R, and experience with databases such as MySQL. Additionally, I completed a capstone project analyzing customer behavior using social media trends to inform marketing strategies. I am eager to apply my skills in real-world settings and continuously learn new techniques. I am confident in my ability to work collaboratively within a team, utilizing tools like pandas, NumPy, and scikit-learn to drive insights that solve business problems.
------------------------------

Applicant 2 (Label: Typical):
As an aspiring data scientist, I have built a solid academic foundation through coursework in statistics, machine learning, and data vi

In [47]:
def create_summary_modifier(model=None) -> Runnable:
    """
    Creates an LLM-based chain for modifying applicant summaries.

    The prompt expects four variables: ``original_summary``, ``ats_score``,
    ``ats_justification`` and ``modification_intensity`` (formatted with two decimals).

    Args:
        model: The object piped after the prompt (``prompt | model``). When omitted,
            the notebook-level ``ollama_model`` is used, which keeps existing
            zero-argument calls working while letting callers pass the model
            explicitly instead of relying on hidden kernel state.

    Returns:
        Runnable: The composed prompt-then-model chain.
    """
    if model is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        model = ollama_model

    modification_prompt_template = PromptTemplate.from_template(
        """You are an AI career coach helping a data scientist improve their resume summary.
        You have analyzed their current summary and received feedback from an ATS system.

        Original Summary:
        {original_summary}

        ATS Score: {ats_score}
        ATS Justification: {ats_justification}

        Modification Intensity (0 = no change, 1 = maximum change): {modification_intensity:.2f}

        Based on the ATS feedback and the modification intensity:
        - If intensity is 0, return the original summary verbatim.
        - If intensity is high (e.g., near 1), make significant changes to address the ATS justification,
          emphasizing business impact, advanced techniques, leadership, and quantifiable results.
          Ensure the new summary sounds like a very strong candidate.
        - If intensity is moderate, make thoughtful, subtle improvements focusing on clarity and incorporating
          stronger phrasing suggested by the ATS justification.

        Ensure the revised summary is under 100 words and sounds natural. Only return the revised summary text!
        Any other information will be punished by the ATS system.
        """
    )
    return modification_prompt_template | model


def modify_applicant_summary(
    original_summary: str,
    ats_score: int,
    ats_justification: str,
    lambda_param: int, # lambda between 0 and 100
    summary_modifier_chain: Runnable
) -> str:
    """
    Rewrite an applicant summary in response to ATS feedback.

    ``lambda_param`` is converted to a modification intensity of
    ``(100 - lambda_param) / 100``: 0 asks for the strongest rewrite, 100 leaves the
    summary untouched (the chain is not invoked at all in that case).

    Args:
        original_summary (str): The applicant's original summary.
        ats_score (int): The score received from the ATS (0-100).
        ats_justification (str): The ATS's justification for the score.
        lambda_param (int): Controls modification extent (0=max change, 100=no change).
        summary_modifier_chain (Runnable): Chain invoked with the summary, the ATS
            feedback and the computed intensity.

    Returns:
        str: The chain's output with surrounding whitespace stripped, or the original
        summary unchanged when the intensity is zero.

    Raises:
        ValueError: If ``lambda_param`` is outside 0-100.
    """
    lambda_in_range = 0 <= lambda_param <= 100
    if not lambda_in_range:
        raise ValueError("lambda_param must be between 0 and 100.")

    # Flip and rescale: lambda 0 -> intensity 1.0 (max change), lambda 100 -> 0.0 (no change).
    modification_intensity = (100 - lambda_param) / 100.0

    if modification_intensity != 0:
        llm_inputs = dict(
            original_summary=original_summary,
            ats_score=ats_score,
            ats_justification=ats_justification,
            modification_intensity=modification_intensity,
        )
        rewritten = summary_modifier_chain.invoke(llm_inputs)
        return rewritten.strip()

    # Zero intensity: skip the LLM call and hand back the input as-is.
    return original_summary

In [48]:
# Score every generated summary, using the same model as the ATS; progress and
# results are printed per applicant and the (score, justification) tuples are kept.
ats_results = run_ats_on_batch(ollama_model, applicant_summaries, labels)


Processing Applicant 1 (True Label: Typical):
Score: 65, Justification: While the applicant demonstrates a solid foundation in data science concepts and some practical experience, their lack of explicit mentions of advanced ML (deep learning, NLP, CV), MLOps/deployment, leadership, quantifiable impact, or specialized tools like TF, PyTorch, Spark, or cloud platforms reduces their overall score.

Processing Applicant 2 (True Label: Typical):
Score: 40, Justification: The applicant lacks advanced technical skills such as deep learning, MLOps, specialized tools (TF, PyTorch, Spark, cloud), and a PhD/research background, indicating they require significant experience to be considered for a Data Scientist role.

Processing Applicant 3 (True Label: Typical):
Score: 60, Justification: The summary lacks advanced technical details about the applicant's experience with machine learning, MLOps/deployment, leadership, and specialized tools, indicating a good but typical candidate profile.

Proces

In [50]:
# Display the true labels (1 = strong, 0 = typical) in their shuffled order.
labels

[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]

In [49]:
# Rewrite the first applicant's summary using the score and justification the ATS gave it.
modify_applicant_summary(
    original_summary=applicant_summaries[0],
    ats_score=ats_results[0][0],
    ats_justification=ats_results[0][1],
    lambda_param=0,  # 0 maps to modification intensity 1.0, i.e. the strongest rewrite
    summary_modifier_chain=create_summary_modifier()
)

'With a strong academic foundation in data science, I leveraged advanced machine learning algorithms like deep learning and NLP to drive business growth through predictive modeling and data-driven insights. Utilizing cloud platforms like AWS and specialized tools such as TensorFlow, PyTorch, and Spark, I optimized MLOps workflows and deployed scalable solutions that yielded significant ROI. As a collaborative leader, I fostered cross-functional teams to drive quantifiable results, resulting in notable improvements in customer behavior and marketing strategies.'

In [51]:
# Display the full list of (score, justification) tuples from the ATS run.
ats_results

[(65,
  'While the applicant demonstrates a solid foundation in data science concepts and some practical experience, their lack of explicit mentions of advanced ML (deep learning, NLP, CV), MLOps/deployment, leadership, quantifiable impact, or specialized tools like TF, PyTorch, Spark, or cloud platforms reduces their overall score.'),
 (40,
  'The applicant lacks advanced technical skills such as deep learning, MLOps, specialized tools (TF, PyTorch, Spark, cloud), and a PhD/research background, indicating they require significant experience to be considered for a Data Scientist role.'),
 (60,
  "The summary lacks advanced technical details about the applicant's experience with machine learning, MLOps/deployment, leadership, and specialized tools, indicating a good but typical candidate profile."),
 (60,
  "The summary lacks specific technical details about the applicant's expertise in machine learning (e.g., deep learning, NLP, CV), MLOps/deployment experience, or specialized tools, w

In [37]:
# Keep only the numeric score (first element) of each (score, justification) tuple.
[x[0] for x in ats_results]

[60, 60, 60, 60, 97, 85, 60, 60, 60, 60]

In [38]:
import ollama

In [None]:
from ollama import Client
# Talk to an Ollama server on an explicit host rather than the client's default endpoint.
# NOTE(review): the host is hard-coded to one machine and the header looks like a
# placeholder copied from an example — confirm whether either is needed.
client = Client(
  host='http://gn-0002:11434',
  headers={'x-some-header': 'some-value'}
)
# Send a single-turn chat request to the llama3.2 model.
response = client.chat(model='llama3.2', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
# Bare last expression so the notebook displays the response object.
response

ChatResponse(model='llama3.2', created_at='2025-05-21T01:55:44.329040636Z', done=True, done_reason='stop', total_duration=2957029345, load_duration=30380736, prompt_eval_count=31, prompt_eval_duration=4162034, eval_count=319, eval_duration=2921507686, message=Message(role='assistant', content="The sky appears blue because of a phenomenon called scattering, which occurs when sunlight interacts with the tiny molecules of gases in the Earth's atmosphere.\n\nHere's what happens:\n\n1. **Sunlight enters the atmosphere**: When the sun rises or sets, its light travels through the air and hits the tiny molecules of gases such as nitrogen (N2) and oxygen (O2).\n2. **Scattering occurs**: The shorter (blue) wavelengths of light are scattered more than the longer (red) wavelengths by these gas molecules. This is known as Rayleigh scattering.\n3. **Blue light is dispersed**: As a result, the blue light is dispersed in all directions, spreading throughout the atmosphere and reaching our eyes from ev

In [None]:
# Single-turn chat request through the module-level `ollama` client.
# The original cell used typographic quotes (‘ ’), which Python rejects as a
# SyntaxError; they are replaced with plain ASCII quotes here.
response = ollama.chat(
    model='deepseek-r1:1.5b',
    messages=[{'role': 'user', 'content': 'Your question here'}],
)
# Bare last expression so the notebook displays the response object.
response

In [6]:
# One-off prompt to check that the model is reachable; the recorded output shows a
# ConnectError, i.e. the Ollama server refused the connection on that run.
ollama_model.invoke("Tell me a funny fact about cats")

ConnectError: [Errno 111] Connection refused