In [9]:
import json
import re

In [None]:
import pandas as pd
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain



# Load the dataset to analyze its structure
file_path = 'Dataset_2.0_Akkodis.xlsx'

# Open the workbook through a context manager so the underlying file handle is
# released as soon as the sheet has been read (an unclosed ExcelFile keeps the
# file open for the lifetime of the kernel). `data` stays bound afterwards but
# refers to a closed workbook.
with pd.ExcelFile(file_path) as data:
    sheet_names = data.sheet_names

    # Load the first sheet
    df = pd.read_excel(data, sheet_name=sheet_names[0])

df.columns = df.columns.str.strip()  # Handle whitespaces in column names

# Group rows by the ID column
id_column = 'ID'  # Replace with the actual ID column name in your dataset
if id_column not in df.columns:
    # Fail with an actionable message instead of the bare KeyError from groupby.
    raise KeyError(
        f"ID column {id_column!r} not found in sheet {sheet_names[0]!r}; "
        f"available columns: {list(df.columns)}"
    )
grouped_data = df.groupby(id_column)


In [2]:


# Analyze the structure of the dataset
columns = df.columns

# Use the first 3 groups as few-shot examples. zip() stops as soon as range(3)
# is exhausted, so the remaining groups are never converted to dict records
# (the previous version materialised every group and then sliced the list).
example_groups = [
    {id_column: group_id, "data": group.to_dict(orient="records")}
    for _, (group_id, group) in zip(range(3), grouped_data)
]

# Prepare a template prompt.
# Placeholders: {columns}, {examples}, {num_groups}.
# The examples are rendered with Python's repr, so missing values appear as
# `nan`; the prompt explicitly asks for JSON `null` instead because `nan` is
# not valid JSON and breaks parsing of the model output.
template = """
You are tasked with generating synthetic data for a dataset where each candidate has multiple rows (grouped by ID).
Columns: {columns}
Here are some example groups of rows:
{examples}

Please generate synthetic data for {num_groups} new groups, with each group having a unique ID and multiple related rows.
Output only a JSON array of objects, where each object represents a group with an ID and its rows.
Missing values appear as nan in the examples; write them as null so that the output is valid JSON.
"""


In [3]:
example_groups

[{'ID': 15,
  'data': [{'ID': 15,
    'Candidate State': 'Imported',
    'Age Range': '26 - 30 years',
    'Residence': 'SQUINZANO » Lecce ~ Puglia',
    'Sex': 'Male',
    'Protected category': nan,
    'TAG': nan,
    'Study area': 'computer engineering',
    'Study Title': 'Three-year degree',
    'Years Experience': '[0]',
    'Sector': nan,
    'Last Role': nan,
    'Year of insertion': '[2023]',
    'Year of Recruitment': nan,
    'Recruitment Request': nan,
    'Assumption Headquarters': nan,
    'Job Family Hiring': nan,
    'Job Title Hiring': nan,
    'event_type__val': 'CV request',
    'event_feedback': nan,
    'linked_search__key': nan,
    'Overall': nan,
    'Job Description': nan,
    'Candidate Profile': nan,
    'Years Experience.1': nan,
    'Minimum Ral': nan,
    'Ral Maximum': nan,
    'Study Level': nan,
    'Study Area.1': nan,
    'Akkodis headquarters': nan,
    'Current Ral': nan,
    'Expected Ral': nan,
    'Technical Skills': nan,
    'Standing/Position':

In [5]:

# Model served by the local Ollama instance; swap in any model you have pulled.
llm = Ollama(model="llama3")

# Bind the prompt template (and the three variables it expects) to the model.
prompt = PromptTemplate(
    template=template,
    input_variables=["columns", "examples", "num_groups"],
)
chain = LLMChain(prompt=prompt, llm=llm)


In [6]:

# How many synthetic groups the model is asked to produce.
num_synthetic_groups = 5

# Fill the prompt placeholders and run the chain; the raw model text is kept in `response`.
response = chain.run(
    num_groups=num_synthetic_groups,
    examples=example_groups,
    columns=list(columns),
)


In [None]:

# # Convert the response into a DataFrame
# synthetic_groups = eval(response)

# # Reformat the synthetic data into a flat DataFrame
# synthetic_rows = []
# for group in synthetic_groups:
#     for row in group["data"]:
#         synthetic_rows.append(row)

# synthetic_data = pd.DataFrame(synthetic_rows)

# # Save the synthetic data
# output_path = "outputs/Synthetic_Data_Grouped.xlsx"
# synthetic_data.to_excel(output_path, index=False)
# print(f"Synthetic data saved to {output_path}")



In [None]:
import json
import re
import pandas as pd

# Step 1: Extract JSON-like block
def extract_json(response):
    """Return the JSON array embedded in a fenced code block of an LLM reply.

    The array must start with ``[`` followed by ``{`` and sit right after an
    opening triple-backtick fence. The fence may carry a language tag such as
    ``json``. The closing fence and any text after it are dropped; if the reply
    was cut off before a closing fence, everything up to the end of the text
    is returned.

    Args:
        response: Raw text returned by the model.

    Returns:
        The text of the array, without the surrounding fences and without
        trailing whitespace.

    Raises:
        ValueError: If no fenced block starting with an array of objects is found.
    """
    # `.*?` is lazy so the capture stops at the first closing fence instead of
    # swallowing the fence and whatever prose follows it; `\Z` covers replies
    # that were truncated before the fence was closed.
    json_match = re.search(
        r'```[A-Za-z]*\s*(\[\s*\{.*?)\s*(?:```|\Z)', response, re.DOTALL
    )
    if not json_match:
        raise ValueError("No valid JSON block found in the response.")
    return json_match.group(1)

# Step 2: Clean JSON block
# Matches either a complete double-quoted JSON string literal or a bare `nan`
# token. Matching strings first lets the substitution skip over them, so text
# such as "Finance" or "nan" inside a quoted value is never rewritten.
_STRING_OR_BARE_NAN = re.compile(r'"[^"\\]*(?:\\.[^"\\]*)*"|\bnan\b')


def _nan_to_null(match):
    """Keep string literals unchanged and turn a bare ``nan`` into ``null``."""
    token = match.group(0)
    return token if token.startswith('"') else "null"


def clean_json(json_content):
    """Make an extracted JSON-like block acceptable to ``json.loads``.

    Bare ``nan`` tokens (Python's spelling of a missing float) are replaced
    with JSON ``null``, surrounding whitespace is removed, and a closing ``]``
    is appended when the block does not already end with one.

    Args:
        json_content: Text of the JSON array as extracted from the model reply.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON: a block that
        was cut off in the middle of an object still fails to parse.
    """
    # Replace only stand-alone `nan` values; a plain str.replace would also
    # corrupt words that merely contain the letters (e.g. "Finance").
    json_content = _STRING_OR_BARE_NAN.sub(_nan_to_null, json_content)
    json_content = json_content.strip()  # Remove extra spaces or newlines
    # Add a closing bracket if the JSON block is incomplete
    if not json_content.endswith(']'):
        json_content += ']'
    return json_content

# Step 3: Parse the model reply, flatten the groups into rows, and export them.
from pathlib import Path  # only needed here, to create the output folder

try:
    # Extract and clean the JSON block
    raw_json = extract_json(response)
    cleaned_json = clean_json(raw_json)

    # Parse the cleaned JSON
    synthetic_groups = json.loads(cleaned_json)

    # Flatten the groups into rows for DataFrame conversion
    synthetic_rows = []
    for group in synthetic_groups:
        # The model does not always follow the requested shape; report that
        # clearly instead of failing with an unhandled KeyError/TypeError.
        if not isinstance(group, dict) or "data" not in group:
            raise ValueError(f"Unexpected group structure in LLM output: {group!r}")
        synthetic_rows.extend(group["data"])

    # Convert to DataFrame
    synthetic_data = pd.DataFrame(synthetic_rows)

    # Save to Excel; create the target folder first because to_excel does not
    # create missing directories.
    output_path = "outputs/Synthetic_Data_Cleaned.xlsx"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    synthetic_data.to_excel(output_path, index=False)
    print(f"Synthetic data saved to {output_path}")

# json.JSONDecodeError is a subclass of ValueError, so it has to be handled
# first; in the opposite order this branch could never run.
except json.JSONDecodeError as je:
    print(f"JSONDecodeError: {je}")
except ValueError as ve:
    print(f"ValueError: {ve}")


Synthetic data saved to Synthetic_Data_Cleaned.xlsx


In [8]:
import json

# Attempt to treat the whole reply as JSON; this only succeeds when the model
# returned nothing but the array itself.
try:
    synthetic_groups = json.loads(response)
except json.JSONDecodeError as parse_error:
    # Fall back to an empty result so later cells still have a list to work
    # with, then show what the model actually returned for diagnosis.
    synthetic_groups = []
    print(f"Error parsing LLM response as JSON: {parse_error}")
    print("LLM Response:", response)


Error parsing LLM response as JSON: Expecting value: line 1 column 1 (char 0)
LLM Response: I can generate synthetic data for you. I will make sure that the generated data is diverse and does not duplicate any existing values.

Here are 5 new groups of synthetic data:

```
[
    {
        "ID": 42,
        "data": [
            {"ID": 42, "Candidate State": "Imported", "Age Range": "31 - 35 years", "Residence": "VERONA » Verona ~ Veneto", "Sex": "Female", "Protected category": nan, "TAG": nan, "Study area": "Biotechnology Engineering", "Study Title": "Three-year degree", "Years Experience": "[0]", "Sector": nan, "Last Role": nan, "Year of insertion": "[2022]", "Year of Recruitment": nan, "Recruitment Request": nan, "Assumption Headquarters": nan, "Job Family Hiring": nan, "Job Title Hiring": nan, "event_type__val": "CV request", "event_feedback": nan, "linked_search__key": nan, "Overall": nan, "Job Description": nan, "Candidate Profile": nan, "Years Experience.1": nan, "Minimum Ral": n