In [None]:
import csv
import os

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Load the dataset
# The path must be a plain string: a trailing comma after the literal would
# turn it into a 1-tuple, which pd.read_csv rejects.
input_file = "data/fake_data_systems_tables.csv"
data = pd.read_csv(input_file)

In [None]:
# Column names exactly as they appear in the loaded dataset
original_columns = data.columns.tolist()

# Names of the attributes generated for every table
generated_columns = ['ColumnNames', 'ColumnAcronyms', 'DataTypes', 'Nullability', 'ColumnDescriptions']

# Empty placeholder frame for results; nothing is written to it in this cell
results_df = pd.DataFrame()

# Preview the input data. Only the last expression of a cell is rendered, so
# the preview of the (still empty) results frame that used to sit above this
# line never produced any output and has been dropped.
data.head()

In [None]:
# Keep only the first three rows so the notebook runs quickly.
# Comment out the line below to process the full dataset.
data = data.iloc[:3]

In [None]:

from huggingface_hub import login
# SET YOUR HUGGINGFACE LOGIN TOKEN IN THE HF_TOKEN ENVIRONMENT VARIABLE
# The token is read from the environment so that no secret is stored in the
# notebook. When HF_TOKEN is unset, token=None makes login() ask for the
# token interactively instead.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
# THE FIRST TIME YOU RUN THIS, IT MIGHT TAKE A WHILE

model_path_or_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)

# 4-bit quantization settings are grouped in a BitsAndBytesConfig and handed
# over through `quantization_config`; passing `load_in_4bit` /
# `bnb_4bit_compute_dtype` directly to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path_or_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    quantization_config=quantization_config,
)

def generate(prompt):
    """Generate a sampled completion for ``prompt`` and return only the new text.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is done with
    ``top_p=0.9`` and ``temperature=0.9`` for at most 1000 new tokens.

    Args:
        prompt: The text to send to the model.

    Returns:
        The decoded completion as a string, without the prompt and without
        special tokens.
    """
    # Tokenize the input and move the tensors to wherever the model lives
    # instead of assuming a CUDA device.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    # Generate new tokens based on the prompt, up to max_new_tokens.
    # The attention mask is passed explicitly so generate() does not have to
    # infer it from the input ids.
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1000,
            do_sample=True,
            top_p=0.9,
            temperature=0.9,
            use_cache=True,
        )

    # The returned sequence starts with the prompt tokens. Cut them off by
    # token count: slicing the decoded string by len(prompt) is wrong whenever
    # decoding does not reproduce the prompt character-for-character or the
    # prompt was truncated.
    new_tokens = outputs[0, input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

In [None]:
import pandas as pd

def generate_table_structure(info):
    """Build the prompt that asks for a 20-column synthetic table structure.

    Args:
        info: Table information that is interpolated into the prompt as-is.

    Returns:
        A two-line prompt string: the request containing ``info``, followed by
        the instruction to answer in CSV format.
    """
    request_line = (
        "Please generate a table with 20 synthetic column names, column acronyms, "
        "data types, nullability, and column descriptions for the given table "
        f"information: {info}."
    )
    # The leading spaces are part of the prompt text and are kept verbatim.
    format_line = "    Provide the output in CSV format."
    return "\n".join((request_line, format_line))

def parse_table_structure(table_text):
    """Parse generated CSV text into a list of column-definition dictionaries.

    Each line is read as one CSV record, so quoted fields may contain commas
    (common in descriptions) and surrounding quotes are removed. Lines that do
    not yield exactly five fields are ignored, as is a header row whose first
    field is "Column Name" (in any spacing/underscore/case variant).

    Args:
        table_text: Raw text produced by the model.

    Returns:
        A list of dicts with the keys "Column Name", "Acronym", "Data Type",
        "Nullability" and "Description"; empty if nothing could be parsed.
    """
    keys = ("Column Name", "Acronym", "Data Type", "Nullability", "Description")
    table_data = []
    for line in table_text.strip().splitlines():
        try:
            # skipinitialspace lets a quoted field follow ", " as well as ",".
            fields = next(csv.reader([line], skipinitialspace=True), [])
        except csv.Error:
            fields = []
        if len(fields) != len(keys):
            # Fall back to the plain split so that every line accepted before
            # (exactly four commas) is still accepted, e.g. with a stray quote.
            fields = line.split(',')
        if len(fields) != len(keys):
            continue

        fields = [field.strip() for field in fields]

        # A CSV answer usually starts with a header row; it is not a column.
        first_field = fields[0].lower().replace("_", "").replace(" ", "")
        if first_field in ("columnname", "columnnames"):
            continue

        table_data.append(dict(zip(keys, fields)))
    return table_data

# Build one prompt per row of `data`; the whole row is serialised to JSON and
# embedded in the prompt.
prompts = data.apply(lambda row: generate_table_structure(row.to_json()), axis=1)

# Convert prompts to list for processing
prompts_list = list(prompts.values)

# Column order of the exported CSV. It is passed to the DataFrame constructor
# so the file still gets a header row when no generated row could be parsed;
# a header-less, empty CSV cannot be read back with pd.read_csv.
output_columns = [
    'Source System Name',
    'Source System Acronym',
    'Table Name',
    'Table Description',
    'Generated_ColumnNames',
    'Generated_ColumnAcronyms',
    'Generated_DataTypes',
    'Generated_Nullability',
    'Generated_ColumnDescriptions',
]

# Prepare a list to store all generated tables
generated_tables = []

# Invoke the model once per prompt and parse its answer. prompts_list has one
# entry per row of `data`, so `idx` is always a valid position in `data`.
for idx, prompt in enumerate(prompts_list):
    source_row = data.iloc[idx]

    # generate() returns the completion as a string
    table_text = generate(prompt).strip()
    parsed_table = parse_table_structure(table_text)
    if not parsed_table:
        # Make dropped tables visible instead of losing them silently.
        print(f"Warning: no columns could be parsed for table {source_row['Table Name']!r}")

    # Keep the original table identification next to the parsed columns
    generated_tables.append({
        'Table Name': source_row['Table Name'],
        'Table Description': source_row['Table Description'],
        'Generated Table': parsed_table
    })

# Flatten to one output record per generated column. generated_tables is in
# the same order as `data`, so `idx` addresses the matching source row.
output_data = []
for idx, table_info in enumerate(generated_tables):
    source_row = data.iloc[idx]
    for row in table_info['Generated Table']:
        output_data.append({
            'Source System Name': source_row['Source System Name'],
            'Source System Acronym': source_row['Source System Acronym'],
            'Table Name': table_info['Table Name'],
            'Table Description': table_info['Table Description'],
            'Generated_ColumnNames': row['Column Name'],
            'Generated_ColumnAcronyms': row['Acronym'],
            'Generated_DataTypes': row['Data Type'],
            'Generated_Nullability': row['Nullability'],
            'Generated_ColumnDescriptions': row['Description']
        })

# Convert the output data into a DataFrame
final_df = pd.DataFrame(output_data, columns=output_columns)

# Save to CSV
output_file = 'data/generated_table_structures_with_original_data.csv'
final_df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")
