In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load the dataset of source systems / tables that drives prompt generation.
# NOTE: there must be no trailing comma after the path -- a trailing comma
# turns `input_file` into a 1-tuple, which `pd.read_csv` rejects.
input_file = "data/fake_data_systems_tables.csv"
data = pd.read_csv(input_file)

In [None]:
# Column names present in the input dataset.
original_columns = data.columns.tolist()

# Names of the per-column attributes the model is asked to generate.
generated_columns = ['ColumnNames', 'ColumnAcronyms', 'DataTypes', 'Nullability', 'ColumnDescriptions']

# Empty placeholder frame; nothing in this cell populates it.
results_df = pd.DataFrame()

# Preview the input rows. Only the final expression of a cell is rendered, so
# the former `results_df.head(10)` line (an empty frame whose output was
# discarded anyway) has been dropped.
data.head()

In [None]:

import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in version control
# and in shared copies. Export HF_TOKEN in the environment that starts the
# kernel instead. When the variable is unset, `token=None` makes `login`
# fall back to its interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)


In [None]:
from transformers import BitsAndBytesConfig

# The first time this cell runs it might take a while
# (presumably while the checkpoint is fetched and cached -- TODO confirm).
model_path_or_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)

# 4-bit quantization settings are passed through an explicit config object
# rather than as loose `from_pretrained` keyword arguments. The compute dtype
# is aligned with `torch_dtype` below (it was float16 before) so quantized and
# non-quantized layers work in the same precision.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path_or_id,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    # `attn_implementation` alone selects FlashAttention-2; the deprecated
    # `use_flash_attention_2=True` flag duplicated it and has been removed.
    attn_implementation="flash_attention_2",
)

In [None]:
import pandas as pd

def generate_table_structure(info):
    """Build the text prompt that asks for a synthetic 20-column table.

    Args:
        info: Value interpolated into the prompt (formatted with ``str.format``
            semantics, exactly as an f-string placeholder would).

    Returns:
        The two-line prompt string. The second line keeps its four leading
        spaces, matching the original triple-quoted literal.
    """
    request_line = (
        "Please generate a table with 20 column names, column acronyms, "
        "data types, nullability, and column descriptions for the given "
        f"information: {info}."
    )
    format_line = "    Provide the output in CSV format."
    return "\n".join((request_line, format_line))

def parse_table_structure(table_text):
    """Parse CSV-like generated text into a list of column dictionaries.

    Each line is parsed with the ``csv`` module, so a quoted field may contain
    commas (e.g. ``"Primary key, auto-generated"``) without the row being
    discarded. Surrounding whitespace is stripped from every field, and both
    ``\\n`` and ``\\r\\n`` line endings are accepted.

    Args:
        table_text: Raw text expected to hold one column definition per line
            in the order: name, acronym, data type, nullability, description.

    Returns:
        A list of dicts with the keys ``"Column Name"``, ``"Acronym"``,
        ``"Data Type"``, ``"Nullability"`` and ``"Description"``. Lines that do
        not yield exactly five fields are skipped.
    """
    import csv  # local import so this definition is self-contained

    keys = ("Column Name", "Acronym", "Data Type", "Nullability", "Description")
    table_data = []
    for line in table_text.strip().splitlines():
        # Parse one line at a time so a stray quote cannot swallow later lines.
        # `skipinitialspace` lets `a, "b, c"` be recognised as a quoted field.
        fields = next(csv.reader([line], skipinitialspace=True), [])
        if len(fields) != len(keys):
            continue
        table_data.append(dict(zip(keys, (field.strip() for field in fields))))
    return table_data

# Build one prompt per input row; each row is serialised to JSON and embedded
# in the prompt text.
prompts = data.apply(lambda row: generate_table_structure(row.to_json()), axis=1)

# Convert prompts to a plain list for batching
prompts_list = list(prompts.values)

# Generation settings. Adjust to the available GPU memory / expected output size.
MAX_NEW_TOKENS = 1024  # upper bound on generated tokens per prompt
BATCH_SIZE = 4         # prompts tokenized and generated together

# Batched generation needs a padding token; reuse EOS when the tokenizer
# does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position, so pad on the left to
# keep the real prompt adjacent to the generated tokens.
tokenizer.padding_side = "left"

# `model.generate` works on token ids, not raw strings: tokenize each batch,
# generate, then decode only the newly produced tokens back to text.
outputs = []
for start in range(0, len(prompts_list), BATCH_SIZE):
    batch_prompts = prompts_list[start:start + BATCH_SIZE]
    encoded = tokenizer(batch_prompts, return_tensors="pt", padding=True).to(model.device)
    with torch.inference_mode():
        generated_ids = model.generate(
            **encoded,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=tokenizer.pad_token_id,
        )
    # The returned sequences start with the (padded) prompt; slice it off.
    prompt_length = encoded["input_ids"].shape[1]
    continuation_ids = generated_ids[:, prompt_length:]
    outputs.extend(tokenizer.batch_decode(continuation_ids, skip_special_tokens=True))

# Parse every generated text. `outputs[i]` was produced from `data.iloc[i]`,
# so `generated_tables` stays index-aligned with `data`.
generated_tables = []
for idx, table_text in enumerate(outputs):
    source_row = data.iloc[idx]
    generated_tables.append({
        'Table Name': source_row['Table Name'],
        'Table Description': source_row['Table Description'],
        'Generated Table': parse_table_structure(table_text.strip())
    })

# Flatten the generated tables into one record per generated column.
output_columns = [
    'Source System Name',
    'Source System Acronym',
    'Table Name',
    'Table Description',
    'Generated_ColumnNames',
    'Generated_ColumnAcronyms',
    'Generated_DataTypes',
    'Generated_Nullability',
    'Generated_ColumnDescriptions',
]

output_data = []
# `generated_tables` holds one entry per input row, in input order, so the
# position of each entry identifies its source row in `data`. (Previously a
# stale loop variable was used here, which stamped every record with the
# source-system values of the last input row.)
for row_idx, table_info in enumerate(generated_tables):
    source_row = data.iloc[row_idx]
    for row in table_info['Generated Table']:
        output_data.append({
            'Source System Name': source_row['Source System Name'],
            'Source System Acronym': source_row['Source System Acronym'],
            'Table Name': table_info['Table Name'],
            'Table Description': table_info['Table Description'],
            'Generated_ColumnNames': row['Column Name'],
            'Generated_ColumnAcronyms': row['Acronym'],
            'Generated_DataTypes': row['Data Type'],
            'Generated_Nullability': row['Nullability'],
            'Generated_ColumnDescriptions': row['Description']
        })

# Build the frame in one go; passing `columns` keeps the CSV header even when
# no rows could be parsed.
final_df = pd.DataFrame(output_data, columns=output_columns)

# Save to CSV
output_file = 'data/generated_table_structures_with_original_data.csv'
final_df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")
