In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch

# Load your dataset
dataset = pd.read_csv('input/soc_sample.csv', header=None)
dataset = dataset[dataset.iloc[:, 0].str.contains('in the city of Halle', case=False)]

print(dataset.head)
# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient, lightweight model

# Prepare and embed data
text_data = dataset.apply(lambda row: f"{row.to_json()}", axis=1).tolist()
embeddings = embedding_model.encode(text_data, convert_to_tensor=True)


<bound method NDFrame.head of                           0       1            2            3       4  \
87953  in the city of Halle  female  born in bel      married  age 44   
87954  in the city of Halle    male  born in bel      married  age 32   
87955  in the city of Halle  female  born in bel      married  age 56   
87956  in the city of Halle  female  born in bel      married  age 82   
87957  in the city of Halle  female  born in bel      married  age 80   
...                     ...     ...          ...          ...     ...   
88954  in the city of Halle  female      foreign  not married  age 50   
88955  in the city of Halle  female      foreign  not married  age 77   
88956  in the city of Halle  female      foreign  not married  age 37   
88957  in the city of Halle    male      foreign  not married  age 37   
88958  in the city of Halle    male      foreign  not married  age 51   

                             5  
87953  number of people is 116  
87954   number of people is

In [30]:
from torch.nn.functional import cosine_similarity

def retrieve_relevant_context(query, embeddings, text_data, embedding_model, top_k=10):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    similarities = cosine_similarity(query_embedding.unsqueeze(0), embeddings)
    top_k_indices = torch.topk(similarities, top_k).indices
    return [text_data[idx] for idx in top_k_indices]


In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a small local language model (DistilGPT-2 as an example)
model_name = "distilgpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad token id if not already set (GPT models typically do not have a pad token)
if tokenizer.pad_token is None:
    print("setting tokens")
    tokenizer.pad_token = "<|endoftext|>"  # Set the pad token to the end of text token
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

def generate_answer(query, context):
    input_text = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    
    # Tokenize the input text with padding, truncation, and return tensors
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,               # Ensures proper padding
        truncation=True,            # Truncates to max length
        max_length=500              # Set a maximum length to avoid excessive tokenization
    )

    # Generate the response using attention masks
    outputs = model.generate(
        inputs['input_ids'],        # Use 'input_ids' from the tokenizer output
        attention_mask=inputs['attention_mask'],  # Include attention mask
        max_length=600,             # Adjust length as needed
        num_return_sequences=1,
        max_new_tokens=100
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

setting tokens


In [35]:
def ask_question(query):
    relevant_context = retrieve_relevant_context(query, embeddings, text_data, embedding_model)
    context = "\n".join(relevant_context)
    answer = generate_answer(query, context)
    return answer

# Example usage
question = "Hany males in Halle who were born in bel of age 30 are divorced"
print(ask_question(question))


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Context: {"0":"in the city of Halle","1":"male","2":"born in bel","3":"divorced","4":"age 66","5":"number of people is 32"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"not married","4":"age 66","5":"number of people is 29"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"not married","4":"age 4","5":"number of people is 217"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"not married","4":"age 17","5":"number of people is 219"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"divorced","4":"age 71","5":"number of people is 23"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"divorced","4":"age 68","5":"number of people is 38"}
{"0":"in the city of Halle","1":"female","2":"born in bel","3":"not married","4":"age 28","5":"number of people is 194"}
{"0":"in the city of Halle","1":"male","2":"born in bel","3":"divorced","4":"age 73","5":"number of people is 18"}
{"0":"in the city of Halle","1":"male","2":"born in bel

In [3]:
import pandas as pd
import torch
import faiss
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Load your dataset (replace 'your_dataset.csv' with your actual dataset file)
dataframe = pd.read_csv('input/soc_sample.csv')
dataframe = dataframe[dataframe.iloc[:, 0].str.startswith('in the city of Halle')]

# Initialize the embedding model (you can choose an appropriate model)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')



In [4]:

# Step 1: Embed the dataset
# Concatenate all column values for each row into a single string
text_data = dataframe.astype(str).agg(' '.join, axis=1).tolist()
embeddings = embedding_model.encode(text_data, convert_to_tensor=True)


In [5]:

# Step 2: Create a FAISS index for efficient similarity search
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)  # Using L2 distance
index.add(embeddings.cpu().numpy())  # Add embeddings to the index



In [6]:
# Load the language model
model_name = "distilgpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:

# Set the pad token id if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = "<|endoftext|>"
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

def retrieve_relevant_context(query, embedding_model, index, text_data, top_k=3):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    query_embedding_np = query_embedding.cpu().numpy().reshape(1, -1)  # Ensure it's 2D (1, embedding_size)
    _, top_k_indices = index.search(query_embedding_np, top_k)  # Retrieve top K indices
    return [text_data[idx] for idx in top_k_indices[0]]

def generate_answer(query):
    # Step 3: Retrieve relevant context from the dataset
    relevant_contexts = retrieve_relevant_context(query, embedding_model, index, text_data)
    context = "\n".join(relevant_contexts)  # Combine contexts for the language model

    # Create a structured input
    input_text = f"Dataset context:\n{context}\n\nQuestion about the dataset: {query}\nAnswer:"

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=600  # Adjust based on your needs
    )

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_return_sequences=1,
        max_new_tokens=10
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [15]:

# Example usage
question = "Hany males in Halle who were born in bel of age 30 are divorced"
answer = generate_answer(question)
print(answer)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Dataset context:
in the city of Halle male born in bel divorced age 73 number of people is 18
in the city of Halle male born in bel divorced age 70 number of people is 18
in the city of Halle male born in bel divorced age 64 number of people is 59

Question about the dataset: Hany males in Halle who were born in bel of age 30 are divorced
Answer:
Question:
Question:
Question:

