In [None]:
## Cambio de formato gramatical - Alternativa local
## Se hace uso del modelo cuantizado Llama 3.1 8B Instruct, pero se podrá utilizar con cualquier modelo siempre y cuando se use el formato correcto de prompt.

In [1]:
%%time
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-09-03 11:30:26.367823: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-03 11:30:26.389233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-03 11:30:26.410578: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-03 11:30:26.417014: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-03 11:30:26.436002: I tensorflow/core/platform/cpu_feature_guar

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.645 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
CPU times: user 13.6 s, sys: 3.56 s, total: 17.2 s
Wall time: 16.6 s


In [2]:
import csv

## Función para extraer SOLO el output del modelo.
def output_cleaner(output):
    # Convert list to string
    output_str = output[0]
    # Define the start of the assistant's response
    assistant_start = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    # Find the index of the assistant's response start
    user_index = output_str.find(assistant_start)
    # If the assistant's response is not found, return an empty string or handle the error
    if user_index == -1:
        return ""
    # Slice the string to remove everything up to the start of the assistant's response
    processed_output = output_str[user_index + len(assistant_start):]
    # Trim any extra newline or unwanted characters at the end (optional based on your data)
    processed_output = processed_output.strip()
    # Remove any trailing unwanted characters (such as extra newline or space)
    # This assumes the response ends with a newline
    if processed_output.endswith("<|eot_id|>"):
        processed_output = processed_output[:-10]
    
    return processed_output

# Función para convertir afirmaciones a preguntas
def sentence_to_question(cell_content):
    
    prompt ="""<|start_header_id|>system<|end_header_id|>

            You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

            For the following text extract, engineer a prompt asking for the reference/research for the text extract given. Write ONLY ONE SENTENCE. If you find abbreviations of words or phrases that can’t be deduced from the context don’t make them up, use them as is. Keep the main subject of the topic (paraphrasing is allowed). Remove any text mentioning or pointing to a source found in the original sentence. Openings can include (but shouldn't be limited to, be creative!): ‘Where can I find more information about…’, ‘Could you recommend some articles on…’, ‘I'm looking for articules about…’, ‘Do you know of any sources that compare…’, ‘I need references for a paper comparing…’, ‘What are the latest studies on…’, ‘What are some authoritative books on…’, ‘I'm interested in X. What sources should I look at?’, ‘Where can I find technical details about…’, ‘I need data on…’, ‘I'm looking for expert opinions on…’, ‘Where can I find information about…’, ‘Can you recommend any textbooks for…’, or anything else you come up with, finding the best alternative for each sentence. For example:
prompt: However, in the same year, the number of firms seeking to capitalize on big data utilization decreased by about 6.1 percent (Van der Meulen, 2016) .
bot: Find me research relating big data utilization and the decrease of firms looking to capitalize on it.
prompt: With the increasingly digital economy, the open and collaborative models become economically more viable [ 2 ].
bot: Where can I learn about the economic viability of open and collaborative models?
prompt: Borrowing from industrial organization economics, the theories adhered to suggest that, unless otherwise impeded, competitive forces should drive abnormally high profits toward more normal levels (Jacobson, 1988; Mueller, 1986; Rumelt, 1987, 1991).
bot: Can you recommend any textbooks about competition driving price drops?
prompt: {}
bot:<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer(
    [
        prompt.format(
            "Where can I find research about new navigational location techniques", # input
            "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

## Inferencia para convertir oración en pregunta
    model_answer= model.generate(**inputs, temperature=0.7, max_new_tokens = 256, use_cache = True)
    output=tokenizer.batch_decode(model_answer)
    cleaned_output=output_cleaner(output)
    print(cleaned_output+"\n")
    return cleaned_output

# Función para procesar el CSV
def process_csv(input_csv, output_csv, input_column_name, output_column_name):
    with open(input_csv, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames + [output_column_name]  # Add new column name
        rows = []

        # Procesar cada file
        for row in reader:
            cell_content = row[input_column_name]
            processed_content = sentence_to_question(cell_content)
            row[output_column_name] = processed_content  # Add processed content to new column
            rows.append(row)

    # Escribir el output al CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Ejemplo de uso
input_csv = 'cleaned_merged.csv'        # Input CSV file path
output_csv = 'output.csv'      # Output CSV file path
input_column_name = 'Context'  # Name of the column to process
output_column_name = 'Question'  # Name of the new column

process_csv(input_csv, output_csv, input_column_name, output_column_name)

Where can I find research about the role of AI in improving supply chain management?

Where can I find more information about the economic viability of open and collaborative models in the context of the increasingly digital economy?

Where can I find research about the use of artificial intelligence in decision-making processes?

Could you recommend some articles on the impact of big data utilization on firms seeking to capitalize on it?

Could you recommend some articles on the economic viability of open and collaborative models in the digital economy?

Could you recommend some articles on the impact of digitalization on the economic viability of open and collaborative models?

Where can I find technical details about the relationship between the number of firms seeking to capitalize on big data utilization and its economic viability?

Where can I find information about the relationship between big data utilization and the decrease of firms looking to capitalize on it?

Where can I f

In [None]:
import csv

# Define your custom function here
def custom_function(cell_content):
    # Example: convert to uppercase (you can modify this function)
    return cell_content.upper()

# Function to process the CSV
def process_csv(input_csv, output_csv, input_column_name, output_column_name):
    with open(input_csv, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        fieldnames = reader.fieldnames + [output_column_name]  # Add new column name
        rows = []

        # Process each row
        for row in reader:
            cell_content = row[input_column_name]
            processed_content = custom_function(cell_content)
            row[output_column_name] = processed_content  # Add processed content to new column
            rows.append(row)

    # Write to the output CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Example usage
input_csv = 'input.csv'        # Input CSV file path
output_csv = 'output.csv'      # Output CSV file path
input_column_name = 'original_column'  # Name of the column to process
output_column_name = 'processed_column'  # Name of the new column

process_csv(input_csv, output_csv, input_column_name, output_column_name)
