 # Libraries

In [2]:
%%capture
!pip install bertopic accelerate bitsandbytes adjustText

In [4]:
!pip install --no-deps xformers



In [5]:
pip install --upgrade sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import numpy as np # linear algebra
import pandas as pd 
from torch import bfloat16
import transformers
from huggingface_hub import notebook_login
from huggingface_hub import HfFolder
import re
import transformers
from tqdm import  tqdm
import os

# Data Cleaning

In [8]:
stopwords = ['tidio','tidio.','tidio,','forethought','ada','rulai','twixor',"twixor's",'botpress','intercom','kiliba','watermelon','Google','dialogflow','voiceflow','octocom','gleen','kore','kore.ai','kore.ai,','kore.ai.','aisera','praiz','humanlinker','cobbai','liveperson','zowie','replicant','zendesk','conversica','exceed', 'a', 'an', 'the']

In [9]:
def cleaning(comment):
    comment = comment.lower()
    comment = re.sub(r'[.,]', ' ', comment)
    comment = comment.strip()
    wordings = [token for token in comment.split()]
    wordings_2 = []
    for i in wordings:
        if i in stopwords:
            continue
        else:
            wordings_2.append(i)
    final_phrase = ' '.join(wordings_2).strip()
    return final_phrase

In [None]:
commentsa= pd.read_csv('/kaggle/input/g2-reviews/parsed_g2_reviews.csv')
commentsa['Likes'] = commentsa['Likes'].apply(cleaning)
commentsa['Dislikes'] = commentsa['Dislikes'].apply(cleaning)
commentsa['Problems Solved'] = commentsa['Problems Solved'].apply(cleaning)
commentsa['Complete Comments'] = '<br>What like about it: ' + commentsa['Likes'] + '<br>What dislike about it: ' + commentsa['Dislikes'] + '<br>Issue solved: ' + commentsa['Problems Solved']
commentsa.head(2)

# Gen AI and analysis

In [None]:
model_id = 'meta-llama/Llama-3.2-1B'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

# Prompting

In [17]:


def generate_classification(documents_text, model, tokenizer, max_tokens=1000):
    """
    Generate a classification response for GenAI tool reviews using a transformer model.
    
    Args:
        documents_text (str): The text containing the GenAI tool review
        model: The loaded transformer model
        tokenizer: The loaded tokenizer
        max_tokens (int): Maximum number of tokens to generate
    
    Returns:
        str: Classification results in dictionary format
    """
    # Define the complete prompt structure
    system_prompt = """
        <s>[INST] <<SYS>>
        You are an AI expert analyst specializing in evaluating GenAI tool implementations. Your task is to analyze user reviews and provide detailed, structured assessments focusing on implementation, personalization, ethical considerations, and user experience.
        <</SYS>>
        """
    
    example_prompt = """
       Here's an example analysis:
Review:
What like about it: The AI assistant handles initial lead conversations, which sales team sometimes despises.
What dislike about it: Nothing much to dislike. It's positively impacting business.
Issue solved: Sales team gets notified only for qualified leads, saving time and improving results.

Example Output:
{
    "Summary": "GenAI tool automates lead conversations and qualification in sales process",
    "Process": "Sales Leads",
    "Automatization": "Total Automatization",
    "Hyperpersonalization": "No",
    "Hyperpersonalization_think": "Only reshaping existing process, no personalized interactions mentioned",
    "Hyp_response": "No Detail",
    "Bias": "No",
    "Bias_think": "No mention of bias concerns",
    "Humanlikeness": "No",
    "Humanlikeness_think": "User shows no concerns about authenticity",
    "Privacy": "No Detail",
    "Privacy_think": "No privacy considerations mentioned",
    "Compliant": "No Detail",
    "Compliant_think": "No compliance issues discussed",
    "Ethics": "No Detail",
    "Ethics_think": "No ethical considerations mentioned",
    "Explainability": "No Detail",
    "Explainability_think": "No explanation concerns raised",
    "Transparency": "No Detail",
    "Transparency_think": "No transparency details provided"
}
        """

    main_prompt =f"""
    Now analyze this review:
    {documents_text}
    
    Provide a similar analysis in the same dictionary format, considering:
    1. Summary and process identification
    2. Automation level (Total/Medium/Small/No detail)
    3. Personalization aspects and reasoning
    4. Ethical considerations (bias, privacy, compliance)
    5. User experience (humanlikeness, transparency, explainability)
    
    Be specific in your reasoning and maintain the exact dictionary format shown in the example.
    [/INST] 
    """
    main_prompt_2 = f"""
        [INST] This is the text received: {documents_text}
        Based on the information above, please put the next information to be requested in a dictionary format, where 
        in each key it will placed different information:
        + Summary: Condense the text provided and send a summary of the key points.
        + Process: Detail which is the business process where the user implemented the tool.
        + Automatization: Classify the degree in which the user has implemented genai in the process by automatizating completly the whole steps or augmenatating the skills of the humans by the use of genai tools by one of next classes: 'Total Automatization', 'Medium automatization + human valid','Small automatization','No detail' 
        + Hyperpersonalization: It is considered hyperpersonalization when tool uses customer data, such as sales history and preferences, to generate responses or unique recommendations. Did the user mention examples of this type of hiperpersonalization in addition to comments?. Asnwer: 'Yes', 'No'
        + Hyperpersonalization_think: Explain why you response yes or no in the Hyperpersonalization question.
        + Hyp_response: How does user describe the use and effect of tool in the personalization of customer interactions? Being from an spectrum from experience highly personalized to customers, improving the personalization or no impact in personalizarion Answer: 'Hyperpersonalization', 'No Hiper', 'No Detail' 
        + Bias: Classify the message by understanding if the user mentions that the genai tool needs to be trained to eliminate biases as for example:  identifications, discriminations and more. Classify them by: 'Yes' , 'No'
        + Bias_think: Explain why you response yes or no in the Bias question.
        + Humanlikeness: Is the user expressly concerned about the lack of authenticity or human connection to the  user?. Answer by: 'Yes' ,'No' 
        + Humanlikeness_think: Explain why you response yes or no in the Humanlikeness question.
        + Privacy: The user mentions some specific information to protect the privacy and security of customer data when using tool?  Answer by: 'Yes' ,'No'   
        + Privacy_think: Explain why you response yes or no in the Privacy question.
        + Compliant: Classify the message to know if user mentions topic related to data regulations standards or any other topic related to the legal. Clasify them by: 'Yes' , 'No' 
        + Compliant_think: Explain why you response yes or no in the Compliant question.
        + Ethics: Does the user share any reflections on the ethical challenges of implementing tool in their work?. Classify them by: 'Yes', 'No'
        + Ethics_think: Explain why you response yes or no in the Ethics question.
        + Explainability: Does the user mention any difficulties in understanding or explaining how GenAI works?. Classify them by: 'Yes', 'No' 
        + Explainability_think: Explain why you response yes or no in the Explainability question.
        + Transparency: Does the user describe how customers are informed about the use of GenAI in customer service? Classify them by: 'Yes', 'No' 
        + Transparency_think: Explain why you response yes or no in the Transparency question.
        [/INST]
        """
    
    # Combine all prompts
    full_prompt = system_prompt + example_prompt + main_prompt
    
    # Create the generator pipeline
    generator = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        task='text-generation',
        temperature=0.01,
        max_new_tokens=max_tokens,
        repetition_penalty=1.1,
        #pad_token_id=tokenizer.pad_token_id,
        #bos_token_id=tokenizer.bos_token_id,
        #eos_token_id=tokenizer.eos_token_id
    )
    
    # Generate response
    response = generator(full_prompt)[0]['generated_text']
    response = response.split('[/INST]')[-1].strip()
    return response


In [18]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=

In [19]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [20]:
def process_dataset(commentsa, model, tokenizer, output_path):
    """
    Process the dataset and save results to a CSV file.
    
    Args:
        commentsa (pd.DataFrame): DataFrame containing the dataset
        model: The loaded transformer model
        tokenizer: The loaded tokenizer
        output_path (str): Path to save the output CSV file
    """
    for i, row in tqdm(commentsa.iterrows(), total=len(commentsa), desc="Processing dataset"):
        try:
            documents_text = row['Complete Comments']
            result = generate_classification(documents_text, model, tokenizer)
            
            # Append the result to the output DataFrame
            combined_row = row.to_dict()  # Convert the original row to dictionary
            combined_row['LLM_results'] = result  # Add the LLM results
            combined_row['row_index'] = i

            output_df = pd.DataFrame([combined_row])
            # Save the output to a CSV file
            output_df.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)
        
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")


In [21]:
output_path = "genai_classification_results.csv"
process_dataset(commentsa, model, tokenizer, output_path)

Processing dataset:   0%|          | 0/530 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Processing dataset:  43%|████▎     | 228/530 [3:34:55<4:44:41, 56.56s/it]


KeyboardInterrupt: 

In [None]:
commentsa['LLM_Responses'] = commentsa['Complete Comments'].apply(lambda x: generate_classification(x, model, tokenizer))  

In [20]:
response = generator(prompt)[0]['generated_text']
print(response)


<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>


We receive the following review of a genai tool, where the user explain what they like about it, dislike and what problem has been
solved:

<br>What like about it: The AI assistant takes up the responsibility of handling the long initial conversation with the leads, which the sales team sometimes despises.
<br>What dislike about it: There is nothing much I dislike about the. It's cool and is positively impacting the business so far.
<br>Issue solved: My sales team is always notified only when there is a qualified lead. This saves more quality time in the sales process and leads to more goal-oriented results. exactly what was needed.

Acording to the instructions detailed the output should be:
{Summary: GenAI tool is satysfing the process of converation with leads in the sales process by automating the chat and send notifications when a lead is qualified,
 Automatization: 'Total Automat