# Data Enrichement with Mixtral

## 1. Library imports and Dataset Imports

In [1]:
import logging
import json
import os
from aws_lambda_powertools import Metrics, Tracer, Logger
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3
from opensearchpy.helpers import bulk
from sentence_transformers import SentenceTransformer
import PyPDF2
import os
from random import randint
from time import sleep

import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os
import cv2
import pytesseract
import torch
import pandas as pd


OS_DOMAIN_NAME = os.getenv('OS_DOMAIN_NAME')
REGION = 'eu-west-2'
CHUNK_SIZE = 1200

os.environ["ENDPOINT_NAME"]="mixtral-8x-7b-instruct-endpoint"
os.environ["OS_DOMAIN_NAME"]="genai-contract"
os.environ["REGION"]="eu-west-2"
os.environ["CHUNK_SIZE"]="1200"
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE"))

REGION = os.getenv('REGION')
OS_DOMAIN_NAME = os.getenv('OS_DOMAIN_NAME')
ENDPOINT_NAME = os.getenv('ENDPOINT_NAME')
endpoint_name = os.getenv('ENDPOINT_NAME')
BUCKET = os.getenv('BUCKET')
# Configure logging  
logging.basicConfig(level=logging.INFO) 

## 2. Calling Mixtral endpoint

In [2]:
def query_endpoint(endpoint,prompt,region,temperature=0.3,max_new_tokens=700):
    """
    Query the LLM endpoint with the given payload.
    :param endpoint: The name of the endpoint to query.
    :param payload: The payload to send to the endpoint.
    :return: The response from the endpoint.
    :rtype: string 
    """
    try:
        sagemaker_client = boto3.client("sagemaker-runtime",region_name=region)
        payload = {
            "inputs": prompt, 
            "parameters": {
                "do_sample": False,
                "top_k": 10,
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
                "stop": ["<|endoftext|>", "</s>"]
        }}
        response = sagemaker_client.invoke_endpoint(
            EndpointName=endpoint,
            ContentType="application/json",
            Body=json.dumps(payload),
        )
        response = response["Body"].read().decode("utf8")
        response = json.loads(response)
        return response[0]['generated_text']
    
    except Exception as e:
        logging.error("Error @ query_endpoint: {}".format(e))

In [3]:
message = "what is your name, Are u mistral or mixtral model, What is the difference between the two ?"
output = query_endpoint(endpoint_name,message,REGION)
print(output)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
ERROR:root:Error @ query_endpoint: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint mixtral-8x-7b-instruct-endpoint of account 767397757887 not found.


None


## 3. Generating Text

In [None]:
df = pd.read_csv("/home/ec2-user/SageMaker/hargurjeet/lambda-vectorization/Customertravel.csv")

# Set up the base prompt  
base_prompt = {  
    "role": "system",  
    "content": "You are an AI assistant with good knowledge of the travel industry.\
                A Tour & Travels Company Wants To Predict Whether A Customer Will Churn Or Not Based On Indicators Given Below.\
                Convert the following row to a detailed textual description:\
                Consider the following constraints: 1.Dont repeat similar discription for every line. No need to follow an order\
                while generation of discription which mean the age can go in last or annual income can be in first...etc.\
                dont include this (Here's a detailed textual description of the given row:) for each response.\
"  
}  

# Define the column names  
column_names = ["Age", "FrequentFlyer", "AnnualIncomeClass", "ServicesOpted", "AccountSyncedToSocialMedia", "BookedHotelOrNot"]  

generated_texts = []  
for index, row in df.iterrows():  
    prompt = base_prompt.copy()  
    row_description = ""  
    for col_name in column_names:  
        row_description += f"{col_name}: {row[col_name]}, "  
    prompt["content"] += row_description  
    generated_text = query_endpoint(endpoint_name, json.dumps(prompt), REGION)
    
    # Find the start of the description (after the prompt)  
    start_index = generated_text.find("Here's a detailed description of the given indicators:")  
    if start_index != -1:  
        generated_text = generated_text[start_index + len("Here's a detailed description of the given indicators:"):]  # Keep the part after the start 
        
    generated_text = generated_text.strip()
    generated_text.replace("\n\n", "")
    generated_texts.append(generated_text)
    
df["GeneratedText"] = generated_texts

In [81]:
df = pd.read_csv("/home/ec2-user/SageMaker/hargurjeet/lambda-vectorization/mixtral_original_output.csv")
df

Unnamed: 0.1,Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target,GeneratedText
0,0,34,No,Middle Income,6,No,Yes,0,"{""role"": ""system"", ""content"": ""You are an AI a..."
1,1,34,Yes,Low Income,5,Yes,No,1,The customer's age is 34. This is a significan...
2,2,37,No,Middle Income,3,Yes,No,0,The customer's age is 37. This is a significan...
3,3,30,No,Middle Income,2,No,No,0,"{""role"": ""system"", ""content"": ""You are an AI a..."
4,4,30,No,Low Income,1,No,No,0,The customer's age is 30. This could indicate ...
...,...,...,...,...,...,...,...,...,...
949,949,31,Yes,Low Income,1,No,No,0,The customer's age is 31. This is a relatively...
950,950,30,No,Middle Income,5,No,Yes,0,The customer's age is 30. This could indicate ...
951,951,37,No,Middle Income,4,No,No,0,The customer's age is 37. This is a significan...
952,952,30,No,Low Income,1,Yes,Yes,0,The customer's age is 30. This could indicate ...


## 4. Saving Final output

In [None]:
df.to_csv('mixtral_output.csv')

In [82]:
df.head

<bound method NDFrame.head of      Unnamed: 0  Age FrequentFlyer AnnualIncomeClass  ServicesOpted  \
0             0   34            No     Middle Income              6   
1             1   34           Yes        Low Income              5   
2             2   37            No     Middle Income              3   
3             3   30            No     Middle Income              2   
4             4   30            No        Low Income              1   
..          ...  ...           ...               ...            ...   
949         949   31           Yes        Low Income              1   
950         950   30            No     Middle Income              5   
951         951   37            No     Middle Income              4   
952         952   30            No        Low Income              1   
953         953   31           Yes       High Income              1   

    AccountSyncedToSocialMedia BookedHotelOrNot  Target  \
0                           No              Yes       0   

## 5. Cleaning text

In [94]:
# Function to extract clean text
def extract_clean_text(text):
    start_pos = text.find("The customer")
    if start_pos != -1:
        return text[start_pos:]
    else:
        return text 
    
mask = df_cleaned['GeneratedText'].str.startswith(')')

# Apply the function to the filtered rows
df_cleaned.loc[mask, 'GeneratedText'] = df_cleaned.loc[mask, 'GeneratedText'].apply(extract_clean_text)
df_cleaned

Unnamed: 0.1,Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target,GeneratedText
0,0,34,No,Middle Income,6,No,Yes,0,The customer is 34 years old and does not have...
1,1,34,Yes,Low Income,5,Yes,No,1,The customer's age is 34. This is a significan...
2,2,37,No,Middle Income,3,Yes,No,0,The customer's age is 37. This is a significan...
3,3,30,No,Middle Income,2,No,No,0,The customer is a 30-year-old individual who d...
4,4,30,No,Low Income,1,No,No,0,The customer's age is 30. This could indicate ...
...,...,...,...,...,...,...,...,...,...
949,949,31,Yes,Low Income,1,No,No,0,The customer's age is 31. This is a relatively...
950,950,30,No,Middle Income,5,No,Yes,0,The customer's age is 30. This could indicate ...
951,951,37,No,Middle Income,4,No,No,0,The customer's age is 37. This is a significan...
952,952,30,No,Low Income,1,Yes,Yes,0,The customer's age is 30. This could indicate ...


In [95]:
df_cleaned.to_csv("mixtral_output_post_cleaning.csv")

In [None]:

def process_and_save_batches(df, batch_size, output_prefix):
    """Process the DataFrame in batches and save each batch to a CSV file."""
    num_batches = (len(df) + batch_size - 1) // batch_size
    for i in range(num_batches):
        start = i * batch_size
        end = min((i + 1) * batch_size, len(df))
        batch_df = df.iloc[start:end]

        generated_texts = []
        for _, row in batch_df.iterrows():
            messages = [
                {"role": "system", "content": "You are an AI assistant with good knowledge of the travel industry.\
                A Tour & Travels Company Wants To Predict Whether A Customer Will Churn Or Not Based On Indicators Given Below.\
                Convert the following row to a detailed textual description:\
                Consider the following constraints: 1.Dont repeat similar discription for every line. No need to follow an order\
                while generation of discription which mean the age can go in last or annual income can be in first...etc.\
                dont include this (Here's a detailed textual description of the given row:) for each response.\
                "},
                {"role": "user", "content": f"Age: {row['Age']}, FrequentFlyer: {row['FrequentFlyer']}, AnnualIncomeClass: \
                {row['AnnualIncomeClass']}, ServicesOpted: {row['ServicesOpted']}, \
                AccountSyncedToSocialMedia: {row['AccountSyncedToSocialMedia']}, BookedHotelOrNot: {row['BookedHotelOrNot']}"}
            ]
            generated_text = generate_text(text_generation_pipeline, messages)
            generated_texts.append(generated_text)

        batch_df['GeneratedText'] = generated_texts

        # Save batch to CSV
        output_file = f"{output_prefix}_batch_{i + 1}.csv"
        batch_df.to_csv(output_file, index=False)

        # Download the CSV file
        files.download(output_file)
        print(f"Batch {i + 1} processed and saved as {output_file}")


process_and_save_batches(customer_data_tail, batch_size=50, output_prefix='customer_travel_with_text_description')

## Important note - 

This is not the final notebook, and there are a series of notebooks that should be followed after this one. The reason I haven't included all the code in one place is due to computational requirements. I cannot rerun all the code again to format it.

Please follow the order outlined below after reviewing this notebook:

- 05-Mixtral_training_ml_model
- 06-AdvancedExperiments