# Feature Engineering

In [1]:
# import libraries
import os 
import pandas as pd
import time

In [2]:
# Read the merged customer dataset from the processed-data folder
df = pd.read_csv(filepath_or_buffer='../data/exam/processed/merged_data.csv')

# Show the first five rows as a quick look at the loaded columns
display(df.head(n=5))

Unnamed: 0,CustomerID,Age,Gender,Location,MembershipLevel,TotalPurchases,TotalSpent,FavoriteCategory,LastPurchaseDate,WebsiteClickRate,TimeSpentOnSite,SocialMediaEngagement,AdClickHistory,GeneratedReview,CustomerSentimentScore,PersonaTag,Churn
0,4efed90,Female,Denver,CO,Silver,12,753.6,Clothing,2023-10-20,0.065,15.2,Medium,Clicked,"Great experience, love this store!",0.91,Regular Buyer,0
1,d7f26e8,Male,Los Angeles,CA,Gold,28,2155.4,Electronics,2023-10-25,0.092,22.5,High,Sometimes,Very happy with my purchases.,0.95,Loyal,0
2,6b4a427,Other,Chicago,IL,Platinum,41,4510.1,Home Goods,2023-09-18,0.115,28.1,High,Clicked,Excellent service and products.,0.98,Loyal,0
3,68eec52,Male,Houston,TX,Bronze,2,85.7,Books,2023-08-01,0.041,8.9,Low,Not Clicked,"Had some issues, not fully satisfied.",0.52,Window Shopper,1
4,3a2af82,Female,Phoenix,AZ,Silver,18,1220.5,Beauty,2023-10-10,0.078,18.7,Medium,Clicked,Will definitely buy again.,0.88,Engaged,0


In [3]:
# configure api
from dotenv import load_dotenv
import os

# Load variables via python-dotenv, then look up the Gemini key.
# The lookup yields None when GEMINI_API_KEY is not set.
load_dotenv()
gemini_api_key = os.environ.get("GEMINI_API_KEY")

In [4]:
from google import genai
from google.genai import types

# Model identifiers; the request code indexes into this list (single entry for now)
model = ["gemini-2.5-flash-preview-04-17"]

# Request JSON output instead of free-form text
generate_content_config = types.GenerateContentConfig(response_mime_type="application/json")

# Gemini client authenticated with the key read from the environment
client = genai.Client(api_key=gemini_api_key)

In [5]:
import json
from datetime import datetime

def process_sentiment_batch(batch_df):
    """
    Ask the Gemini model to rate the sentiment of every review in a batch.

    One prompt is built from each row's ``CustomerID`` and ``GeneratedReview``
    and sent in a single ``generate_content`` request, using the notebook-level
    ``client``, ``model`` and ``generate_content_config`` objects.

    Args:
        batch_df: DataFrame slice containing ``CustomerID`` and
            ``GeneratedReview`` columns.

    Returns:
        The ``text`` attribute of the model response. The request config asks
        for ``application/json``, so this is expected to be a JSON string, but
        it is returned unparsed and unvalidated.
    """
    # Serialise the batch as real JSON rather than the Python repr of a list of
    # dicts: quotes inside reviews are escaped correctly and a missing review
    # becomes `null` instead of the bare token `nan`.
    rows = [
        {
            "id": customer_id,
            "review": None if pd.isna(review) else review,
        }
        for customer_id, review in zip(batch_df["CustomerID"], batch_df["GeneratedReview"])
    ]
    # default=str keeps serialisation from failing on non-JSON-native scalars
    rows_json = json.dumps(rows, ensure_ascii=False, default=str)

    # The output key is named explicitly so downstream parsing does not have to
    # guess which field name the model picked.
    prompt = f"""Analyze the sentiment in these responses and rate each on a scale from 1-5:
    - Sentiment level (1: Very Bad, 5: Very Good)

    For each customer, return ONLY a JSON object with their "id" and one numerical rating under the key "sentiment_level".
    If a response is missing or unclear, assign a neutral value of 3.

    Customer responses:
    {rows_json}
    """

    response = client.models.generate_content(model=model[0], contents=prompt, config=generate_content_config)

    return response.text

In [6]:
# Process the dataframe in batches of `batch_size` rows, one API request per attempt.
batch_size = 40
request_count = 0          # total API calls actually issued, retries included
max_retries = 3
requests_per_pause = 5     # take a break after this many requests
requests_since_pause = 0
output_dir = "../data/exam/raw_2"

# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs(output_dir, exist_ok=True)

total_batches = (len(df) + batch_size - 1) // batch_size

for i in range(0, len(df), batch_size):
    batch_number = i // batch_size + 1
    print(f"Processing batch {batch_number} of {total_batches}")
    batch_df = df.iloc[i:i+batch_size]

    success = False
    retry_count = 0

    # The model is only ever called inside this loop, so each batch is sent
    # once per attempt and every API error goes through the retry logic.
    while not success and retry_count < max_retries:
        # Count every attempt so retries also count towards the rate limit
        request_count += 1
        requests_since_pause += 1

        try:
            batch_results = process_sentiment_batch(batch_df)
        except Exception as e:
            print(f"Error processing batch: {str(e)}. Retrying...")
            retry_count += 1
            time.sleep(5)  # Slightly longer delay for API errors
            continue

        # Save the raw response; the file is named after the batch's start index
        try:
            with open(f"{output_dir}/run_{i}.json", "w") as file:
                file.write(batch_results)
        except Exception as e:
            print(f"Error saving results: {str(e)}. Retrying...")
            retry_count += 1
            time.sleep(2)  # Short delay before retry
            continue

        success = True
        print(f"Successfully processed and saved batch {batch_number}")

    if not success:
        print(f"Failed to process batch starting at index {i} after {max_retries} attempts. Skipping.")

    # Pause once enough requests have been made, unless this was the last batch
    if requests_since_pause >= requests_per_pause and i + batch_size < len(df):
        print(f"Completed {request_count} requests. Taking a 1-minute break to avoid rate limiting...")
        time.sleep(60)  # Sleep for 60 seconds (1 minute)
        requests_since_pause = 0
        print("Resuming processing...")

Processing batch 1 of 22
Successfully processed and saved batch 1
Processing batch 2 of 22
Successfully processed and saved batch 2
Processing batch 3 of 22
Successfully processed and saved batch 3
Processing batch 4 of 22
Successfully processed and saved batch 4
Processing batch 5 of 22
Successfully processed and saved batch 5
Completed 5 requests. Taking a 1-minute break to avoid rate limiting...
Resuming processing...
Processing batch 6 of 22
Successfully processed and saved batch 6
Processing batch 7 of 22
Successfully processed and saved batch 7
Processing batch 8 of 22
Successfully processed and saved batch 8
Processing batch 9 of 22
Successfully processed and saved batch 9
Processing batch 10 of 22
Successfully processed and saved batch 10
Completed 10 requests. Taking a 1-minute break to avoid rate limiting...
Resuming processing...
Processing batch 11 of 22
Successfully processed and saved batch 11
Processing batch 12 of 22
Successfully processed and saved batch 12
Processing 

In [8]:
import glob

# Directory holding the raw JSON responses saved by the batch loop
json_dir = "../data/exam/raw_2"

# Sorted so files are read in a reproducible order (glob order is arbitrary)
json_files = sorted(glob.glob(os.path.join(json_dir, "*.json")))

# One {"CustomerID", "llm_sentiment"} record per rating found in the files
all_ratings = []

for file_path in json_files:
    # Read the file; an unreadable file is reported and skipped
    try:
        with open(file_path, 'r') as file:
            content = file.read()
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        continue

    # Parse the JSON; a malformed file is reported and skipped
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON in file {file_path}: {e}")
        continue

    # Handle both list and single object formats
    ratings = data if isinstance(data, list) else [data]

    for rating in ratings:
        # Skip just the bad entry instead of abandoning the rest of the file
        if not isinstance(rating, dict):
            print(f"Skipping unexpected entry in file {file_path}: {rating!r}")
            continue

        # The model may echo the ID under either key
        customer_id = rating.get("id", rating.get("CustomerID"))

        # Accept the different field names the model may use for the rating
        sentiment = rating.get("sentiment",
                               rating.get("sentiment_level",
                                          rating.get("rating", None)))

        # Add to our collection if an ID was returned
        if customer_id:
            all_ratings.append({
                "CustomerID": customer_id,
                "llm_sentiment": sentiment,
            })

# Fixing the columns keeps the frame usable even when nothing was collected
ratings_df = pd.DataFrame(all_ratings, columns=["CustomerID", "llm_sentiment"])

# Ratings that are not numeric (e.g. free text) become NaN and count as missing
ratings_df["llm_sentiment"] = pd.to_numeric(ratings_df["llm_sentiment"], errors="coerce")

# Print summary statistics
# NOTE(review): the messages say "patient" although the rows are customers;
# wording kept as-is so the cell still matches its recorded output.
print(f"Successfully processed {len(ratings_df)} patient ratings")
print(f"Number of unique patients: {ratings_df['CustomerID'].nunique()}")

# A CustomerID rated more than once (e.g. by leftover files from an earlier run
# with a different batch size) would multiply rows in the merge below.
duplicate_count = int(ratings_df.duplicated(subset="CustomerID").sum())
if duplicate_count:
    print(f"Dropping {duplicate_count} duplicate ratings (keeping the last one read per CustomerID)")
    ratings_df = ratings_df.drop_duplicates(subset="CustomerID", keep="last").reset_index(drop=True)

# Display the first few rows of the ratings dataframe
display(ratings_df.head())

# Left merge keeps every row of df; validate guards against row multiplication
df_with_ratings = df.merge(ratings_df, on="CustomerID", how="left", validate="many_to_one")

# Check for any rows that ended up without a rating
missing_ratings = int(df_with_ratings['llm_sentiment'].isna().sum())
print(f"Patients without ratings: {missing_ratings} out of {len(df_with_ratings)}")

# Display the first few rows of the merged dataframe
display(df_with_ratings.head())

Successfully processed 880 patient ratings
Number of unique patients: 880


Unnamed: 0,CustomerID,llm_sentiment
0,4efed90,5
1,d7f26e8,4
2,6b4a427,5
3,68eec52,2
4,3a2af82,4


Patients without ratings: 0 out of 880


Unnamed: 0,CustomerID,Age,Gender,Location,MembershipLevel,TotalPurchases,TotalSpent,FavoriteCategory,LastPurchaseDate,WebsiteClickRate,TimeSpentOnSite,SocialMediaEngagement,AdClickHistory,GeneratedReview,CustomerSentimentScore,PersonaTag,Churn,llm_sentiment
0,4efed90,Female,Denver,CO,Silver,12,753.6,Clothing,2023-10-20,0.065,15.2,Medium,Clicked,"Great experience, love this store!",0.91,Regular Buyer,0,5
1,d7f26e8,Male,Los Angeles,CA,Gold,28,2155.4,Electronics,2023-10-25,0.092,22.5,High,Sometimes,Very happy with my purchases.,0.95,Loyal,0,4
2,6b4a427,Other,Chicago,IL,Platinum,41,4510.1,Home Goods,2023-09-18,0.115,28.1,High,Clicked,Excellent service and products.,0.98,Loyal,0,5
3,68eec52,Male,Houston,TX,Bronze,2,85.7,Books,2023-08-01,0.041,8.9,Low,Not Clicked,"Had some issues, not fully satisfied.",0.52,Window Shopper,1,2
4,3a2af82,Female,Phoenix,AZ,Silver,18,1220.5,Beauty,2023-10-10,0.078,18.7,Medium,Clicked,Will definitely buy again.,0.88,Engaged,0,4


In [9]:
# Write the enriched frame to CSV (index=False leaves the row index out), then confirm
df_with_ratings.to_csv(
    path_or_buf='../data/exam/processed/merged_data_with_ratings.csv',
    index=False,
)
print("DataFrame with ratings saved to '../data/exam/processed/merged_data_with_ratings.csv'")

DataFrame with ratings saved to '../data/exam/processed/merged_data_with_ratings.csv'
