## Dataset Preparation

In [1]:
import pandas as pd
import os
from pathlib import Path
import sys

sys.path.insert(0, str(Path('./').parent.parent))

# Đọc cả 2 file CSV
csv_male = './prompt_male.csv'
csv_female = './prompt_female.csv'

df_male = pd.read_csv(csv_male)
df_female = pd.read_csv(csv_female)

# Thêm cột gender để phân biệt
df_male['gender'] = 'male'
df_female['gender'] = 'female'

# Join 2 dataframes
df = pd.concat([df_male, df_female], ignore_index=True)

print(f"Số dòng male: {len(df_male)}")
print(f"Số dòng female: {len(df_female)}")
print(f"Tổng số dòng sau khi join: {len(df)}")
print(f"\nCác cột: {df.columns.tolist()}")

# Hiển thị một vài dòng
df.head()

Số dòng male: 99
Số dòng female: 100
Tổng số dòng sau khi join: 199

Các cột: ['Images', 'JSON', 'Prompt_text', 'gender']


Unnamed: 0,Images,JSON,Prompt_text,gender
0,1.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male
1,2.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male
2,3.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male
3,4.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male
4,5.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male


In [2]:
# Cập nhật hàm find_image_path để tìm theo gender
dataset_path = '/home/hungnq/hungnq/sd_stuff/datahub/dataset_200'

def find_image_path(row):
    """
    Tìm đường dẫn đầy đủ của ảnh dựa trên gender và filename
    """
    filename = row['Images']
    gender = row['gender']
    
    # Tìm trong folder tương ứng với gender
    image_path = os.path.join(dataset_path, gender, filename)
    
    if os.path.exists(image_path):
        return image_path
    else:
        return None


In [3]:
# Áp dụng hàm
df['image_path'] = df.apply(find_image_path, axis=1)

# Kiểm tra kết quả
missing_images = df[df['image_path'].isna()]
if not missing_images.empty:
    print(f"Cảnh báo: Có {len(missing_images)} ảnh không tìm thấy:")
    print(missing_images[['Images', 'gender']].head(20))

print(f"\nTổng số dòng: {len(df)}")
print(f"Số ảnh tìm thấy: {df['image_path'].notna().sum()}")
print(f"Số ảnh không tìm thấy: {df['image_path'].isna().sum()}")

# Thống kê theo gender
print("\nThống kê theo gender:")
print(df.groupby('gender')['image_path'].apply(lambda x: x.notna().sum()))

# Hiển thị vài dòng
df.head(10)


Tổng số dòng: 199
Số ảnh tìm thấy: 199
Số ảnh không tìm thấy: 0

Thống kê theo gender:
gender
female    100
male       99
Name: image_path, dtype: int64


Unnamed: 0,Images,JSON,Prompt_text,gender,image_path
0,1.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
1,2.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
2,3.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
3,4.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
4,5.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
5,6.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
6,7.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
7,8.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
8,9.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
9,10.jpg,"[\n{\n""gender"": ""male"",\n""age"": ""6–8"",\n""face_...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...


## Create variations of prompts for images in dataset_200

In [4]:
df.columns

Index(['Images', 'JSON', 'Prompt_text', 'gender', 'image_path'], dtype='object')

In [5]:
df.head()

Unnamed: 0,Images,JSON,Prompt_text,gender,image_path
0,1.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
1,2.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
2,3.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
3,4.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...
4,5.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...


### Step 1: Collect all unique feature values from JSON column

In [6]:
import json
import copy
from collections import defaultdict
import ast

# Collect all unique values for each feature
feature_pool = defaultdict(set)

def safe_parse_json(json_str):
    """Safely parse JSON string with multiple fallback methods"""
    try:
        # Try standard JSON parsing
        parsed = json.loads(json_str)
        return parsed[0] if isinstance(parsed, list) else parsed
    except:
        try:
            # Try ast.literal_eval for Python-style strings
            parsed = ast.literal_eval(json_str)
            return parsed[0] if isinstance(parsed, list) else parsed
        except:
            return None

# Parse all JSON entries and collect unique values
print("Parsing JSON entries...")
parsed_jsons = []
for idx, json_str in enumerate(df['JSON']):
    parsed = safe_parse_json(json_str)
    if parsed:
        parsed_jsons.append(parsed)
        # Collect unique values for each feature
        for key, value in parsed.items():
            if value:  # Only add non-empty values
                feature_pool[key].add(value)

print(f"Successfully parsed {len(parsed_jsons)} JSON entries")
print(f"\nFeatures found: {list(feature_pool.keys())}")
print(f"\nNumber of unique values per feature:")
for feature, values in sorted(feature_pool.items()):
    print(f"  {feature}: {len(values)} unique values")

# Convert sets to lists for easier sampling
for key in feature_pool:
    feature_pool[key] = list(feature_pool[key])

print("\n✓ Feature pool collected successfully")

Parsing JSON entries...
Successfully parsed 199 JSON entries

Features found: ['gender', 'age', 'face_shape', 'skin_tone', 'skin_texture', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'cheek', 'chin', 'forehead', 'ear', 'jawline', 'face_proportion', 'hair', 'expression', 'angle', 'lighting']

Number of unique values per feature:
  age: 10 unique values
  angle: 100 unique values
  cheek: 92 unique values
  chin: 40 unique values
  ear: 86 unique values
  expression: 155 unique values
  eye: 107 unique values
  eyebrow: 78 unique values
  face_proportion: 110 unique values
  face_shape: 102 unique values
  forehead: 88 unique values
  gender: 2 unique values
  hair: 155 unique values
  jawline: 56 unique values
  lighting: 152 unique values
  lip: 57 unique values
  mouth: 141 unique values
  nose: 65 unique values
  skin_texture: 70 unique values
  skin_tone: 43 unique values

✓ Feature pool collected successfully


### Step 2: Create variation generator with semantic difference check

In [7]:
import hashlib
import random
from difflib import SequenceMatcher

class VariationGenerator:
    """Generate unique variations with significant semantic differences"""
    
    def __init__(self, feature_pool):
        self.feature_pool = feature_pool
        # All features that can be modified - PRIORITIZE facial features
        self.facial_features = [
            'expression', 'mouth', 'eye', 'eyebrow',  # Expression (4)
            'face_shape', 'nose', 'lip', 'cheek', 'chin', 'forehead',  # Facial structure (6)
            'ear', 'jawline', 'face_proportion',  # Additional facial (3)
            'hair',  # Hair style (1)
        ]
        self.non_facial_features = [
            'angle', 'lighting',  # Pose/lighting (2)
            'skin_tone', 'skin_texture',  # Skin (2) - de-prioritized
            'age'  # Age (1)
        ]
        self.all_features = self.facial_features + self.non_facial_features
        
    def _hash_variation(self, obj):
        """Create hash to check for duplicates"""
        sorted_items = sorted(obj.items())
        return hashlib.md5(str(sorted_items).encode()).hexdigest()
    
    def _calculate_similarity(self, text1, text2):
        """Calculate similarity ratio between two texts (0-1, where 1 is identical)"""
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
    
    def _is_significantly_different(self, text1, text2, min_difference=0.3):
        """Check if two texts are significantly different"""
        similarity = self._calculate_similarity(text1, text2)
        return similarity < (1 - min_difference)
    
    def _get_alternative_value(self, feature, current_value, exclude_values, seed=None, min_difference=0.3):
        """
        Get alternative value for a feature that is significantly different
        min_difference: minimum required difference ratio (0.3 = 30% different)
        """
        available = [v for v in self.feature_pool.get(feature, []) 
                     if v != current_value and v not in exclude_values]
        
        if not available:
            return current_value
        
        # Filter for significantly different values
        significantly_different = [
            v for v in available 
            if self._is_significantly_different(str(v), str(current_value), min_difference)
        ]
        
        # If we have significantly different options, use them
        if significantly_different:
            available = significantly_different
        
        # Select based on seed or hash
        if seed is not None:
            idx = (hash(current_value) + seed) % len(available)
        else:
            idx = hash(current_value) % len(available)
        
        return available[idx]
    
    def generate_variations(self, base_obj, num_variations=4, min_changes=9, min_difference=0.3):
        """
        Generate exactly num_variations unique variations
        Ensures at least min_changes features are modified per variation
        FOCUS ON FACIAL FEATURES, avoid changing lighting and skin_tone
        min_difference: minimum semantic difference required (0.3 = 30%)
        """
        variations = []
        seen_hashes = {self._hash_variation(base_obj)}
        
        # Get available features from base object - PRIORITIZE FACIAL FEATURES
        available_facial = [f for f in self.facial_features if f in base_obj]
        available_non_facial = [f for f in self.non_facial_features if f in base_obj]
        
        if len(available_facial) < min_changes:
            print(f"Warning: Only {len(available_facial)} facial features available, less than minimum {min_changes}")
            min_changes = max(1, len(available_facial) // 2)
        
        # Define 4 different strategies - ALL FOCUS ON FACIAL FEATURES
        strategies = [
            {
                'name': 'Expression & Eyes Focus',
                'priority_features': ['expression', 'eye', 'eyebrow', 'mouth', 'lip', 'cheek', 'forehead', 'face_shape', 'nose'],
                'avoid_features': ['lighting', 'skin_tone'],  # Avoid these
                'seed': 42,
                'min_diff': 0.4
            },
            {
                'name': 'Facial Structure Focus',
                'priority_features': ['face_shape', 'nose', 'chin', 'jawline', 'forehead', 'cheek', 'face_proportion', 'eyebrow', 'ear'],
                'avoid_features': ['lighting', 'skin_tone'],
                'seed': 123,
                'min_diff': 0.35
            },
            {
                'name': 'Features & Hair Focus',
                'priority_features': ['hair', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'face_shape', 'expression', 'cheek'],
                'avoid_features': ['lighting', 'skin_tone'],
                'seed': 456,
                'min_diff': 0.3
            },
            {
                'name': 'Balanced Facial Focus',
                'priority_features': ['expression', 'mouth', 'eye', 'hair', 'nose', 'face_shape', 'eyebrow', 'cheek', 'chin', 'lip'],
                'avoid_features': ['lighting', 'skin_tone'],
                'seed': 789,
                'min_diff': 0.35
            },
        ]
        
        for strategy_idx, strategy in enumerate(strategies[:num_variations]):
            new_obj = copy.deepcopy(base_obj)
            used_values = set()
            changed_features = []
            similar_changes = []
            
            # Filter priority features to exclude avoided ones
            priority_features = [
                f for f in strategy['priority_features'] 
                if f in base_obj and f not in strategy['avoid_features']
            ]
            
            # Ensure we have enough features to change (prefer facial features)
            features_to_change = priority_features[:min_changes]
            
            # If not enough, add more facial features first
            if len(features_to_change) < min_changes:
                remaining_facial = [
                    f for f in available_facial 
                    if f not in features_to_change and f not in strategy['avoid_features']
                ]
                random.seed(strategy['seed'] + strategy_idx)
                additional = random.sample(
                    remaining_facial, 
                    min(min_changes - len(features_to_change), len(remaining_facial))
                )
                features_to_change.extend(additional)
            
            # Only add non-facial features if absolutely necessary (excluding avoided features)
            if len(features_to_change) < min_changes:
                remaining_non_facial = [
                    f for f in available_non_facial 
                    if f not in features_to_change and f not in strategy['avoid_features']
                ]
                random.seed(strategy['seed'] + strategy_idx + 100)
                additional = random.sample(
                    remaining_non_facial,
                    min(min_changes - len(features_to_change), len(remaining_non_facial))
                )
                features_to_change.extend(additional)
            
            # Apply changes to selected features
            for feature in features_to_change:
                current_val = new_obj[feature]
                new_val = self._get_alternative_value(
                    feature, current_val, used_values, 
                    seed=strategy['seed'] + strategy_idx,
                    min_difference=strategy['min_diff']
                )
                
                # Check if change is significant
                similarity = self._calculate_similarity(str(current_val), str(new_val))
                
                if new_val != current_val:
                    new_obj[feature] = new_val
                    used_values.add(new_val)
                    changed_features.append(feature)
                    
                    if similarity > (1 - strategy['min_diff']):
                        similar_changes.append((feature, similarity))
            
            # If we have too many similar changes, try to fix them (avoid lighting/skin_tone)
            if len(similar_changes) > 2:
                print(f"  Warning: {len(similar_changes)} changes were too similar, attempting to improve...")
                for feature, _ in similar_changes[:3]:
                    if feature not in strategy['avoid_features']:
                        current_val = new_obj[feature]
                        new_val = self._get_alternative_value(
                            feature, current_val, used_values, 
                            seed=strategy['seed'] + strategy_idx + 3000,
                            min_difference=0.5
                        )
                        if new_val != current_val and self._is_significantly_different(str(current_val), str(new_val), 0.4):
                            new_obj[feature] = new_val
            
            # Verify minimum changes achieved (prioritize facial features)
            actual_changes = len([k for k in new_obj if new_obj[k] != base_obj.get(k)])
            if actual_changes < min_changes:
                unchanged_facial = [
                    f for f in available_facial 
                    if f not in changed_features and f not in strategy['avoid_features']
                ]
                for feature in unchanged_facial[:min_changes - actual_changes]:
                    current_val = new_obj[feature]
                    new_val = self._get_alternative_value(
                        feature, current_val, used_values, 
                        seed=strategy['seed'] + strategy_idx + 1000,
                        min_difference=0.4
                    )
                    if new_val != current_val:
                        new_obj[feature] = new_val
                        used_values.add(new_val)
                        changed_features.append(feature)
            
            # Check for duplicates
            var_hash = self._hash_variation(new_obj)
            if var_hash not in seen_hashes:
                variations.append(new_obj)
                seen_hashes.add(var_hash)
                actual_changes = len([k for k in new_obj if new_obj[k] != base_obj.get(k)])
                facial_changes = len([k for k in changed_features if k in self.facial_features])
                print(f"  {strategy['name']}: Changed {actual_changes} features ({facial_changes} facial)")
                if similar_changes:
                    print(f"    (Note: {len(similar_changes)} had lower semantic difference)")
            else:
                # Fallback: force change in additional FACIAL features
                print(f"  Duplicate detected for {strategy['name']}, applying fallback")
                fallback_features = [
                    f for f in available_facial 
                    if f not in changed_features and f not in strategy['avoid_features']
                ][:3]
                for feature in fallback_features:
                    new_obj[feature] = self._get_alternative_value(
                        feature, new_obj[feature], used_values, 
                        seed=strategy['seed'] + 2000,
                        min_difference=0.5
                    )
                var_hash = self._hash_variation(new_obj)
                if var_hash not in seen_hashes:
                    variations.append(new_obj)
                    seen_hashes.add(var_hash)
                    print(f"    Fallback successful")
                    break
        
        return variations

# Initialize generator
generator = VariationGenerator(feature_pool)
print("✓ Enhanced variation generator initialized - FOCUS ON FACIAL FEATURES")
print(f"  Facial features: {len(generator.facial_features)} ({', '.join(generator.facial_features[:8])}...)")
print(f"  Avoided features: lighting, skin_tone")
print(f"  Target: At least 9 feature changes per variation (prioritize facial)")
print(f"  Minimum semantic difference: 30-40%")

✓ Enhanced variation generator initialized - FOCUS ON FACIAL FEATURES
  Facial features: 14 (expression, mouth, eye, eyebrow, face_shape, nose, lip, cheek...)
  Avoided features: lighting, skin_tone
  Target: At least 9 feature changes per variation (prioritize facial)
  Minimum semantic difference: 30-40%


### Step 3: Test the generator with first entry

In [8]:
# Test with first entry
base_obj = parsed_jsons[0]

print("Base object:")
print(f"Total features: {len(base_obj)}")
print(f"Features: {list(base_obj.keys())}")

# Generate variations
print("\nGenerating variations with minimum 9 changes and semantic difference check...")
variations = generator.generate_variations(base_obj, num_variations=4, min_changes=9, min_difference=0.3)

print(f"\n✓ Generated {len(variations)} unique variations")

# Display differences from base with similarity scores
for i, var in enumerate(variations, 1):
    print(f"\n{'='*80}")
    print(f"--- Variation {i} ---")
    differences = {k: v for k, v in var.items() if v != base_obj.get(k)}
    print(f"Changed features ({len(differences)}): {list(differences.keys())}")
    
    # Show first 3 changes in detail with similarity scores
    for idx, (key, value) in enumerate(list(differences.items())[:3]):
        base_val = base_obj.get(key, 'N/A')
        similarity = generator._calculate_similarity(str(base_val), str(value))
        difference_pct = (1 - similarity) * 100
        
        print(f"\n  {key} (Difference: {difference_pct:.1f}%):")
        print(f"    Base: {str(base_val)[:100]}{'...' if len(str(base_val)) > 100 else ''}")
        print(f"    New:  {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")
    
    if len(differences) > 3:
        print(f"\n  ... and {len(differences) - 3} more changes")

# Verify no duplicates and minimum changes
hashes = [generator._hash_variation(v) for v in variations]
print(f"\n{'='*80}")
print(f"✓ All variations unique: {len(hashes) == len(set(hashes))}")
print(f"✓ Minimum changes met: {all(len([k for k in v if v[k] != base_obj.get(k)]) >= 9 for v in variations)}")

# Statistics
change_counts = [len([k for k in v if v[k] != base_obj.get(k)]) for v in variations]
print(f"\nChange statistics:")
print(f"  Min changes: {min(change_counts)}")
print(f"  Max changes: {max(change_counts)}")
print(f"  Average changes: {sum(change_counts) / len(change_counts):.1f}")

# Calculate average semantic difference
print(f"\nSemantic difference analysis:")
for i, var in enumerate(variations, 1):
    differences = [(k, generator._calculate_similarity(str(base_obj.get(k)), str(v))) 
                   for k, v in var.items() if v != base_obj.get(k)]
    if differences:
        avg_similarity = sum(sim for _, sim in differences) / len(differences)
        avg_difference_pct = (1 - avg_similarity) * 100
        print(f"  Variation {i}: Average {avg_difference_pct:.1f}% different")

Base object:
Total features: 20
Features: ['gender', 'age', 'face_shape', 'skin_tone', 'skin_texture', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'cheek', 'chin', 'forehead', 'ear', 'jawline', 'face_proportion', 'hair', 'expression', 'angle', 'lighting']

Generating variations with minimum 9 changes and semantic difference check...
  Expression & Eyes Focus: Changed 9 features (9 facial)
  Facial Structure Focus: Changed 9 features (9 facial)
  Features & Hair Focus: Changed 9 features (9 facial)
    (Note: 1 had lower semantic difference)
  Balanced Facial Focus: Changed 9 features (9 facial)
    (Note: 1 had lower semantic difference)

✓ Generated 4 unique variations

--- Variation 1 ---
Changed features (9): ['face_shape', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'cheek', 'forehead', 'expression']

  face_shape (Difference: 45.5%):
    Base: rounded oval with soft contours and gentle jawline curvature
    New:  ovoid with youthful, soft contours and significant lower facial fullness

### Step 4: Apply to entire dataset

In [9]:
# Generate variations for all rows with enhanced generator
print("Generating variations for all rows (minimum 9 feature changes)...")
df['JSON_variations'] = None

total_changes = []
for idx, json_str in enumerate(df['JSON']):
    base_obj = safe_parse_json(json_str)
    if base_obj:
        variations = generator.generate_variations(base_obj, num_variations=4, min_changes=9, min_difference=0.3)
        df.at[idx, 'JSON_variations'] = json.dumps(variations, ensure_ascii=False)
        
        # Track statistics
        for var in variations:
            change_count = len([k for k in var if var[k] != base_obj.get(k)])
            total_changes.append(change_count)
    
    if (idx + 1) % 50 == 0:
        print(f"  Processed {idx + 1}/{len(df)} rows...")

print(f"\n✓ Generated variations for {df['JSON_variations'].notna().sum()} rows")
print(f"✗ Failed for {df['JSON_variations'].isna().sum()} rows")

if total_changes:
    print(f"\nOverall statistics:")
    print(f"  Total variations: {len(total_changes)}")
    print(f"  Min changes: {min(total_changes)}")
    print(f"  Max changes: {max(total_changes)}")
    print(f"  Average changes: {sum(total_changes) / len(total_changes):.1f}")
    print(f"  Variations with ≥9 changes: {sum(1 for c in total_changes if c >= 9)} ({sum(1 for c in total_changes if c >= 9) / len(total_changes) * 100:.1f}%)")

# Show sample
df[['Images', 'gender', 'JSON_variations']].head()

Generating variations for all rows (minimum 9 feature changes)...
  Expression & Eyes Focus: Changed 9 features (9 facial)
  Facial Structure Focus: Changed 9 features (9 facial)
  Features & Hair Focus: Changed 9 features (9 facial)
    (Note: 1 had lower semantic difference)
  Balanced Facial Focus: Changed 9 features (9 facial)
    (Note: 1 had lower semantic difference)
  Expression & Eyes Focus: Changed 9 features (9 facial)
  Facial Structure Focus: Changed 9 features (9 facial)
  Features & Hair Focus: Changed 9 features (9 facial)
  Balanced Facial Focus: Changed 9 features (9 facial)
  Expression & Eyes Focus: Changed 9 features (9 facial)
  Facial Structure Focus: Changed 9 features (9 facial)
  Features & Hair Focus: Changed 9 features (9 facial)
  Balanced Facial Focus: Changed 9 features (9 facial)
  Expression & Eyes Focus: Changed 9 features (9 facial)
  Facial Structure Focus: Changed 9 features (9 facial)
  Features & Hair Focus: Changed 9 features (9 facial)
  Balance

Unnamed: 0,Images,gender,JSON_variations
0,1.jpg,male,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."
1,2.jpg,male,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
2,3.jpg,male,"[{""gender"": ""male"", ""age"": ""8–10"", ""face_shape..."
3,4.jpg,male,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
4,5.jpg,male,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."


In [10]:
df.head()

Unnamed: 0,Images,JSON,Prompt_text,gender,image_path,JSON_variations
0,1.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."
1,2.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
2,3.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""8–10"", ""face_shape..."
3,4.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
4,5.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."


In [11]:
# Final verification
print("Final dataset statistics:")
print(f"Total rows: {len(df)}")
print(f"Rows with variations: {df['JSON_variations'].notna().sum()}")
print(f"Rows without variations: {df['JSON_variations'].isna().sum()}")
print(f"\nSuccess rate: {df['JSON_variations'].notna().sum() / len(df) * 100:.2f}%")

# Sample check
print("\n\nSample of successful variations:")
sample_idx = df[df['JSON_variations'].notna()].index[0]
sample_vars = json.loads(df.loc[sample_idx, 'JSON_variations'])
print(f"\nImage: {df.loc[sample_idx, 'Images']}")
print(f"Number of variations: {len(sample_vars)}")
print(f"First variation keys: {list(sample_vars[0].keys())}")

Final dataset statistics:
Total rows: 199
Rows with variations: 199
Rows without variations: 0

Success rate: 100.00%


Sample of successful variations:

Image: 1.jpg
Number of variations: 4
First variation keys: ['gender', 'age', 'face_shape', 'skin_tone', 'skin_texture', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'cheek', 'chin', 'forehead', 'ear', 'jawline', 'face_proportion', 'hair', 'expression', 'angle', 'lighting']


In [12]:
df

Unnamed: 0,Images,JSON,Prompt_text,gender,image_path,JSON_variations
0,1.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."
1,2.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
2,3.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""8–1...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""8–10"", ""face_shape..."
3,4.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""6–8...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""..."
4,5.jpg,"[\n {\n ""gender"": ""male"",\n ""age"": ""7–9...",A highly realistic portrait photograph of a cu...,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""..."
...,...,...,...,...,...,...
194,unsplash_92.jpg,"[\n {\n ""gender"": ""female"",\n ""age"": ""7...","A lively, wholesome portrait of a young Japane...",female,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""female"", ""age"": ""7"", ""face_shape""..."
195,unsplash_93.jpg,"[\n {\n ""gender"": ""female"",\n ""age"": ""6...","A gentle, wholesome portrait of a young Japane...",female,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""female"", ""age"": ""6"", ""face_shape""..."
196,unsplash_94.jpg,"[\n {\n ""gender"": ""female"",\n ""age"": ""6...","A bright, wholesome portrait of a young Japane...",female,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""female"", ""age"": ""6"", ""face_shape""..."
197,unsplash_101.jpg,"[\n {\n ""gender"": ""female"",\n ""age"": ""6...","A soft, wholesome portrait of a young Japanese...",female,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""female"", ""age"": ""6"", ""face_shape""..."


In [13]:
# Failed rows check
failed_rows = df[df['JSON_variations'].isna()]
if not failed_rows.empty:
    print(f"\n\nSample of failed rows (no variations generated):")
    print(failed_rows[['Images', 'gender', 'JSON']].head())

### Step 6: Create Prompt Text for Each Variation

In [14]:
# Renumber IDs in JSON_variations to create 1000 unique IDs
# Formula: new_id = base_row_index * 4 + variation_index (1-4)
# This gives us: 1-4, 5-8, 9-12, ... up to 997-1000

print("Renumbering IDs in JSON variations...")
print(f"Total base images: {len(df)}")
print(f"Expected total prompts: {len(df) * 4}")

renumbered_count = 0
id_list = []

for idx in df.index:
    if pd.notna(df.loc[idx, 'JSON_variations']):
        try:
            variations = json.loads(df.loc[idx, 'JSON_variations'])
            
            # Renumber each variation
            for var_idx, var_obj in enumerate(variations, 1):
                new_id = idx * 4 + var_idx
                var_obj['id'] = new_id
                id_list.append(new_id)
            
            # Save back to dataframe
            df.at[idx, 'JSON_variations'] = json.dumps(variations, ensure_ascii=False)
            renumbered_count += 1
            
        except Exception as e:
            print(f"Error at index {idx}: {str(e)}")

print(f"\n✓ Renumbered {renumbered_count} rows")
print(f"✓ Total unique IDs created: {len(id_list)}")
print(f"✓ ID range: {min(id_list)} to {max(id_list)}")
print(f"✓ All IDs unique: {len(id_list) == len(set(id_list))}")

# Show sample
sample_variations = json.loads(df.loc[0, 'JSON_variations'])
print(f"\nSample IDs from first row:")
for i, var in enumerate(sample_variations, 1):
    print(f"  Variation {i}: ID = {var.get('id', 'N/A')}")

Renumbering IDs in JSON variations...
Total base images: 199
Expected total prompts: 796

✓ Renumbered 199 rows
✓ Total unique IDs created: 796
✓ ID range: 1 to 796
✓ All IDs unique: True

Sample IDs from first row:
  Variation 1: ID = 1
  Variation 2: ID = 2
  Variation 3: ID = 3
  Variation 4: ID = 4


### Step 7: Setup Gemini API for Prompt Generation

In [15]:
# Configure Gemini API
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")  # Replace with your actual API key

In [16]:
from google import genai
from google.genai import types
import time
from typing import Dict, Optional
import os



# Initialize Gemini model
client = genai.Client(api_key=GEMINI_API_KEY)

def generate_prompt_with_gemini(features_json: Dict, gender: str, retry_count: int = 3) -> Optional[str]:
    """
    Generate prompt text using Gemini API from feature JSON
    
    Args:
        features_json: Dictionary containing facial features
        gender: 'male' or 'female'
        retry_count: Number of retries on failure
    
    Returns:
        Generated prompt text or None on failure
    """
    prompt_template = f"""Read the JSON feature of a Japanese's {gender} below, then generate a detailed text prompt for creating a high-quality portrait image of this person. Note that the focus should be on facial features, expression, and style. Use vivid and descriptive language suitable for generating realistic images.

Features JSON:
{json.dumps(features_json, ensure_ascii=False, indent=2)}

Generate a single detailed prompt text describing this person. Return ONLY a JSON object with this exact format:
{{"prompt": "your detailed prompt text here"}}"""
    
    for attempt in range(retry_count):
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=prompt_template,
                config={
                    "response_mime_type": "application/json",
                },
            )
            
            # Extract text from response
            # print(response)
            if response and response.text:
                response_text = response.text.strip()
                response_json = json.loads(response_text)
                
                # Handle different response formats
                if isinstance(response_json, dict) and 'prompt' in response_json:
                    return response_json['prompt']
                elif isinstance(response_json, list) and len(response_json) > 0:
                    if isinstance(response_json[0], dict) and 'prompt' in response_json[0]:
                        return response_json[0]['prompt']
                    
                print(f"  Unexpected response format: {type(response_json)}")
                return None
                
        except json.JSONDecodeError as e:
            print(f"  JSON decode error on attempt {attempt + 1}: {str(e)}")
            print(f"  Response text: {response.text[:200] if response and response.text else 'None'}")
        except Exception as e:
            print(f"  Error on attempt {attempt + 1}: {str(e)}")
            
        if attempt < retry_count - 1:
            time.sleep(2 ** attempt)  # Exponential backoff
    
    return None

print("✓ Gemini API configured successfully")
print(f"✓ Model: gemini-2.0-flash")
print(f"✓ Ready to generate prompts from JSON features")

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


✓ Gemini API configured successfully
✓ Model: gemini-2.0-flash
✓ Ready to generate prompts from JSON features


In [17]:
# Test Gemini prompt generation with first variation
print("Testing Gemini API with first variation...")

test_json = parsed_jsons[0]
test_gender = df.loc[0, 'gender']

print(f"\nTest input:")
print(f"  Gender: {test_gender}")
print(f"  Features: {list(test_json.keys())}")
print(f"\nGenerating prompt...")

test_prompt = generate_prompt_with_gemini(test_json, test_gender)

if test_prompt:
    print(f"\n✓ Successfully generated prompt:")
    print(f"{'='*80}")
    print(test_prompt[:500])
    if len(test_prompt) > 500:
        print(f"... (truncated, total length: {len(test_prompt)} chars)")
    print(f"{'='*80}")
else:
    print("\n✗ Failed to generate prompt")
    print("Please check your API key and internet connection")

Testing Gemini API with first variation...

Test input:
  Gender: male
  Features: ['gender', 'age', 'face_shape', 'skin_tone', 'skin_texture', 'eye', 'eyebrow', 'nose', 'mouth', 'lip', 'cheek', 'chin', 'forehead', 'ear', 'jawline', 'face_proportion', 'hair', 'expression', 'angle', 'lighting']

Generating prompt...

✓ Successfully generated prompt:
A captivating portrait of a joyful Japanese boy, aged 7-9. He has a rounded oval face with soft contours and a gentle jawline. His skin is light with a warm, East Asian undertone, appearing smooth and flawless. His eyes are medium-sized, almond-shaped, with a slight upward tilt, and dark brown irises. He has single eyelids. A radiant, joyful wide smile illuminates his face, revealing his teeth. His lips are moderately full, with a slightly fuller lower lip. His cheeks are high volume and round, 
... (truncated, total length: 1184 chars)


In [18]:
test_prompt

'A captivating portrait of a joyful Japanese boy, aged 7-9. He has a rounded oval face with soft contours and a gentle jawline. His skin is light with a warm, East Asian undertone, appearing smooth and flawless. His eyes are medium-sized, almond-shaped, with a slight upward tilt, and dark brown irises. He has single eyelids. A radiant, joyful wide smile illuminates his face, revealing his teeth. His lips are moderately full, with a slightly fuller lower lip. His cheeks are high volume and round, giving him a chubby, adorable appearance. His nose is small with a low bridge and rounded tip. His eyebrows are medium thickness, straight to slightly arched, and symmetrical. His forehead is high and broad with a smooth curvature. He sports a classic black bowl cut with straight bangs. The lighting is natural and soft daylight, coming from above and front, evenly illuminating his face with minimal shadows. His expression is one of pure, unadulterated happiness, his eyes squinting slightly with

In [19]:
test_json

{'gender': 'male',
 'age': '7–9',
 'face_shape': 'rounded oval with soft contours and gentle jawline curvature',
 'skin_tone': 'light with warm undertone typical of East Asian complexion',
 'skin_texture': 'smooth and even surface with no visible blemishes or texture irregularities',
 'eye': 'medium-sized almond-shaped eyes with slight upward tilt at outer corners, moderately spaced, dark brown iris color, single eyelids without prominent crease, good left–right symmetry',
 'eyebrow': 'medium thickness with straight to slightly arched shape, gentle curvature, horizontally aligned, moderately spaced from eyes, symmetrical on left and right',
 'nose': 'small size with low bridge height, rounded tip, narrow to medium width appropriate for child proportions',
 'mouth': 'medium width with upward curvature due to smiling, open shape revealing teeth, symmetrical on left and right',
 'lip': 'both upper and lower lips moderately full with balanced proportion, lower lip slightly fuller',
 'cheek

In [28]:
def create_full_prompt_with_template(variation_prompt: str, reference_image: str) -> str:
    """Create the full prompt following the specified template"""
    
    template = f"""Preserve the light, background, expression of referrence image and change the face based on the following feature:
    ```
    {variation_prompt}
    ```
    """
    return template

print("✓ Template function defined")

✓ Template function defined


In [21]:
# Generate all prompts using Gemini API
print("Generating prompts for all variations using Gemini API...")
print(f"Total rows to process: {len(df)}")
print(f"Expected API calls: {len(df) * 4}")
print("\nThis may take several minutes...")

success_count = 0
failed_indices = []
total_api_calls = 0
failed_api_calls = 0

for idx in df.index:
    try:
        if pd.notna(df.loc[idx, 'JSON_variations']):
            variations = json.loads(df.loc[idx, 'JSON_variations'])
            gender = df.loc[idx, 'gender']
            ref_image = df.loc[idx, 'Images']
            
            # Generate prompt for each variation using Gemini
            for i, var_obj in enumerate(variations, 1):
                total_api_calls += 1
                
                # Generate prompt text with Gemini
                var_prompt = generate_prompt_with_gemini(var_obj, gender)
                
                if var_prompt:
                    # Wrap in template
                    # full_prompt = create_full_prompt_with_template(var_prompt, ref_image)
                    df.at[idx, f'variation_{i}_prompt'] = var_prompt
                else:
                    failed_api_calls += 1
                    print(f"  ✗ Failed to generate prompt for row {idx}, variation {i}")
                
                # Rate limiting: small delay between requests
                time.sleep(0.1)
            
            success_count += 1
        else:
            failed_indices.append(idx)
            
    except Exception as e:
        print(f"Error at index {idx}: {str(e)}")
        failed_indices.append(idx)
    
    # Progress update every 10 rows
    if (idx + 1) % 10 == 0:
        print(f"  Processed {idx + 1}/{len(df)} rows... ({total_api_calls} API calls, {failed_api_calls} failures)")

print(f"\n{'='*80}")
print(f"Prompt generation results:")
print(f"  Successfully processed rows: {success_count}")
print(f"  Failed rows: {len(failed_indices)}")
print(f"  Total API calls: {total_api_calls}")
print(f"  Failed API calls: {failed_api_calls}")
print(f"  Success rate: {((total_api_calls - failed_api_calls) / total_api_calls * 100):.1f}%")
 
if failed_indices:
    print(f"\nFailed indices: {failed_indices[:10]}{'...' if len(failed_indices) > 10 else ''}")

Generating prompts for all variations using Gemini API...
Total rows to process: 199
Expected API calls: 796

This may take several minutes...
  Processed 10/199 rows... (40 API calls, 0 failures)
  Processed 20/199 rows... (80 API calls, 0 failures)
  Processed 30/199 rows... (120 API calls, 0 failures)
  Processed 40/199 rows... (160 API calls, 0 failures)
  Processed 50/199 rows... (200 API calls, 0 failures)
  Processed 60/199 rows... (240 API calls, 0 failures)
  Processed 70/199 rows... (280 API calls, 0 failures)
  Processed 80/199 rows... (320 API calls, 0 failures)
  Processed 90/199 rows... (360 API calls, 0 failures)
  Processed 100/199 rows... (400 API calls, 0 failures)
  Processed 110/199 rows... (440 API calls, 0 failures)
  Processed 120/199 rows... (480 API calls, 0 failures)
  Processed 130/199 rows... (520 API calls, 0 failures)
  Processed 140/199 rows... (560 API calls, 0 failures)
  Processed 150/199 rows... (600 API calls, 0 failures)
  Processed 160/199 rows... 

In [24]:
df[['variation_1_prompt', 'variation_2_prompt', 'variation_3_prompt', 'variation_4_prompt']].head()

Unnamed: 0,variation_1_prompt,variation_2_prompt,variation_3_prompt,variation_4_prompt
0,"A hyperrealistic portrait of a Japanese boy, a...",A stunningly realistic portrait of a joyful Ja...,A hyperrealistic portrait of a 7-9 year old Ja...,"A hyperrealistic portrait of a Japanese boy, a..."
1,"A hyperrealistic portrait of a Japanese boy, a...","A captivating, ultra-realistic portrait of a J...","A breathtaking, hyperrealistic portrait of a j...","A hyperrealistic portrait of a Japanese boy, a..."
2,A hyperrealistic portrait of a cheerful Japane...,A hyperrealistic portrait of a charming Japane...,A hyperrealistic portrait of a joyful Japanese...,"A captivating, ultra-realistic portrait of a d..."
3,"A captivating, high-resolution portrait of a d...","A captivating, high-resolution portrait of a J...","A stunning, hyperrealistic portrait of a Japan...",A breathtakingly realistic portrait of a Japan...
4,"A hyperrealistic portrait of a Japanese boy, a...","A hyperrealistic portrait of a Japanese boy, 7...","A breathtaking, hyperrealistic portrait of a y...","A hyperrealistic portrait of a Japanese boy, a..."


In [26]:
# Verify generated prompts
print("Verifying generated prompts...")
print(f"\n{'='*80}")
print("Sample prompts from first row:")
print(f"{'='*80}")

for i in range(1, 5):
    prompt = df.loc[0, f'variation_{i}_prompt']
    if prompt:
        # split_point = prompt.find("```")
        # if split_point != -1:
            # variation_prompt = prompt[split_point + 4:prompt.rfind("```")].strip()
            variation_prompt = prompt.strip()
            print(f"\n--- Variation {i} (Generated by Gemini) ---")
            print(variation_prompt[:300])
            if len(variation_prompt) > 300:
                print(f"... (truncated, total: {len(variation_prompt)} chars)")
    #     else:
    #         print(f"\n--- Variation {i} ---")
    #         print("Warning: Template markers not found")
    # else:
    #     print(f"\n--- Variation {i} ---")
    #     print("No prompt generated")

# Check for missing prompts
missing_count = 0
for i in range(1, 5):
    col_name = f'variation_{i}_prompt'
    missing = df[col_name].isna().sum()
    if missing > 0:
        missing_count += missing
        print(f"\nWarning: {missing} missing prompts in {col_name}")

if missing_count == 0:
    print(f"\n{'='*80}")
    print("✓ All prompts generated successfully!")
else:
    print(f"\n{'='*80}")
    print(f"⚠ Total missing prompts: {missing_count}")

Verifying generated prompts...

Sample prompts from first row:

--- Variation 1 (Generated by Gemini) ---
A hyperrealistic portrait of a Japanese boy, aged 7-9. His face is an ovoid shape, soft and youthful, with full cheeks. He has light skin with warm undertones, perfectly smooth and even. His eyes are almond-shaped, tilted slightly upwards, with dark brown irises and narrow eyelid folds. Thin, faint 
... (truncated, total: 1487 chars)

--- Variation 2 (Generated by Gemini) ---
A stunningly realistic portrait of a joyful Japanese boy, aged 7-9. He has a soft, rounded oval face with a light, warm-toned East Asian complexion that is flawlessly smooth. His medium-sized, almond-shaped eyes, a deep, dark brown, tilt gently upwards, and he possesses single eyelids. His eyebrows 
... (truncated, total: 1431 chars)

--- Variation 3 (Generated by Gemini) ---
A hyperrealistic portrait of a 7-9 year old Japanese boy. He has a rounded oval face with soft contours, light skin with warm undertones

### Step 9: Create CSV with Base Prompt and 4 Variations

In [29]:
import re

def extract_variation_prompt_from_full(full_prompt):
    """Extract the variation prompt text from the full prompt template"""
    if pd.isna(full_prompt):
        return None
    
    # Find content between ``` markers
    match = re.search(r'```\s*\n(.*?)\n```', full_prompt, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None


# Create new dataframe with base prompt + 4 variations
print("Creating CSV with base Prompt_text + 4 variations...")

output_df = pd.DataFrame()
output_df['Images'] = df['Images']
output_df['gender'] = df['gender']
output_df['image_path'] = df['image_path']

# add json variation
output_df['JSON_variations'] = df['JSON_variations']


# Add base prompt WITHOUT template (just the text)
output_df['base_prompt'] = df['Prompt_text']

# Extract and add 4 variation prompts (full template)
for i in range(1, 5):
    col_name = f'variation_{i}_prompt'
    output_df[f'variation_{i}_full'] = df[col_name]

# Extract just the text part (between ```) for variations
for i in range(1, 5):
    col_name = f'variation_{i}_prompt'
    # output_df[f'variation_{i}_text'] = df[col_name].apply(extract_variation_prompt_from_full)
    output_df[f'variation_{i}_text'] = df[col_name]
    
    # add prompt template to full prompt
    output_df[f'variation_{i}_full'] = output_df[f'variation_{i}_text'].apply(
        lambda x: create_full_prompt_with_template(x, "") if pd.notna(x) else None
    )

# Display statistics
print(f"\n✓ Created dataframe with {len(output_df)} rows")
print(f"\nColumns: {output_df.columns.tolist()}")

# Check for missing data
for col in ['base_prompt', 'variation_1_full', 'variation_2_full', 'variation_3_full', 'variation_4_full']:
    missing = output_df[col].isna().sum()
    print(f"  {col}: {len(output_df) - missing}/{len(output_df)} valid ({missing} missing)")

output_df.head()

Creating CSV with base Prompt_text + 4 variations...

✓ Created dataframe with 199 rows

Columns: ['Images', 'gender', 'image_path', 'JSON_variations', 'base_prompt', 'variation_1_full', 'variation_2_full', 'variation_3_full', 'variation_4_full', 'variation_1_text', 'variation_2_text', 'variation_3_text', 'variation_4_text']
  base_prompt: 199/199 valid (0 missing)
  variation_1_full: 199/199 valid (0 missing)
  variation_2_full: 199/199 valid (0 missing)
  variation_3_full: 199/199 valid (0 missing)
  variation_4_full: 199/199 valid (0 missing)


Unnamed: 0,Images,gender,image_path,JSON_variations,base_prompt,variation_1_full,variation_2_full,variation_3_full,variation_4_full,variation_1_text,variation_2_text,variation_3_text,variation_4_text
0,1.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""...",A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","A hyperrealistic portrait of a Japanese boy, a...",A stunningly realistic portrait of a joyful Ja...,A hyperrealistic portrait of a 7-9 year old Ja...,"A hyperrealistic portrait of a Japanese boy, a..."
1,2.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""...",A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","A hyperrealistic portrait of a Japanese boy, a...","A captivating, ultra-realistic portrait of a J...","A breathtaking, hyperrealistic portrait of a j...","A hyperrealistic portrait of a Japanese boy, a..."
2,3.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""8–10"", ""face_shape...",A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...",A hyperrealistic portrait of a cheerful Japane...,A hyperrealistic portrait of a charming Japane...,A hyperrealistic portrait of a joyful Japanese...,"A captivating, ultra-realistic portrait of a d..."
3,4.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""6–8"", ""face_shape""...",A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","A captivating, high-resolution portrait of a d...","A captivating, high-resolution portrait of a J...","A stunning, hyperrealistic portrait of a Japan...",A breathtakingly realistic portrait of a Japan...
4,5.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,"[{""gender"": ""male"", ""age"": ""7–9"", ""face_shape""...",A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","A hyperrealistic portrait of a Japanese boy, a...","A hyperrealistic portrait of a Japanese boy, 7...","A breathtaking, hyperrealistic portrait of a y...","A hyperrealistic portrait of a Japanese boy, a..."


In [30]:
# Save the CSV with base + variations
output_file = './prompt_base_and_variations_3_full.csv'
output_df.to_csv(output_file, index=False)
print(f"✓ Saved CSV with base + 4 variations to: {output_file}")

# Also create a more detailed version with FULL prompts (with template)
# detailed_output_df = pd.DataFrame()
# detailed_output_df['Images'] = df['Images']
# detailed_output_df['gender'] = df['gender']
# detailed_output_df['image_path'] = df['image_path']

# # Base prompt with template (same format as variations)
# detailed_output_df['base_prompt'] = output_df['base_prompt']

# # All 4 variations with full template
# detailed_output_df['variation_1_prompt'] = df['variation_1_prompt']
# detailed_output_df['variation_2_prompt'] = df['variation_2_prompt']
# detailed_output_df['variation_3_prompt'] = df['variation_3_prompt']
# detailed_output_df['variation_4_prompt'] = df['variation_4_prompt']

# detailed_file = './prompt_base_and_variations_full_2.csv'
# detailed_output_df.to_csv(detailed_file, index=False)
# print(f"✓ Saved detailed CSV with full prompts to: {detailed_file}")

# print(f"\n{'='*80}")
# print(f"SUMMARY:")
# print(f"  Total rows: {len(output_df)}")
# print(f"  Prompts per row: 5 (1 base + 4 variations)")
# print(f"  Total prompts: {len(output_df) * 5}")
# print(f"  ALL prompts now have SAME template format (2 objects + reference image)")
# print(f"\nFiles created:")
# print(f"  1. {output_file} - Includes both full template and text-only versions")
# print(f"  2. {detailed_file} - Full prompts with template (ready for batch generation)")
# print(f"{'='*80}")

✓ Saved CSV with base + 4 variations to: ./prompt_base_and_variations_3_full.csv


In [27]:
# Verify the content - both base and variations should have same template
print("Verification - checking template format:")
print(f"\n{'='*80}")
print(f"BASE PROMPT (with template):")
print(f"{'='*80}")
print(output_df.loc[0, 'base_prompt'][:400])

print(f"\n{'='*80}")
print(f"VARIATION 1 (with template):")
print(f"{'='*80}")
print(output_df.loc[0, 'variation_1_full'][:400])

print(f"\n{'='*80}")
print("COMPARISON:")
print(f"{'='*80}")
print(f"Base prompt has template: {'Create a image with two object' in str(output_df.loc[0, 'base_prompt'])}")
print(f"Variation 1 has template: {'Create a image with two object' in str(output_df.loc[0, 'variation_1_full'])}")

# Check IDs are present in variations
print(f"\n{'='*80}")
print("Checking IDs in variations:")
for i in range(1, 5):
    var_text = output_df.loc[0, f'variation_{i}_text']
    id_match = re.search(r'ID:(\d+)', var_text) if var_text else None
    if id_match:
        print(f"  Variation {i}: ID = {id_match.group(1)}")
    else:
        print(f"  Variation {i}: No ID found")

Verification - checking template format:

BASE PROMPT (with template):
A highly realistic portrait photograph of a cute 7-9 year old Japanese boy, frontal view with slight upward head tilt, joyful wide smile showing teeth and squinted eyes expressing happiness. 

Face: rounded oval shape with soft contours and gentle jawline, childlike proportions with large eyes relative to face and short midface, high rounded cheeks with prominent chubby appearance, small rounded c

VARIATION 1 (with template):
Preserve the light, background, expression of referrence image and change the face based on the following feature:
    ```
    "{\"gender\": \"male\", \"age\": \"7–9\", \"face_shape\": \"softly rounded with a gentle taper toward the chin, typical of juvenile bone structure\", \"skin_tone\": \"light with warm undertone typical of East Asian complexion\", \"skin_texture\": \"smooth and even surface 

COMPARISON:
Base prompt has template: False
Variation 1 has template: False

Checking IDs in var

In [28]:
# Display structure of the final CSV
print("\n{'='*80}")
print("FINAL CSV STRUCTURE:")
print(f"{'='*80}")
print("\nColumns in prompt_base_and_variations.csv:")
for col in output_df.columns:
    print(f"  - {col}")

print("\n\nColumns in prompt_base_and_variations_full.csv:")
for col in detailed_output_df.columns:
    print(f"  - {col}")

print(f"\n{'='*80}")
print("SAMPLE ROW (first image):")
print(f"{'='*80}")
print(f"Image: {detailed_output_df.loc[0, 'Images']}")
print(f"Gender: {detailed_output_df.loc[0, 'gender']}")
print(f"\nBase prompt (first 200 chars):")
print(detailed_output_df.loc[0, 'base_prompt'][:200] + "...")
print(f"\nVariation 1 prompt (first 200 chars):")
print(detailed_output_df.loc[0, 'variation_1_prompt'][:200] + "...")
print(f"{'='*80}")


{'='*80}
FINAL CSV STRUCTURE:

Columns in prompt_base_and_variations.csv:
  - Images
  - gender
  - image_path
  - JSON_variations
  - base_prompt
  - variation_1_full
  - variation_2_full
  - variation_3_full
  - variation_4_full
  - variation_1_text
  - variation_2_text
  - variation_3_text
  - variation_4_text


Columns in prompt_base_and_variations_full.csv:
  - Images
  - gender
  - image_path
  - base_prompt
  - variation_1_prompt
  - variation_2_prompt
  - variation_3_prompt
  - variation_4_prompt

SAMPLE ROW (first image):
Image: 1.jpg
Gender: male

Base prompt (first 200 chars):
A highly realistic portrait photograph of a cute 7-9 year old Japanese boy, frontal view with slight upward head tilt, joyful wide smile showing teeth and squinted eyes expressing happiness. 

Face: r...

Variation 1 prompt (first 200 chars):
Preserve the light, background, expression of referrence image and change the face based on the following feature:
    ```
    "{\"gender\": \"male\", \"age\": \

## Continue Generating Missing Variation Prompts

In [14]:
# Load the existing CSV file
csv_file = './prompt_base_and_variations_full.csv'
df_prompts = pd.read_csv(csv_file)

print(f"Loaded CSV with {len(df_prompts)} rows")
print(f"\nColumns: {df_prompts.columns.tolist()}")
print(f"\nFirst few rows:")
df_prompts.head()

Loaded CSV with 199 rows

Columns: ['Images', 'gender', 'image_path', 'base_prompt', 'variation_1_prompt', 'variation_2_prompt', 'variation_3_prompt', 'variation_4_prompt']

First few rows:


Unnamed: 0,Images,gender,image_path,base_prompt,variation_1_prompt,variation_2_prompt,variation_3_prompt,variation_4_prompt
0,1.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ..."
1,2.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ..."
2,3.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...","Preserve the light, background, expression of ...",
3,4.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,A highly realistic portrait photograph of a cu...,,,,
4,5.jpg,male,/home/hungnq/hungnq/sd_stuff/datahub/dataset_2...,A highly realistic portrait photograph of a cu...,"Preserve the light, background, expression of ...","Preserve the light, background, expression of ...",,


In [15]:
# Check which variation prompts are missing
print("Checking for missing variation prompts...")
print(f"{'='*80}")

missing_stats = {}
for i in range(1, 5):
    col_name = f'variation_{i}_prompt'
    missing = df_prompts[col_name].isna().sum()
    missing_stats[col_name] = missing
    print(f"{col_name}: {missing} missing ({(missing/len(df_prompts)*100):.1f}%)")

total_missing = sum(missing_stats.values())
print(f"\n{'='*80}")
print(f"Total missing prompts: {total_missing}")
print(f"Total prompts to generate: {total_missing}")

if total_missing == 0:
    print("\n✓ All variation prompts are already complete!")
else:
    print(f"\n⚠ Need to generate {total_missing} prompts")
    
    # Show sample of rows with missing prompts
    for i in range(1, 5):
        col_name = f'variation_{i}_prompt'
        if missing_stats[col_name] > 0:
            missing_rows = df_prompts[df_prompts[col_name].isna()].index.tolist()
            print(f"\n{col_name} missing in rows: {missing_rows[:10]}{'...' if len(missing_rows) > 10 else ''}")

Checking for missing variation prompts...
variation_1_prompt: 195 missing (98.0%)
variation_2_prompt: 195 missing (98.0%)
variation_3_prompt: 196 missing (98.5%)
variation_4_prompt: 197 missing (99.0%)

Total missing prompts: 783
Total prompts to generate: 783

⚠ Need to generate 783 prompts

variation_1_prompt missing in rows: [3, 5, 6, 7, 8, 9, 10, 11, 12, 13]...

variation_2_prompt missing in rows: [3, 5, 6, 7, 8, 9, 10, 11, 12, 13]...

variation_3_prompt missing in rows: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]...

variation_4_prompt missing in rows: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]...


In [18]:
# Generate missing variation prompts using Gemini API
print("Starting to generate missing variation prompts...")
print(f"{'='*80}")

success_count = 0
failed_count = 0
api_calls = 0


for idx in df_prompts.index:
    # Check if any variation is missing
    needs_generation = False
    for i in range(1, 5):
        if pd.isna(df_prompts.loc[idx, f'variation_{i}_prompt']):
            needs_generation = True
            break
    
    if not needs_generation:
        continue
    
    try:
        # Get variations JSON
        if pd.notna(df_prompts.loc[idx, 'JSON_variations']):
            variations = json.loads(df_prompts.loc[idx, 'JSON_variations'])
            gender = df_prompts.loc[idx, 'gender']
            ref_image = df_prompts.loc[idx, 'Images']
            
            # Generate prompts for missing variations
            for i, var_obj in enumerate(variations, 1):
                col_name = f'variation_{i}_prompt'
                
                # Skip if already exists
                if pd.notna(df_prompts.loc[idx, col_name]):
                    continue
                
                api_calls += 1
                
                # Generate prompt with Gemini
                var_prompt = generate_prompt_with_gemini(var_obj, gender)
                
                if var_prompt:
                    # Wrap in template
                    full_prompt = create_full_prompt_with_template(var_prompt, ref_image)
                    df_prompts.at[idx, col_name] = full_prompt
                    success_count += 1
                    print(f"  ✓ Generated row {idx}, variation {i}")
                else:
                    failed_count += 1
                    print(f"  ✗ Failed row {idx}, variation {i}")
                
                # Rate limiting
                time.sleep(0.1)
        
    except Exception as e:
        print(f"  Error at row {idx}: {str(e)}")
        failed_count += 1
    
    # Progress update every 10 rows
    if (idx + 1) % 10 == 0:
        print(f"\nProgress: Processed {idx + 1}/{len(df_prompts)} rows")
        print(f"  API calls: {api_calls}, Success: {success_count}, Failed: {failed_count}")

print(f"\n{'='*80}")
print(f"Generation completed!")
print(f"  Total API calls: {api_calls}")
print(f"  Successful: {success_count}")
print(f"  Failed: {failed_count}")
print(f"  Success rate: {(success_count/api_calls*100 if api_calls > 0 else 0):.1f}%")

Starting to generate missing variation prompts...
  Error at row 2: 'JSON_variations'
  Error at row 3: 'JSON_variations'
  Error at row 4: 'JSON_variations'
  Error at row 5: 'JSON_variations'
  Error at row 6: 'JSON_variations'
  Error at row 7: 'JSON_variations'
  Error at row 8: 'JSON_variations'
  Error at row 9: 'JSON_variations'

Progress: Processed 10/199 rows
  API calls: 0, Success: 0, Failed: 8
  Error at row 10: 'JSON_variations'
  Error at row 11: 'JSON_variations'
  Error at row 12: 'JSON_variations'
  Error at row 13: 'JSON_variations'
  Error at row 14: 'JSON_variations'
  Error at row 15: 'JSON_variations'
  Error at row 16: 'JSON_variations'
  Error at row 17: 'JSON_variations'
  Error at row 18: 'JSON_variations'
  Error at row 19: 'JSON_variations'

Progress: Processed 20/199 rows
  API calls: 0, Success: 0, Failed: 18
  Error at row 20: 'JSON_variations'
  Error at row 21: 'JSON_variations'
  Error at row 22: 'JSON_variations'
  Error at row 23: 'JSON_variations'
 

In [None]:
# Verify completion status
print("Verifying completion status...")
print(f"{'='*80}")

for i in range(1, 5):
    col_name = f'variation_{i}_prompt'
    missing = df_prompts[col_name].isna().sum()
    complete = len(df_prompts) - missing
    print(f"{col_name}:")
    print(f"  Complete: {complete}/{len(df_prompts)} ({(complete/len(df_prompts)*100):.1f}%)")
    print(f"  Missing: {missing}")

total_missing = sum(df_prompts[f'variation_{i}_prompt'].isna().sum() for i in range(1, 5))
print(f"\n{'='*80}")
if total_missing == 0:
    print("✓ ALL variation prompts are now complete!")
else:
    print(f"⚠ Still {total_missing} prompts missing")
    print("You may need to re-run the generation cell for failed prompts")

In [None]:
# Save the updated CSV file
output_file = './prompt_base_and_variations_full.csv'

# Keep only the necessary columns (remove JSON_variations if it was merged)
columns_to_save = ['Images', 'gender', 'image_path', 'base_prompt', 
                   'variation_1_prompt', 'variation_2_prompt', 
                   'variation_3_prompt', 'variation_4_prompt']

df_to_save = df_prompts[columns_to_save]
df_to_save.to_csv(output_file, index=False)

print(f"✓ Saved updated CSV to: {output_file}")
print(f"\nFile details:")
print(f"  Total rows: {len(df_to_save)}")
print(f"  Total columns: {len(df_to_save.columns)}")
print(f"  File size: {os.path.getsize(output_file) / 1024:.2f} KB")

# Show summary
print(f"\n{'='*80}")
print("FINAL SUMMARY:")
print(f"{'='*80}")
for col in ['variation_1_prompt', 'variation_2_prompt', 'variation_3_prompt', 'variation_4_prompt']:
    complete = df_to_save[col].notna().sum()
    print(f"{col}: {complete}/{len(df_to_save)} complete")

print(f"\n✓ File saved successfully!")