# Image metadata scraping and CSV dataset creation

In [10]:
import os
import json
import csv
import requests
from datetime import datetime
import time
from pathlib import Path
import hashlib
import pandas as pd
import sys

## Image metadata retrieval

In [2]:
# Locate the project directory: the folder containing this file when run as a
# script, the current working directory when run from Jupyter.
try:
    script_path = Path(__file__)
except NameError:
    # __file__ is not defined here (the Jupyter case)
    current_dir = Path.cwd()
else:
    current_dir = script_path.resolve().parent

In [3]:
from datetime import datetime, timedelta

# ISO 8601 timestamp the scrape starts from (replace with the desired start point)
input_timestamp = "2024-09-30T06:25:02.809Z"

def iso_to_timestamp(iso_str):
    """Convert an ISO 8601 string to a Unix timestamp in milliseconds, shifted by +2 hours.

    ``iso_str`` must have the layout ``YYYY-MM-DDTHH:MM:SS.fffZ``; any other
    layout makes ``strptime`` raise ``ValueError``.

    NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
    it in the machine's local time zone, not in UTC. The fixed 2-hour shift
    presumably compensates for a UTC+2 machine -- confirm before running this
    in another time zone.
    """
    shifted = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%S.%fZ") + timedelta(hours=2)
    return int(shifted.timestamp() * 1000)

# Millisecond timestamp used to build the first API cursor
initial_cursor = iso_to_timestamp(input_timestamp)


In [4]:
max_images = 2000000000 # some huge number now redundant because API limit at 50'000

def get_image_metadata():
    """Page through the Civitai images API and save every page as a JSON file.

    Pages are requested from ``https://civitai.com/api/v1/images`` sorted by
    "Most Reactions", starting at the cursor ``0|<initial_cursor>``. Every
    page that contains items is written to
    ``<cwd>/data/raw/001/<sanitised input_timestamp>/most_recent_<n>.json``.

    The loop ends when
      * a page arrives without ``metadata.nextCursor`` (that page is still
        saved before stopping),
      * six consecutive responses contain no items, or
      * the API answers with a status other than 200 or 502.

    HTTP 502 responses, connection failures and timeouts are retried after
    ``retry_delay`` seconds without advancing the cursor.

    Reads the module-level names ``initial_cursor``, ``input_timestamp`` and
    ``max_images``. Returns ``None``.
    """
    base_url = "https://civitai.com/api/v1/images"
    headers = {
        "Accept": "application/json",
        "Authorization": "Bearer APITOKEN"  # Replace with your actual API token
    }
    params = {
        "sort": "Most Reactions",
        "nsfw": "Soft",
        "cursor": f"0|{initial_cursor}"
    }

    # Folder name derived from the input timestamp, e.g. "2024-09-30_062502_809"
    timestamp_label = input_timestamp.replace(':', '').replace('T', '_').replace('.', '_').replace('Z', '')
    base_directory_path = Path.cwd() / f"data/raw/001/{timestamp_label}"
    base_directory_path.mkdir(parents=True, exist_ok=True)
    sub_directory_path = base_directory_path
    file_counter = 0

    retry_delay = 300  # 300 seconds / 5 minutes
    request_timeout = 60  # seconds; without a timeout a stalled connection blocks forever
    retries_without_cursor = 0  # Consecutive responses that contained no items

    while True:
        try:
            response = requests.get(base_url, headers=headers, params=params, timeout=request_timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as error:
            # Transient network trouble: keep the current cursor and try again later
            print(f"Request failed ({error}). Retrying in {retry_delay // 60} minutes.")
            time.sleep(retry_delay)
            continue

        if response.status_code == 502:
            print(f"Received HTTP 502. Retrying in {retry_delay // 60} minutes.")
            time.sleep(retry_delay)  # Wait for 5 minutes before retrying
            continue
        if response.status_code != 200:
            print(f"Failed to fetch data: HTTP {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            print("No more data available.")
            retries_without_cursor += 1
            if retries_without_cursor > 5:  # Allow up to 5 retries before stopping
                print("Reached the end of the data after multiple retries.")
                break
            time.sleep(retry_delay)
            continue

        retries_without_cursor = 0  # Reset if we get data

        # Save the page BEFORE looking at the cursor, so the final page (the
        # one that comes without a nextCursor) is not lost.
        file_counter += 1
        if file_counter % max_images == 0:
            time_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            sub_directory_path = base_directory_path / f"{timestamp_label}_session_{time_stamp}"
            sub_directory_path.mkdir(parents=True, exist_ok=True)

        file_path = sub_directory_path / f'most_recent_{file_counter}.json'
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4)

        next_cursor = data.get('metadata', {}).get('nextCursor')
        if not next_cursor:
            print("No new cursor returned, stopping.")
            break

        # The returned cursor is only used as a "more data exists" signal; the
        # next request reuses the start timestamp and moves the offset by 50
        # (e.g., "0|1722470401000" -> "50|1722470401000").
        # NOTE(review): no "limit" parameter is sent, so this assumes the API
        # returns 50 items per page; with a different page size consecutive
        # pages would overlap or skip items -- confirm against the API docs.
        offset, start_timestamp = params['cursor'].split("|")
        params['cursor'] = f"{int(offset) + 50}|{start_timestamp}"


In [5]:
get_image_metadata()

KeyboardInterrupt: 

## json chronological data sorting

In [11]:
def organize_files(source_dir, target_dir, max_items_per_file=100):
    """Re-batch the ``items`` of every JSON file under ``source_dir`` into ``target_dir``.

    Every ``*.json`` file below ``source_dir`` (extension matched
    case-insensitively) is loaded and the entries of its ``items`` list are
    collected in a buffer. Each time the buffer holds ``max_items_per_file``
    items it is passed to ``write_items``; whatever is left at the end is
    written as a final, smaller batch. Directories whose path contains
    ``.ipynb_checkpoints`` are skipped.

    A file that is not valid JSON is reported and skipped. Any other error
    raised while processing a file is reported as well, and the remaining
    items of that file are skipped.

    Args:
        source_dir: Directory tree holding the raw JSON pages.
        target_dir: Root directory handed to ``write_items``.
        max_items_per_file: Number of items per written batch (at least 1).

    Returns:
        The number of batches for which ``write_items`` completed.

    Raises:
        ValueError: If ``max_items_per_file`` is smaller than 1.
    """
    if max_items_per_file < 1:
        raise ValueError("max_items_per_file must be at least 1")

    print(f"Starting to organize files from {source_dir} to {target_dir}")
    item_buffer = []
    file_count = 0

    # Walk through all files in the source directory
    for root, _dirs, files in os.walk(source_dir):
        if '.ipynb_checkpoints' in root:
            continue  # Skip .ipynb_checkpoints directories
        print(f"Checking directory: {root}")
        for filename in files:
            if not filename.lower().endswith('.json'):
                continue
            file_path = os.path.join(root, filename)
            try:
                # The raw pages are written as UTF-8, so read them the same way
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                for item in data.get('items', []):
                    item_buffer.append(item)
                    if len(item_buffer) >= max_items_per_file:
                        # Detach the batch before writing: if write_items
                        # raises, the full buffer must not stay behind and
                        # make every following file fail as well.
                        batch, item_buffer = item_buffer, []
                        write_items(batch, target_dir)
                        file_count += 1
            except json.JSONDecodeError:
                print(f"Error decoding JSON from file {file_path}")
            except Exception as e:
                print(f"An error occurred with file {file_path}: {e}")

    # Write any remaining items in the buffer
    if item_buffer:
        write_items(item_buffer, target_dir)
        file_count += 1

    return file_count

def write_items(items, target_dir):
    """Write one batch of items to a date-based folder below ``target_dir``.

    The folder and file name are derived from the ``createdAt`` value of the
    first item that has a parseable one (normally the first item)::

        <target_dir>/<YYYY>/<YYYY-MM>/<YYYY-MM-DD>/batch_<YYYYmmddTHHMMSS>.json

    If that file already exists with different content, ``_1``, ``_2``, ... is
    appended to the name so an earlier batch is never overwritten. If a file
    with identical content already exists, nothing is written, which keeps
    repeated runs over the same data from piling up duplicates.

    Args:
        items: List of item dicts forming the batch.
        target_dir: Root directory of the date-based folder tree.

    Returns:
        The path of the file holding the batch, or ``None`` when the batch is
        empty or none of its items carries a usable ``createdAt``.
    """
    date_obj = None
    for item in items:
        try:
            created_at = item.get('createdAt')
            if not created_at:
                continue
            # A trailing "Z" is stripped before parsing
            date_obj = datetime.fromisoformat(created_at.rstrip("Z"))
        except (AttributeError, ValueError):
            continue
        break

    if date_obj is None:
        if items:
            print(f"Skipping batch of {len(items)} items: no usable 'createdAt' value found")
        return None

    new_dir = os.path.join(
        target_dir,
        f"{date_obj.year}",
        f"{date_obj.year}-{date_obj.month:02d}",
        f"{date_obj.year}-{date_obj.month:02d}-{date_obj.day:02d}",
    )
    os.makedirs(new_dir, exist_ok=True)

    payload = json.dumps(items, indent=4)
    stem = f"batch_{date_obj.strftime('%Y%m%dT%H%M%S')}"
    new_file_path = os.path.join(new_dir, f"{stem}.json")
    suffix = 0
    while os.path.exists(new_file_path):
        with open(new_file_path, 'r', encoding='utf-8') as existing_file:
            if existing_file.read() == payload:
                return new_file_path  # This exact batch is already on disk
        suffix += 1
        new_file_path = os.path.join(new_dir, f"{stem}_{suffix}.json")

    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.write(payload)
    return new_file_path


In [12]:
# Re-batch the raw pages under data/raw/001 into the tree under
# data/image-metadata/json-sorted, 100 items per output file.
source_dir = current_dir / 'data/raw/001'
target_dir = current_dir / 'data/image-metadata/json-sorted'

organize_files(source_dir, target_dir, max_items_per_file=100)

Starting to organize files from /home/lauhp/000_PHD/000_005_COURSES/SD-Social/data/raw/001 to /home/lauhp/000_PHD/000_005_COURSES/SD-Social/data/image-metadata/json-sorted
Checking directory: /home/lauhp/000_PHD/000_005_COURSES/SD-Social/data/raw/001
Checking directory: /home/lauhp/000_PHD/000_005_COURSES/SD-Social/data/raw/001/2024-09-30_062502_809


## CSV creation and username hashing

### Functions to walk through the JSON files and write them to CSV


In [13]:
# Project directory: the folder containing this file when run as a script,
# the current working directory when run from Jupyter.
try:
    script_path = Path(__file__)
except NameError:
    # __file__ is not defined here (the Jupyter case)
    current_dir = Path.cwd()
else:
    current_dir = script_path.resolve().parent

def find_json_files(directory):
    """Return the paths of all files ending in ``.json`` below ``directory``.

    The tree is traversed with ``os.walk``; the extension check is
    case-sensitive and the paths are returned in traversal order.
    """
    matches = []
    for folder, _subdirs, names in os.walk(directory):
        matches.extend(
            os.path.join(folder, name) for name in names if name.endswith('.json')
        )
    return matches

def write_to_csv(json_files, output_csv):
    """Flatten the image items stored in ``json_files`` into one CSV file.

    Each JSON file is expected to hold a list of item dicts; a dict with an
    ``items`` list (the layout of a raw API page) is accepted too. Entries
    that are not dicts are skipped. One CSV row is written per item with the
    columns ``createdAt``, ``url``, ``positivePrompt``, ``negativePrompt``,
    ``nsfw``, ``nsfwLevel``, ``browsingLevel``, the six reaction/comment
    counters, ``username``, ``Model``, ``Meta``, ``VAE`` and
    ``Resource1`` .. ``Resource6``.

    Args:
        json_files: Iterable of paths to the JSON files to read.
        output_csv: Path of the CSV file to create (overwritten if present).
            Missing parent directories are created.
    """
    fieldnames = [
        'createdAt', 'url', 'positivePrompt', 'negativePrompt', 'nsfw', 'nsfwLevel',
        'browsingLevel', 'likeCount', 'dislikeCount', 'heartCount', 'cryCount',
        'laughCount', 'commentCount', 'username', 'Model', 'Meta', 'VAE'
    ] + [f'Resource{i+1}' for i in range(6)]

    # open() does not create directories, so make sure the target folder exists
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for json_file in json_files:
            with open(json_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if isinstance(data, dict):
                data = data.get('items') or []
            for item in data:
                if isinstance(item, dict):
                    writer.writerow(_csv_row_from_item(item))


def _csv_row_from_item(item, resource_slots=6):
    """Build the CSV row dict for a single image item.

    Missing, null or wrongly typed ``meta``, ``stats`` and ``resources``
    values are treated as empty, so one malformed item cannot abort the export.
    """
    meta = item.get('meta')
    if not isinstance(meta, dict):
        meta = {}
    stats = item.get('stats')
    if not isinstance(stats, dict):
        stats = {}
    resources = meta.get('resources')
    if not isinstance(resources, list):
        resources = []

    # Commas are removed from the positive prompt, as in the original export;
    # a null prompt becomes an empty string instead of raising.
    prompt = meta.get('prompt')
    positive_prompt = '' if prompt is None else str(prompt).replace(',', '')

    # Simplified Meta details without VAE
    meta_details = (
        f"Size: {meta.get('Size', '')}, Seed: {meta.get('seed', '')}, Steps: {meta.get('steps', '')}, "
        f"Sampler: {meta.get('sampler', '')}, Version: {meta.get('Version', '')}"
    )

    row = {
        'createdAt': item.get('createdAt', ''),
        'url': item.get('url', ''),
        'positivePrompt': positive_prompt,
        'negativePrompt': meta.get('negativePrompt', ''),
        'nsfw': item.get('nsfw', ''),
        'nsfwLevel': item.get('nsfwLevel', ''),
        'browsingLevel': item.get('browsingLevel', ''),
        'likeCount': stats.get('likeCount', 0),
        'dislikeCount': stats.get('dislikeCount', 0),
        'heartCount': stats.get('heartCount', 0),
        'cryCount': stats.get('cryCount', 0),
        'laughCount': stats.get('laughCount', 0),
        'commentCount': stats.get('commentCount', 0),
        'username': item.get('username', ''),
        'Model': meta.get('Model', ''),
        'Meta': meta_details,
        'VAE': meta.get('VAE', 'N/A')  # Dedicated column for VAE
    }

    # One column per resource slot; unused slots stay empty
    for slot in range(resource_slots):
        resource = resources[slot] if slot < len(resources) else None
        if isinstance(resource, dict):
            row[f'Resource{slot+1}'] = (
                f"Name: {resource.get('name', '')}, Type: {resource.get('type', '')}, Weight: {resource.get('weight', 'N/A')}"
            )
        else:
            row[f'Resource{slot+1}'] = None

    return row



In [14]:
# Collect every JSON file under data/image-metadata/json-sorted and flatten
# them into data/csv/preprocess_civiverse_social.csv.
csv_output = current_dir / 'data/csv/preprocess_civiverse_social.csv'
directory = current_dir / 'data/image-metadata/json-sorted'
json_files = find_json_files(directory)
write_to_csv(json_files, csv_output)


### anonymize usernames

In [15]:
def anonymize_usernames(input_csv, anonymized_csv, salt=""):
    """Copy ``input_csv`` to ``anonymized_csv`` with the ``username`` column hashed.

    Every non-empty ``username`` is replaced by the hex SHA-256 digest of
    ``salt + username``; empty usernames and all other columns are copied
    unchanged.

    NOTE: with the default empty ``salt`` the digest of a known username can
    be recomputed by anyone, so the result is pseudonymous rather than
    anonymous. Pass a secret ``salt`` if that matters.

    Args:
        input_csv: Path of the CSV file to read; must have a ``username`` column.
        anonymized_csv: Path of the CSV file to write (overwritten if present).
        salt: Optional string prepended to each username before hashing.

    Raises:
        KeyError: If a data row has no ``username`` column.
    """
    # Raise the CSV field size limit as far as the platform allows:
    # field_size_limit() raises OverflowError when the value does not fit a
    # C long (e.g. sys.maxsize on 64-bit Windows), so back off until accepted.
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            break
        except OverflowError:
            limit //= 10

    # newline='' on both files lets the csv module handle line endings itself,
    # so line breaks embedded in quoted fields are copied unchanged.
    with open(input_csv, 'r', newline='', encoding='utf-8') as infile, \
         open(anonymized_csv, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames or []  # Empty input file -> no columns
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            username = row['username']
            if username:  # Leave empty usernames empty
                row['username'] = hashlib.sha256((salt + username).encode('utf-8')).hexdigest()
            writer.writerow(row)

In [16]:
# Hash the usernames of the flattened CSV and store the result as
# preprocess_civiverse_social_an.csv next to it.
input_csv = current_dir / 'data/csv/preprocess_civiverse_social.csv'
anonymized_csv = current_dir / 'data/csv/preprocess_civiverse_social_an.csv'
anonymize_usernames(input_csv, anonymized_csv)

## Sum-up reactions

In [17]:
# Destination for the per-image reaction totals
output_csv = current_dir / 'data/csv/Civiverse_social_scores.csv'

# Read the anonymized dataset
file_path = anonymized_csv
data = pd.read_csv(file_path)

# Parse 'createdAt' as ISO 8601; values that cannot be parsed become NaT
data['createdAt'] = pd.to_datetime(data['createdAt'], errors='coerce', format='ISO8601')

# Timezone-aware reference date for measuring how long an image has been online
cutoff_date = pd.Timestamp('2024-09-30', tz='UTC')

# Reaction total per image: likes + dislikes + hearts
# (cry, laugh and comment counts are not part of this sum)
data['socialReactions'] = data[['likeCount', 'dislikeCount', 'heartCount']].sum(axis='columns')

# Days between 'createdAt' and the cutoff date
age = cutoff_date - data['createdAt']
data['daysOnPlatform'] = age.dt.days

# Highest reaction totals first, written without the index column
sorted_data = data.sort_values('socialReactions', ascending=False)
sorted_data.to_csv(output_csv, index=False)


## Normalization and time penalty

In [18]:
# Input: the reaction totals file; output: the same data with normalized
# scores and the time penalty added.
input_csv = current_dir / 'data/csv/Civiverse_social_scores.csv'
output_csv = current_dir / 'data/csv/Civiverse_normalized_social_scores.csv'

In [19]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Read the reaction totals
file_path = input_csv
data = pd.read_csv(file_path)

# Coerce 'socialReactions' to numbers (unparseable values become NaN)
data['socialReactions'] = pd.to_numeric(data['socialReactions'], errors='coerce')

# Scale the reaction totals to the 0-1 range
scaler = MinMaxScaler()
data['normalizedReactions'] = scaler.fit_transform(data[['socialReactions']])

# Treat every image as at least one day old, so zero or negative ages
# cannot reach the logarithm below
offset = 1
data['daysOnPlatform'] = data['daysOnPlatform'].clip(lower=1)

# Raw time penalty: shrinks as the image gets older
# NOTE(review): newer images therefore receive the larger penalty -- confirm
# this direction is intended.
data['timePenaltyRaw'] = 1 / (1 + np.log(data['daysOnPlatform'] + offset))

# Scale the penalty to the 0-1 range (the scaler is re-fitted on this column)
data['timePenalty'] = scaler.fit_transform(data[['timePenaltyRaw']])

# Engagement score = normalized reactions minus penalty, floored at zero
data['socialEngagement'] = (data['normalizedReactions'] - data['timePenalty']).clip(lower=0)

# Show the first few rows, then save everything
print(data[['socialReactions', 'normalizedReactions', 'daysOnPlatform', 'timePenalty', 'socialEngagement']].head())

data.to_csv(output_csv, index=False)


   socialReactions  normalizedReactions  daysOnPlatform  timePenalty  \
0            11415             1.000000             178     0.056541   
1            11413             0.999825             178     0.056541   
2            10406             0.911577             142     0.069926   
3            10406             0.911577             142     0.069926   
4            10393             0.910437              94     0.097085   

   socialEngagement  
0          0.943459  
1          0.943283  
2          0.841651  
3          0.841651  
4          0.813353  
