### **Step 1: Import Necessary Libraries**

Start by importing the required libraries:

In [None]:
import json
import os
import time
from tqdm import tqdm
import random
import re
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize
import logging
from openai import OpenAI
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from the project-level env file, then read the
# OpenAI API key from the environment (os.getenv yields None if it is unset).
load_dotenv('../../vars.env')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

### **Step 2: Define File Paths**

Set the input and output file paths:

In [None]:
# Raw jokes dump; the cleaning step reads it as one JSON object per line.
input_file = '../../data/raw/fullrjokesdata.json'
# Destination of the filtered, cleaned jokes (written one JSON object per line).
output_file = '../../data/processed/joke_selection_untaged.json'
# Path reserved for joke embeddings.
# NOTE(review): this variable is not referenced by the visible cells.
joke_embeddings_file='../../data/processed/joke_embeddings.json'


### **Step 3: Specify Relevant Columns**

List the columns we want to retain:

In [None]:
# Fields of interest in each raw record.
# NOTE(review): the visible processing loop selects its fields explicitly and
# never reads this list — confirm whether it is still needed.
relevant_columns = ['id', 'title', 'selftext', 'ups', 'score', 'created_utc']


### **Step 4: Process and Clean the Data**

Since the dataset is large, we'll read and process it line by line:

In [None]:
import re
import json
from tqdm import tqdm

def clean_joke_text(title, selftext):
    """
    Clean the full text of a joke to prepare it for embeddings and classification.

    The title and body are joined, lower-cased, stripped of URLs, ellipses and
    every character that is not a letter, digit or whitespace; runs of
    whitespace are then collapsed to a single space.

    Args:
        title: Post title. ``None`` is treated as an empty string.
        selftext: Post body. ``None`` is treated as an empty string.

    Returns:
        The cleaned text, or ``None`` when the joke must be skipped: the body
        is missing/blank, the body is '[deleted]', the text contains a URL, or
        nothing is left after cleaning.
    """
    # Source records can carry null fields; calling .strip() on None would
    # raise, and formatting None would inject the literal word "None".
    title = title or ''
    selftext = selftext or ''

    body = selftext.strip()
    if not body or body == '[deleted]':
        return None

    full_text = f"{title} {selftext}".strip()

    url_indicators = ['http://', 'https://', 'www.']
    if any(indicator in full_text for indicator in url_indicators):
        return None

    clean_text = full_text.lower()
    # Remove URLs that only become visible after lower-casing (e.g. "HTTP://").
    # The dot after "www" is escaped so ordinary text such as "awwww so" is
    # not mistaken for a URL, and a scheme is required so that plain words
    # starting with "http" are kept.
    clean_text = re.sub(r'https?://\S+|www\.\S+', '', clean_text)

    # Replace triple dots with a space so the surrounding words stay separate
    clean_text = clean_text.replace('...', ' ')

    # Remove any remaining special characters
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', clean_text)

    # Replace multiple spaces, newlines, or tabs with a single space
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    # An empty string is useless downstream (the embeddings endpoint rejects
    # empty input), so report it as "skip".
    return clean_text or None

# Running total of jokes that pass every filter and get written out
jokes_count = 0

# Stream the raw dump and emit one JSON line per kept joke
with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8') as outfile:

    for raw_line in tqdm(infile, desc='Processing jokes'):
        # Lines that are not valid JSON are silently skipped
        try:
            joke = json.loads(raw_line)
        except json.JSONDecodeError:
            continue

        # Only jokes scoring above 50 are kept
        if not (joke.get('score', 0) > 50):
            continue

        clean_joke = clean_joke_text(joke.get('title', ''), joke.get('selftext', ''))
        if clean_joke is None:
            continue

        filtered_joke = {
            'id': joke.get('id'),
            'full_joke': clean_joke,
            'ups': joke.get('ups'),
            'score': joke.get('score'),
            'created_utc': joke.get('created_utc')
        }
        outfile.write(json.dumps(filtered_joke) + '\n')
        jokes_count += 1

print(f"Total jokes after filtering: {jokes_count}")

### **Step 5: Verify the Output**

To verify that the data was processed correctly, we read the first few lines of the processed file and print them:


In [None]:
# Preview up to the first five processed jokes. Iterating the file (instead of
# calling readline() a fixed number of times) stops cleanly when the file holds
# fewer than five lines, rather than handing an empty string to json.loads.
with open(output_file, 'r', encoding='utf-8') as f:
    for _, line in zip(range(5), f):
        joke = json.loads(line)
        print(joke)

### **Step 6: Generate Embeddings via OpenAI**


In [None]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create the OpenAI client used by create_batch_jobs below. Without this the
# cell fails with a NameError when run top to bottom, because `client` is
# otherwise only defined in the later categorization cell.
client = OpenAI(api_key=OPENAI_API_KEY)


def _write_embedding_batch(batch, batch_path):
    """Write the request payloads in ``batch`` to ``batch_path``, one JSON object per line."""
    with open(batch_path, 'w', encoding='utf-8') as outfile:
        for item in batch:
            json.dump(item, outfile)
            outfile.write('\n')


def prepare_batch_files(input_file, batch_size=10000, output_folder='./batch_files'):
    """
    Split a JSON-lines jokes file into embedding request files for batch upload.

    Every input line must be a JSON object with 'id' and 'full_joke'. Each one
    becomes a POST /v1/embeddings request keyed by the joke id. Requests are
    written to ``<output_folder>/jokes_batch_<n>.jsonl`` in groups of at most
    ``batch_size``.

    Args:
        input_file: Path to the JSON-lines file of cleaned jokes.
        batch_size: Maximum number of requests per batch file.
        output_folder: Directory for the batch files; created if missing.

    Returns:
        The number of jokes for which a request was written.

    Raises:
        FileNotFoundError: If a file cannot be opened (logged, then re-raised).
    """
    os.makedirs(output_folder, exist_ok=True)

    batch_number = 0
    batch = []
    total_jokes = 0

    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in tqdm(infile, desc="Preparing batches"):
                try:
                    joke = json.loads(line)
                    # Create the JSONL structure with `custom_id`, `method`, `url`, and `body`
                    payload = {
                        "custom_id": joke['id'],
                        "method": "POST",
                        "url": "/v1/embeddings",
                        "body": {
                            "model": "text-embedding-3-small",
                            "input": joke['full_joke'],
                            "encoding_format": "float"
                        }
                    }
                except json.JSONDecodeError:
                    logging.warning(f"Invalid JSON in line: {line}")
                    continue
                except (KeyError, TypeError):
                    # One malformed record should not abort the whole run.
                    logging.warning(f"Skipping record without 'id'/'full_joke': {line}")
                    continue

                batch.append(payload)
                total_jokes += 1

                # Write the batch to a file if it hits the batch size limit
                if len(batch) >= batch_size:
                    _write_embedding_batch(batch, f'{output_folder}/jokes_batch_{batch_number}.jsonl')
                    batch = []  # Reset batch
                    batch_number += 1  # Increment batch number

        # Write any remaining jokes in the last batch
        if batch:
            _write_embedding_batch(batch, f'{output_folder}/jokes_batch_{batch_number}.jsonl')

    except FileNotFoundError:
        logging.error(f"Input file not found: {input_file}")
        raise

    return total_jokes


def create_batch_jobs(batch_folder, delay_seconds=1600):
    """
    Upload every ``.jsonl`` file in ``batch_folder`` and submit one embeddings batch job per file.

    Files are processed in sorted name order, so the ``part_<i>`` description
    attached to each job is reproducible between runs.

    Args:
        batch_folder: Directory containing the prepared batch request files.
        delay_seconds: Pause between consecutive job submissions. There is no
            pause after the last job.

    Returns:
        The list of objects returned by ``client.batches.create``, in
        submission order.
    """
    batch_input_files = []
    for file in sorted(os.listdir(batch_folder)):
        if file.endswith('.jsonl'):
            # Close the handle as soon as the upload returns instead of leaking it.
            with open(f'{batch_folder}/{file}', "rb") as batch_handle:
                created_file = client.files.create(
                    file=batch_handle,
                    purpose="batch"
                )
            batch_input_files.append(created_file)  # Append the `FileObject` directly

    # Create embedding jobs from uploaded files
    job_creations = []
    last_index = len(batch_input_files) - 1
    for i, batch_file in enumerate(batch_input_files):
        job_creations.append(client.batches.create(
            input_file_id=batch_file.id,
            endpoint="/v1/embeddings",
            completion_window="24h",
            metadata={
                "description": f"part_{i}_joke_embeddings"
            }
        ))
        if i < last_index:
            logging.info(
                f"Batch {i} has been submitted. Waiting for {delay_seconds / 60:.0f} minutes "
                "before submitting the next batch..."
            )
            time.sleep(delay_seconds)
        else:
            # Nothing follows the last batch, so there is nothing to wait for.
            logging.info(f"Batch {i} has been submitted.")
    return job_creations





def main(input_file, output_file, batch_folder='./batch_files'):
    """
    Build the embedding batch files for ``input_file`` and submit them as batch jobs.

    ``output_file`` is accepted but not used by this function. If
    ``input_file`` does not exist, an error is logged and nothing else happens.
    """
    if os.path.exists(input_file):
        # Split the jokes into request files, then submit one job per file
        joke_total = prepare_batch_files(input_file, output_folder=batch_folder)
        logging.info(f"Total jokes processed: {joke_total}")
        create_batch_jobs(batch_folder)
    else:
        logging.error(f"Input file not found: {input_file}")

    

# Define your input and output file paths.
# The input is the cleaned jokes file written by the filtering step above.
input_file = '../../data/processed/joke_selection_untaged.json'
# NOTE(review): main() receives this path but does not use it.
output_file = '../../data/processed/joke_embeddings.json'
# Folder where the per-batch request files are written before upload.
batch_folder = './batch_files'

# Run the main function: prepares the batch files, then submits the batch jobs.
main(input_file, output_file, batch_folder)


### **Join the Embedding Batches and Merge Them with the Dataset**


In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Paths to the necessary files and directories for merging embeddings into the jokes.
# load_batch_files() reads every .jsonl file found in batch_folder.
batch_folder = '../../data/processed/Embedded_batches'  # Adjust this to the actual path where the batch files are stored
input_jokes_file = '../../data/processed/joke_selection_untaged.json'  # Path to the jokes file
output_file = '../../data/processed/merged_joke_embeddings.json'  # Output file for the merged data
missing_embeddings_file = '../../data/processed/missing_embeddings.json'  # File to store jokes missing embeddings

# Function to load all batch files and extract embeddings
def load_batch_files(batch_folder):
    """
    Collect joke embeddings from every ``.jsonl`` batch result file in ``batch_folder``.

    Each line must be a JSON object with a ``custom_id`` (the joke id) and the
    embedding at ``response.body.data[0].embedding``. Blank lines and lines
    without that structure (for example failed requests whose ``response`` is
    null) are skipped with a warning. Those jokes then show up as missing
    embeddings after the merge instead of aborting the load.

    Args:
        batch_folder: Directory containing the batch result files.

    Returns:
        A DataFrame with columns ``id`` and ``embedding``. Both columns are
        present even when no embeddings were found, so a later merge on ``id``
        still works.
    """
    embedding_data = []

    # Sorted so the row order does not depend on the directory listing order
    for batch_file in sorted(os.listdir(batch_folder)):
        if not batch_file.endswith('.jsonl'):
            continue
        with open(os.path.join(batch_folder, batch_file), 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    # Extract custom_id (joke id) and the embedding from the batch file
                    custom_id = data['custom_id']
                    embedding = data['response']['body']['data'][0]['embedding']
                except (json.JSONDecodeError, KeyError, IndexError, TypeError):
                    logging.warning(f"Skipping line {line_number} of {batch_file}: no embedding found")
                    continue
                embedding_data.append({
                    'id': custom_id,
                    'embedding': embedding
                })

    # Convert embedding data to a DataFrame for easy merging later
    return pd.DataFrame(embedding_data, columns=['id', 'embedding'])

# Function to load jokes from the jokes file
def load_jokes(input_jokes_file):
    """Read a JSON-lines jokes file and return its records as a DataFrame, one row per line."""
    with open(input_jokes_file, 'r', encoding='utf-8') as infile:
        records = [json.loads(raw_line) for raw_line in infile]

    # A DataFrame makes the later merge on 'id' straightforward
    return pd.DataFrame(records)



# Function to find jokes missing embeddings and save them to a file
def find_missing_embeddings(merged_df, missing_embeddings_file):
    """
    Report jokes whose ``embedding`` value is null.

    When at least one such row exists, those rows are written to
    ``missing_embeddings_file`` as JSON lines. Otherwise no file is written.
    A short status message is printed in both cases.
    """
    lacks_embedding = merged_df['embedding'].isnull()

    if not lacks_embedding.any():
        print("No missing embeddings found.")
        return

    # Persist only the rows without an embedding
    merged_df[lacks_embedding].to_json(missing_embeddings_file, orient='records', lines=True)
    print(f"Jokes missing embeddings saved to {missing_embeddings_file}")

# Main function to perform the entire process
def main(batch_folder, input_jokes_file, output_file, missing_embeddings_file=None):
    """
    Attach embeddings to the jokes and save the result as JSON lines.

    Embeddings are read from ``batch_folder`` and left-joined onto the jokes
    from ``input_jokes_file`` by ``id``, so every joke is kept even when it
    has no embedding. The merged table is written to ``output_file``. When
    ``missing_embeddings_file`` is given, jokes without an embedding are also
    reported there.
    """
    print("Loading batch files and extracting embeddings...")
    embeddings_df = load_batch_files(batch_folder)

    print("Loading jokes from jokes file...")
    jokes_df = load_jokes(input_jokes_file)

    print("Merging jokes with embeddings...")
    merged_df = pd.merge(jokes_df, embeddings_df, on='id', how='left')

    # One joke per line in the output file
    print(f"Saving merged data to {output_file}...")
    merged_df.to_json(output_file, orient='records', lines=True)

    # The missing-embeddings report is optional
    if missing_embeddings_file:
        print("Checking for missing embeddings...")
        find_missing_embeddings(merged_df, missing_embeddings_file)

    print("Process completed!")

# Run the main function: merge the embeddings into the jokes, save the result,
# and write the missing-embeddings report (a report path is supplied here).
main(batch_folder, input_jokes_file, output_file, missing_embeddings_file)


### **Categorizing the Jokes**

In this step, we'll use an OpenAI GPT model to assign each joke to one category from a fixed list.

In [None]:
import json
import os
import time
import logging
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm

# Load .env file for the OpenAI API key.
# NOTE(review): the first cell loads '../../vars.env' explicitly, while this call
# relies on python-dotenv's default lookup — confirm both resolve the same key.
load_dotenv()

# Set up the OpenAI client; create_batch_jobs() reads this module-level `client`.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Configure logging.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, e.g. when the earlier embeddings cell has already configured it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def _write_classification_batch(batch, batch_path):
    """Write the request payloads in ``batch`` to ``batch_path``, one JSON object per line."""
    with open(batch_path, 'w', encoding='utf-8') as outfile:
        for item in batch:
            json.dump(item, outfile)
            outfile.write('\n')


def prepare_batch_files(input_file, batch_size=7000, output_folder='./batch_files'):
    """
    Split a JSON-lines jokes file into chat-completion classification request files.

    Every input line must be a JSON object with 'id' and 'full_joke'. Each one
    becomes a POST /v1/chat/completions request, keyed by the joke id, that
    asks the model to return a single category name. Requests are written to
    ``<output_folder>/jokes_batch_<n>.jsonl`` in groups of at most
    ``batch_size``.

    Args:
        input_file: Path to the JSON-lines file of cleaned jokes.
        batch_size: Maximum number of requests per batch file.
        output_folder: Directory for the batch files; created if missing.

    Returns:
        The number of jokes for which a request was written.

    Raises:
        FileNotFoundError: If a file cannot be opened (logged, then re-raised).
    """
    os.makedirs(output_folder, exist_ok=True)

    batch_number = 0
    batch = []
    total_jokes = 0

    prompt_template = """Classify the following joke into one of these categories:
Animal, Puns, Dad, Knock-Knock, One-Liners, Dark Humor, Political, Marriage, Work/Office, Tech, Ethnic, Kids, Doctor/Health, Lawyer, Food, Blonde, Yo Mama, School, Relationship, Religious, Sports, Punishments/Consequences, Celebrity, In-Law, Science, Insult

Joke: {joke}

Output: Return only the category name, without explanation.
"""

    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            for line in tqdm(infile, desc="Preparing batches"):
                try:
                    joke = json.loads(line)
                    joke_text = joke['full_joke']
                    joke_id = joke['id']
                except json.JSONDecodeError:
                    logging.warning(f"Invalid JSON in line: {line}")
                    continue
                except (KeyError, TypeError):
                    # One malformed record should not abort the whole run.
                    logging.warning(f"Skipping record without 'id'/'full_joke': {line}")
                    continue

                prompt = prompt_template.format(joke=joke_text)

                # Create the JSONL structure for classification
                payload = {
                    "custom_id": joke_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-3.5-turbo-0125",
                        "messages": [
                            {"role": "system", "content": "You are a helpful assistant."},
                            {"role": "user", "content": prompt}
                        ],
                        "max_tokens": 100
                    }
                }
                batch.append(payload)
                total_jokes += 1

                # Write the batch to a file if it hits the batch size limit
                if len(batch) >= batch_size:
                    batch_path = f'{output_folder}/jokes_batch_{batch_number}.jsonl'
                    logging.info(f'Writing batch file {batch_path} with {len(batch)} jokes')  # Log batch size
                    _write_classification_batch(batch, batch_path)
                    batch = []  # Reset batch
                    batch_number += 1  # Increment batch number

        # Write any remaining jokes in the last batch
        if batch:
            batch_path = f'{output_folder}/jokes_batch_{batch_number}.jsonl'
            logging.info(f'Writing final batch file {batch_path} with {len(batch)} jokes')
            _write_classification_batch(batch, batch_path)

    except FileNotFoundError:
        logging.error(f"Input file not found: {input_file}")
        raise

    return total_jokes

def create_batch_jobs(batch_folder, delay_seconds=400):
    """
    Upload every ``.jsonl`` file in ``batch_folder`` and submit one chat-completions batch job per file.

    Files are processed in sorted name order, so the ``part_<i>`` description
    attached to each job is reproducible between runs.

    Args:
        batch_folder: Directory containing the prepared batch request files.
        delay_seconds: Pause between consecutive job submissions. There is no
            pause after the last job.

    Returns:
        The list of objects returned by ``client.batches.create``, in
        submission order.
    """
    batch_input_files = []
    for file in sorted(os.listdir(batch_folder)):
        if file.endswith('.jsonl'):
            # Close the handle as soon as the upload returns instead of leaking it.
            with open(f'{batch_folder}/{file}', "rb") as batch_handle:
                created_file = client.files.create(
                    file=batch_handle,
                    purpose="batch"
                )
            batch_input_files.append(created_file)  # Append the `FileObject` directly

    # Create chat completions batch jobs from uploaded files
    job_creations = []
    last_index = len(batch_input_files) - 1
    for i, batch_file in enumerate(batch_input_files):
        job_creations.append(client.batches.create(
            input_file_id=batch_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"part_{i}_joke_classifications"
            }
        ))
        if i < last_index:
            logging.info(
                f"Batch {i} has been submitted. Waiting for {delay_seconds / 60:.0f} minutes "
                "before submitting the next batch..."
            )
            time.sleep(delay_seconds)
        else:
            # Nothing follows the last batch, so there is nothing to wait for.
            logging.info(f"Batch {i} has been submitted.")
    return job_creations

def main(input_file, batch_folder='./batch_files'):
    """
    Build the classification batch files for ``input_file`` and submit them as batch jobs.

    If ``input_file`` does not exist, an error is logged and nothing else happens.
    """
    if os.path.exists(input_file):
        # Split the jokes into request files, then submit one job per file
        joke_total = prepare_batch_files(input_file, output_folder=batch_folder)
        logging.info(f"Total jokes processed: {joke_total}")
        create_batch_jobs(batch_folder)
    else:
        logging.error(f"Input file not found: {input_file}")

# Define your input file path (the cleaned jokes file written by the filtering step).
input_file = '../../data/processed/joke_selection_untaged.json'
# Folder where the per-batch request files are written before upload.
# NOTE(review): this is the same folder the embeddings cell writes to, and
# create_batch_jobs() uploads every .jsonl file it finds there — confirm the
# folder holds no stale files from an earlier run.
batch_folder = './batch_files'

# Run the main function: prepares the batch files, then submits the batch jobs.
main(input_file, batch_folder)


### **Merge Embeddings with Categories to produce the final dataset**

In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Paths to the necessary files and directories for building the final dataset.
# load_categorized_batches() reads every .jsonl file found in the batches folder.
categorized_batches_folder = '../../data/processed/Categorized_batches'  # Path to the categorized batches
input_file = '../../data/processed/merged_joke_embeddings.json'  # Path to the jokes file with the embeddings
final_output_file = '../../data/processed/final_joke_dataset.json'  # Output file for the merged data
missing_categories_file = '../../data/processed/missing_categories.json'  # File to store jokes missing categories

# Function to load categorized joke batches
def load_categorized_batches(batch_folder):
    """
    Collect joke categories from every ``.jsonl`` batch result file in ``batch_folder``.

    Each line must be a JSON object with a ``custom_id`` (the joke id) and the
    model's answer at ``response.body.choices[0].message.content``. Blank
    lines and lines without that structure (for example failed requests whose
    ``response`` is null, or answers with null content) are skipped with a
    warning. Those jokes then show up as missing categories after the merge
    instead of aborting the load.

    Args:
        batch_folder: Directory containing the batch result files.

    Returns:
        A DataFrame with columns ``id`` and ``category`` (whitespace-stripped).
        Both columns are present even when no categories were found, so a
        later merge on ``id`` still works.
    """
    categorized_data = []

    # Sorted so the row order does not depend on the directory listing order
    for batch_file in sorted(os.listdir(batch_folder)):
        if not batch_file.endswith('.jsonl'):
            continue
        with open(os.path.join(batch_folder, batch_file), 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    # Extract custom_id (joke id) and the categorized content from the batch file
                    custom_id = data['custom_id']
                    category = data['response']['body']['choices'][0]['message']['content'].strip()
                except (json.JSONDecodeError, KeyError, IndexError, TypeError, AttributeError):
                    logging.warning(f"Skipping line {line_number} of {batch_file}: no category found")
                    continue
                categorized_data.append({
                    'id': custom_id,
                    'category': category
                })

    # Convert categorized data to a DataFrame for easy merging later
    return pd.DataFrame(categorized_data, columns=['id', 'category'])

# Function to load jokes with embeddings
def load_jokes_with_embeddings(input_file):
    """Read the JSON-lines file of jokes with embeddings and return it as a DataFrame, one row per line."""
    with open(input_file, 'r', encoding='utf-8') as infile:
        records = [json.loads(raw_line) for raw_line in infile]

    # A DataFrame makes the later merge on 'id' straightforward
    return pd.DataFrame(records)

# Function to find jokes missing categories and save them to a file
def find_missing_categories(merged_df, missing_categories_file):
    """
    Report jokes whose ``category`` value is null.

    When at least one such row exists, those rows are written to
    ``missing_categories_file`` as JSON lines. Otherwise no file is written.
    A short status message is printed in both cases.
    """
    lacks_category = merged_df['category'].isnull()

    if not lacks_category.any():
        print("No jokes missing categories found.")
        return

    # Persist only the rows without a category
    merged_df[lacks_category].to_json(missing_categories_file, orient='records', lines=True)
    print(f"Jokes missing categories saved to {missing_categories_file}")

# Main function to perform the entire process
def main(categorized_batches_folder, input_file, final_output_file, missing_categories_file=None):
    """
    Attach categories to the jokes (with embeddings) and save the final dataset as JSON lines.

    Categories are read from ``categorized_batches_folder`` and left-joined
    onto the jokes from ``input_file`` by ``id``, so every joke is kept even
    when it has no category. The merged table is written to
    ``final_output_file``. When ``missing_categories_file`` is given, jokes
    without a category are also reported there.
    """
    print("Loading categorized batch files and extracting categories...")
    categorized_df = load_categorized_batches(categorized_batches_folder)

    print("Loading jokes with embeddings...")
    jokes_df = load_jokes_with_embeddings(input_file)

    print("Merging jokes with categories...")
    merged_df = pd.merge(jokes_df, categorized_df, on='id', how='left')

    # One joke per line in the output file
    print(f"Saving merged data to {final_output_file}...")
    merged_df.to_json(final_output_file, orient='records', lines=True)

    # The missing-categories report is optional
    if missing_categories_file:
        print("Checking for jokes missing categories...")
        find_missing_categories(merged_df, missing_categories_file)

    print("Process completed!")

# Run the main function: merge the categories into the jokes, save the final
# dataset, and write the missing-categories report (a report path is supplied here).
main(categorized_batches_folder, input_file, final_output_file, missing_categories_file)
