### **Step 1: Import Necessary Libraries**

Start by importing the required libraries:

In [70]:
#import openai
import json
import os
import time
from tqdm import tqdm
import random
import re
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize



# Smoke test: this line is only reached if every import above succeeded.
print("hello")

hello


### **Step 2: Define File Paths**

Set the input and output file paths:

In [71]:
# Raw input dataset; Step 4 reads it one line at a time.
input_file = '../../data/raw/fullrjokesdata.json'

# Filtered and cleaned jokes, written by Step 4 as one JSON object per line.
output_file = '../../data/processed/joke_selection_untaged.json'

# NOTE(review): presumably the destination for the jokes plus their
# embeddings; none of the cells shown here reference it - confirm.
joke_embeddings_file = '../../data/processed/joke_embeddings.json'


### **Step 3: Specify Relevant Columns**

List the columns we want to retain:

In [72]:
# Fields of a raw joke record that matter for this pipeline.
relevant_columns = [
    'id',
    'title',
    'selftext',
    'ups',
    'score',
    'created_utc',
]


### **Step 4: Process and Clean the Data**

Since the dataset is large, we'll read and process it line by line:

In [73]:
import re
import json
from tqdm import tqdm

# Define the cleaning function
# Substrings whose presence (case-insensitively) marks a joke as containing a link.
_URL_INDICATORS = ('http://', 'https://', 'www.')
# Any character that is not a lowercase ASCII letter, a digit, or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
# Runs of spaces, tabs, and newlines.
_WHITESPACE_RE = re.compile(r'\s+')


def clean_joke_text(title, selftext):
    """
    Clean the full text of a joke to prepare it for embeddings and classification.

    The title and body are joined with a space, lowercased, stripped of every
    character that is not an ASCII letter, digit, or whitespace, and have runs
    of whitespace collapsed to a single space.

    Args:
        title: Joke title. Anything that is not a string (e.g. a JSON null)
            is treated as an empty title.
        selftext: Joke body.

    Returns:
        The cleaned text, or None when the joke must be skipped: selftext is
        not a string, is empty/whitespace-only, is '[deleted]', or the
        combined text contains a URL indicator ('http://', 'https://', 'www.').
    """
    # A JSON null arrives here as None; calling .strip() on it would raise.
    if not isinstance(selftext, str):
        return None

    body = selftext.strip()
    if not body or body == '[deleted]':
        return None

    # Guard against a null title, which would otherwise be formatted as "None".
    heading = title if isinstance(title, str) else ''

    # Lowercase before looking for URLs so 'HTTP://' and 'WWW.' are caught too.
    full_text = f"{heading} {body}".strip().lower()
    if any(indicator in full_text for indicator in _URL_INDICATORS):
        return None

    # No URL-stripping regex is applied here: once the check above has passed,
    # a pattern such as r'http\S+' can only match non-URL tokens (for example
    # the word "https") and would delete legitimate text.

    # NOTE(review): punctuation is deleted, not replaced by a space, so words
    # separated only by punctuation are merged (e.g. 'wife,"what' becomes
    # 'wifewhat'). Left unchanged to keep existing outputs comparable.
    clean_text = _NON_ALNUM_RE.sub('', full_text)

    return _WHITESPACE_RE.sub(' ', clean_text).strip()


# Jokes must score strictly above this value to be kept
min_score = 50

# Initialize a counter for the number of jokes written to the output file
jokes_count = 0

# Open the input and output files
with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8') as outfile:

    # Iterate over each line (each joke)
    for line in tqdm(infile, desc='Processing jokes'):
        # Only the parse is guarded, so unrelated errors are not hidden
        try:
            joke = json.loads(line)
        except json.JSONDecodeError:
            # Skip lines that are not valid JSON
            continue

        # A valid JSON line that is not an object has no fields to read
        if not isinstance(joke, dict):
            continue

        # A missing, null, or non-numeric score cannot pass the filter;
        # comparing None with an int would raise TypeError and abort the run
        score = joke.get('score')
        if not isinstance(score, (int, float)) or not score > min_score:
            continue

        # dict.get's default only covers a missing key; an explicit JSON null
        # still comes back as None, so both fields are checked by type
        selftext = joke.get('selftext')
        if not isinstance(selftext, str):
            continue  # No usable joke body
        title = joke.get('title')
        if not isinstance(title, str):
            title = ''

        # Clean the joke text (this will skip jokes with '[deleted]' selftext, URLs, or empty selftext)
        clean_joke = clean_joke_text(title, selftext)
        if clean_joke is None:
            continue  # Skip jokes with invalid selftext or URLs

        # Create a new joke dictionary with the cleaned text
        filtered_joke = {
            'id': joke.get('id'),
            'full_joke': clean_joke,
            'ups': joke.get('ups'),
            'score': score,
            'created_utc': joke.get('created_utc')
        }

        # Write the filtered joke to the output file, one JSON object per line
        json.dump(filtered_joke, outfile)
        outfile.write('\n')

        jokes_count += 1

print(f"Total jokes after filtering: {jokes_count}")


Processing jokes: 1064928it [00:37, 28711.18it/s]

Total jokes after filtering: 88833





### **Step 5: Verify the Output**

To confirm that the data was processed correctly, we read the first few lines of the processed file and print them:


In [74]:
# Print the first five processed jokes as a sanity check.
# Zipping against range(5) caps the preview and simply stops early when the
# file holds fewer than five lines; readline() would instead return '' at
# end-of-file and make json.loads raise.
with open(output_file, 'r', encoding='utf-8') as f:
    for _, line in zip(range(5), f):
        joke = json.loads(line)
        print(joke)

{'id': '9p7em', 'full_joke': 'husband asks wife what would you do if i hit the lottery a husband asks his wifewhat would you do if i hit the lotto she replies id take half and leave your assthe husband saysokay i just won 12 bucks on this scratch offheres 6 bux now get the fuck out', 'ups': 62, 'score': 62, 'created_utc': 1254242875.0}
{'id': 'a0ut6', 'full_joke': 'dont do that one day a little girl is outside with her father she claps her hands together and said daddy i killed a butterfly her father replied dont do that butterflies are our friends no butter for a week a little while later the girl was playing and she clapped her hands and said daddy daddy i killed a honeybee her father said dont do that honeybees are our friends no honey for a week later on that day the girl and her father were in the kitchen the girls mother joined them the mother stamped her foot and said i killed a cockroach then the little girl said to her father should i tell her or do you want to', 'ups': 62, 's

### **Step 6: Generate embeddings**

Generate embeddings for each joke and save them to a new file.

In [75]:
# Hugging Face identifier shared by the tokenizer and the model
embedding_model_id = 'Alibaba-NLP/gte-Qwen2-1.5B-instruct'

# Load the pre-trained tokenizer and model. trust_remote_code=True allows
# Python files shipped with the model repository to be executed locally.
tokenizer = AutoTokenizer.from_pretrained(embedding_model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(embedding_model_id, trust_remote_code=True)
model = model.eval()

# Truncation limit (in tokens) handed to the tokenizer when embedding a joke
max_length = 8192

# Function to embed jokes
def _last_token_pool(last_hidden_state, attention_mask):
    """Pick, for each row, the hidden state at the last attended position."""
    # When the final position of every row is attended, the last real token
    # of each row sits at index -1, so a plain slice is enough.
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        return last_hidden_state[:, -1]

    # Otherwise locate each row's last attended position from its mask sum.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    # Build the row index on the same device as the hidden states so the
    # advanced indexing below does not mix devices.
    batch_indices = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
    return last_hidden_state[batch_indices, sequence_lengths]


def embed_joke(joke_text):
    """
    Embed joke text with the module-level `tokenizer` and `model`.

    The text is tokenized (truncated to `max_length` tokens), run through the
    model without gradient tracking, reduced with last-token pooling, and
    L2-normalized along dim 1.

    Args:
        joke_text: Text passed straight to the tokenizer.

    Returns:
        The normalized embedding after `.squeeze().tolist()` - presumably a
        flat list of floats for a single string; confirm for batched input.
    """
    # Tokenize the joke text
    input_data = tokenizer(joke_text, padding="longest", truncation=True, max_length=max_length, return_tensors="pt")
    # Keep the inputs on the model's device; a no-op while the model is on CPU,
    # but required if the model has been moved to an accelerator.
    input_data = input_data.to(model.device)

    with torch.no_grad():
        # Get model outputs
        outputs = model(**input_data)

        # Use last token pooling as described in the model documentation
        joke_embedding = _last_token_pool(outputs.last_hidden_state, input_data["attention_mask"])

        # Normalize the embedding
        joke_embedding = F.normalize(joke_embedding, p=2, dim=1)

    return joke_embedding.squeeze().tolist()

# Limit the number of jokes to 10 for testing
max_jokes_to_process = 10
jokes_processed = 0

# Read the cleaned jokes written in Step 4 (output_file); no cell in this
# notebook produces a 'cleaned_jokes.jsonl'. The sample embeddings go to their
# own file, one JSON object per line.
with open(output_file, 'r', encoding='utf-8') as infile, \
     open('joke_embeddings_sample.jsonl', 'w', encoding='utf-8') as outfile:

    for line in tqdm(infile, desc='Processing jokes'):
        if jokes_processed >= max_jokes_to_process:
            break  # Stop after processing 10 jokes

        # Only the parse is guarded, so embedding errors are not hidden
        try:
            joke = json.loads(line)
        except json.JSONDecodeError:
            # Skip lines that are not valid JSON
            continue

        # Skip records without joke text instead of raising KeyError
        full_joke = joke.get('full_joke') if isinstance(joke, dict) else None
        if not isinstance(full_joke, str):
            continue

        # Generate the embedding and attach it to the joke dictionary
        joke['embedding'] = embed_joke(full_joke)

        # Write the joke with the embedding to the new output file
        json.dump(joke, outfile)
        outfile.write('\n')

        # Increment the counter
        jokes_processed += 1

print(f"Embedding generation completed! Processed {jokes_processed} jokes.")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- tokenization_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ImportError: This modeling file requires the following packages that were not found in your environment: flash_attn. Run `pip install flash_attn`

### Step 6: Select a Test Set of High-Scoring Jokes

In this step, we will load the processed jokes from the file `joke_selection_untaged.json` and extract a small test set of jokes with high scores. The purpose of this test set is to explore tagging strategies and test our heuristics on jokes that received positive feedback from users. We'll define a threshold for "high-scoring" jokes and randomly select a small subset for experimentation.

#### Criteria for Selecting the Test Set:
- Jokes with a score above a certain threshold (e.g., `score > 90`).
- Randomly select 20 jokes (`test_set_size` in the code below) to form the test set.


In [None]:

# A joke counts as high-scoring when its score is strictly above the
# threshold; test_set_size caps how many of them are sampled
score_threshold = 90
test_set_size = 20

# Collect every processed joke that clears the threshold
with open(output_file, 'r', encoding='utf-8') as infile:
    high_scoring_jokes = [
        record
        for record in map(json.loads, infile)
        if record.get('score', 0) > score_threshold
    ]

# Never ask for more jokes than are available
sample_size = min(test_set_size, len(high_scoring_jokes))
test_set = random.sample(high_scoring_jokes, sample_size)

# Display the selected test set of jokes
print(test_set)


[{'id': '5iwrnf', 'full_joke': 'i dont believe in hitting my children as punishment so i send them to school in a justin beiber shirt and crocs and let the other kids beat them instead', 'ups': 107, 'score': 107, 'created_utc': 1482010581.0}, {'id': '26c070', 'full_joke': 'a man finds a penguin on the road a man finds a penguin on the road side thinking the penguin is lost the man takes it drives until finding a police officer and asks what to do take it to the zoo replies the officer one week latter the policer officer sees the man driving with the penguin by his side what are you doing with that penguin i told you to bring him to the zoo i did exactly that he loved it and now we are going to the movies', 'ups': 492, 'score': 492, 'created_utc': 1400882930.0}, {'id': '6q7ikn', 'full_joke': 'an old lady dies and goes to heaven shes chatting it up with stpeter at the pearly gates when all of a sudden she hears the most awful bloodcurdling screams dont worry about that says st peter its 

### **Step 7: Tagging Jokes Using ChatGPT**

In this step, we'll use OpenAI's GPT model to automatically tag our selected test set of jokes based on predefined categories. We'll utilize the OpenAI API to send each joke to the model and receive the corresponding tags.