### **0. Import Statements**

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report


from tqdm import tqdm
import json
import pandas as pd
import numpy as np
import joblib
import time

import ast


import seaborn as sns
import matplotlib.pyplot as plt

### **1. Load the Processed Data**

In [3]:
# Processed JSON file path
final_joke_dataset = r'..\..\data\processed\final_joke_dataset.json'

# Open the file to count the number of lines (for progress tracking)
with open(final_joke_dataset, 'r') as f:
    total_lines = sum(1 for _ in f)

# Load data from JSON file with progress bar
data = []
with open(final_joke_dataset, 'r') as f:
    for line in tqdm(f, total=total_lines, desc="Loading JSON data"):
        data.append(json.loads(line))

# Convert to DataFrame
df = pd.DataFrame(data)


Loading JSON data: 100%|██████████| 88832/88832 [00:54<00:00, 1632.61it/s]


### **2. Prepare the Data for Modeling**

**Encode the Categories**

In [4]:
# Check for NaN values in 'category'
missing_categories = df['category'].isnull().sum()
print(f"Number of missing values in 'category': {missing_categories}")

# Drop rows where 'category' is NaN
df = df.dropna(subset=['category'])

# Function to validate embeddings
def is_valid_embedding(embedding):
    return (
        isinstance(embedding, list) and
        len(embedding) == 1536 and
        not any(pd.isnull(embedding)) and
        not any(np.isnan(embedding))
    )

# Filter out invalid embeddings
df = df[df['embedding'].apply(is_valid_embedding)]

# Encode the categories to numerical labels
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Calculate category counts
category_counts = df['category_encoded'].value_counts()

# Filter the DataFrame to keep only categories with more than one instance
df_filtered = df[df['category_encoded'].isin(category_counts[category_counts > 1].index)]


# Expand the embedding lists into separate columns
embedding_cols = pd.DataFrame(df_filtered['embedding'].tolist())
embedding_cols.columns = [f'embedding_{i}' for i in range(embedding_cols.shape[1])]


# Drop unnecessary columns
df_cleaned = df_filtered.drop(columns=['id', 'full_joke', 'embedding', 'category'])

# Combine the expanded embeddings with the rest of the DataFrame
df_final = pd.concat([embedding_cols, df_cleaned.reset_index(drop=True)], axis=1)


# Features (X) are the embeddings
X = df_final.drop(columns=['category_encoded'])

# Labels (y) are the category_encoded
y = df_final['category_encoded']

# Combine X and y to drop rows with any missing values
data = pd.concat([X, y.reset_index(drop=True)], axis=1)

# Drop rows with missing values
data = data.dropna()

# Separate X and y again
X = data.drop(columns=['category_encoded'])
y = data['category_encoded']

# Perform train-test split with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)


Number of missing values in 'category': 0


In [5]:
print(df['category_encoded'].value_counts())


category_encoded
371    16390
122    15802
381    11998
165     5674
340     4416
       ...  
434        1
8          1
48         1
7          1
214        1
Name: count, Length: 476, dtype: int64


### **3. Set Up the Model**

In [9]:
# Initialize the Random Forest classifier with warm_start=True to allow incremental training
rf_classifier = RandomForestClassifier(
    n_estimators=0,  # Start with 0 trees, we'll increment manually
    max_depth=None,
    random_state=42,
    n_jobs=-1,       # Use all available cores
    warm_start=True  # Allow incremental tree addition
)

n_total_trees = 100  # Total number of trees you want to train
n_increment = 10     # Number of trees to add in each increment

start_time = time.time()

# Fit the model incrementally and monitor progress
for i in tqdm(range(0, n_total_trees, n_increment), desc="Training Progress"):
    rf_classifier.n_estimators += n_increment  # Add more trees in increments
    rf_classifier.fit(X_train, y_train)        # Fit the new trees
    
    # Measure time so far
    elapsed_time = time.time() - start_time
    avg_time_per_tree = elapsed_time / (i + n_increment)
    remaining_trees = n_total_trees - (i + n_increment)
    remaining_time_estimate = avg_time_per_tree * remaining_trees
    
    print(f"Trained {i + n_increment} trees in {elapsed_time:.2f} seconds")
    print(f"Estimated time remaining: {remaining_time_estimate:.2f} seconds")

end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds")

Training Progress:  10%|█         | 1/10 [00:28<04:16, 28.46s/it]

Trained 10 trees in 28.47 seconds
Estimated time remaining: 256.19 seconds


Training Progress:  20%|██        | 2/10 [01:04<04:22, 32.78s/it]

Trained 20 trees in 64.27 seconds
Estimated time remaining: 257.08 seconds


Training Progress:  30%|███       | 3/10 [01:41<04:02, 34.71s/it]

Trained 30 trees in 101.27 seconds
Estimated time remaining: 236.30 seconds


Training Progress:  40%|████      | 4/10 [02:21<03:42, 37.03s/it]

Trained 40 trees in 141.87 seconds
Estimated time remaining: 212.81 seconds


Training Progress:  50%|█████     | 5/10 [03:05<03:16, 39.33s/it]

Trained 50 trees in 185.28 seconds
Estimated time remaining: 185.28 seconds


Training Progress:  60%|██████    | 6/10 [03:51<02:46, 41.70s/it]

Trained 60 trees in 231.57 seconds
Estimated time remaining: 154.38 seconds


Training Progress:  70%|███████   | 7/10 [04:34<02:06, 42.02s/it]

Trained 70 trees in 274.24 seconds
Estimated time remaining: 117.53 seconds


Training Progress:  80%|████████  | 8/10 [05:14<01:22, 41.39s/it]

Trained 80 trees in 314.30 seconds
Estimated time remaining: 78.58 seconds


Training Progress:  90%|█████████ | 9/10 [05:57<00:42, 42.08s/it]

Trained 90 trees in 357.88 seconds
Estimated time remaining: 39.76 seconds


Training Progress: 100%|██████████| 10/10 [06:36<00:00, 39.69s/it]

Trained 100 trees in 396.86 seconds
Estimated time remaining: 0.00 seconds
Training completed in 396.89 seconds





**Predict on the Validation Set**

In [11]:
y_pred = rf_classifier.predict(X_val)

**Classification Report**

In [13]:
# Find the unique labels in y_val
unique_labels = np.unique(y_val)

# Filter target names to match the unique labels in y_val
target_names = [label_encoder.inverse_transform([label])[0] for label in unique_labels]

# Generate the classification report
report = classification_report(
    y_val, y_pred, labels=unique_labels, target_names=target_names
)

print(report)


                          precision    recall  f1-score   support

                   Adult       0.65      0.25      0.36       141
            Adult (NSFW)       0.00      0.00      0.00         1
             Adult humor       0.00      0.00      0.00         1
                  Adult.       0.00      0.00      0.00         1
            Adult/Erotic       0.00      0.00      0.00         2
            Adult/Ethnic       0.00      0.00      0.00         3
             Adult/Humor       0.00      0.00      0.00         1
     Adult/Inappropriate       0.00      0.00      0.00         1
              Adult/Puns       0.00      0.00      0.00         1
            Adult/Sexual       0.00      0.00      0.00         1
                     Age       0.00      0.00      0.00         1
                 Alcohol       0.00      0.00      0.00         3
                  Animal       0.79      0.51      0.62       344
                     Bar       1.00      0.33      0.50         3
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
