In [1]:
# Load the raw vocabulary: every line of words.txt becomes one candidate entry.
with open("words.txt") as source_file:
    raw_text = source_file.read()
words = raw_text.splitlines()

In [8]:
# Keep only clean single-token entries: drop empty lines and anything that
# contains a space, slash, comma, newline or period.
new_ls = [
    entry
    for entry in words
    if entry and not any(sep in entry for sep in (" ", "/", ",", "\n", "."))
]

In [7]:
# Spot-check a single entry of the filtered list.
# NOTE(review): this cell carries execution count 7 while the filter cell carries 8,
# so the '' shown as output likely predates the empty-string check in the filter —
# re-run after the filter cell to confirm.
new_ls[24]

''

In [None]:
# Persist the filtered words as a single comma-separated line.
# str.join places separators only *between* words; writing `word + ","` per
# word left a trailing comma, which turned into an empty entry as soon as the
# file was split on "," again.
with open("word_list.txt","w") as f:
    f.write(",".join(new_ls))

In [5]:
# Reload the comma-separated word list from disk.
with open("word_list.txt","r") as f:
    content = f.read()
# Drop blank tokens (e.g. the one produced by a trailing comma or an empty
# file) so that no lookup is ever issued for an empty word.
request_ls = [token for token in content.split(",") if token.strip()]

In [6]:
import requests
import time
from tqdm import tqdm
# One entry per requested word; each entry is the list returned by get_word_meanings().
meaning_ls = []
def get_word_meanings(words: str, timeout: float = 10.0) -> list:
    """Look up English definitions for a comma-separated string of words.

    Every word is queried against the dictionaryapi.dev endpoint and rendered
    as one text block of the form::

        <word>
        <phonetic>
        <definition 1>; <definition 2>; ...

    Args:
        words: One or more words separated by commas. Whitespace around each
            word is ignored.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        A list with one string per comma-separated token, in input order.
        Tokens that are blank, that the API answers with a non-200 status, or
        whose payload holds no usable entry are reported as
        ``"<word> - not found"``.

    Raises:
        requests.RequestException: On network failures or timeouts.
        ValueError: If a 200 response body is not valid JSON.
    """
    from urllib.parse import quote

    dictionary_api = "https://api.dictionaryapi.dev/api/v2/entries/en/"
    words_meanings = []

    for word in words.split(","):
        word = word.strip()

        # A blank token would query the bare endpoint; report it without a request.
        if not word:
            words_meanings.append(f"{word} - not found")
            continue

        # Percent-encode the word so characters such as "?", "#" or "%" cannot
        # alter the request path.
        response = requests.get(dictionary_api + quote(word, safe=""), timeout=timeout)
        if response.status_code != 200:
            words_meanings.append(f"{word} - not found")
            continue

        data = response.json()

        # Only the first entry is used; guard against an empty or unexpected payload.
        if not isinstance(data, list) or not data or not isinstance(data[0], dict):
            words_meanings.append(f"{word} - not found")
            continue
        entry = data[0]

        # format: word \n phonetic \n "; ".join of all definitions (no trailing separator)
        definitions = [
            definition.get("definition", "")
            for meaning in entry.get("meanings", [])
            for definition in meaning.get("definitions", [])
        ]
        text = f"{entry.get('word', '')}\n{entry.get('phonetic', '')}\n" + "; ".join(definitions)

        words_meanings.append(text.strip())

    return words_meanings

# Fetch the meaning of every word and append it to meanings.txt as soon as it
# arrives. The write used to sit after the loop, so only the last word's text
# was ever stored (and an empty request_ls raised NameError on `meaning`).
# NOTE: the file is opened in append mode, so re-running this cell adds to any
# existing content rather than replacing it.
with open("meanings.txt","a",encoding="utf-8") as f:
    for word in tqdm(request_ls):
        meaning = get_word_meanings(word)
        meaning_ls.append(meaning)
        f.write(meaning[0] + "\n")
        time.sleep(0.5)  # to avoid hitting the API rate limit
    

100%|██████████| 422/422 [04:02<00:00,  1.74it/s]


In [11]:
# Keep only successful lookups. Failed lookups are exactly the strings ending
# in the " - not found" marker; a plain substring test would also discard real
# entries whose definition text happens to contain the phrase "not found".
# NOTE: this rebinds new_ls, which earlier held the filtered word list.
new_ls = [i[0] for i in meaning_ls if not i[0].endswith(" - not found")]

In [14]:
# Keep only successful lookups. Failed lookups are exactly the strings ending
# in the " - not found" marker; a plain substring test would also discard real
# entries whose definition text happens to contain the phrase "not found".
new_ls = [i[0] for i in meaning_ls if not i[0].endswith(" - not found")]

# Overwrite meanings.txt with the surviving entries, each followed by a
# "----" separator line.
with open("meanings.txt","w",encoding="utf-8") as f:
    for meaning in new_ls:
        f.write(meaning + "\n----\n")

In [16]:
import numpy as np
import re
import json
from tqdm import tqdm

# Get embeddings for all texts in new_ls with batching
def get_embeddings_batch(texts, batch_size=16, auth_token="", timeout=120):
    """Get embeddings for a batch of texts using the Replicate proxy.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Forwarded to the model as its ``batch_size`` input.
        auth_token: Bearer token placed in the ``Authorization`` header.
            Defaults to an empty string; pass the real token from the caller
            (e.g. read from an environment variable) instead of hard-coding it.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``np.array`` built from the response's ``output`` field, or ``None``
        when the proxy answers with a non-200 status or the response carries
        no ``output``.

    Raises:
        requests.RequestException: On network failures or timeouts.
    """
    url = "https://itp-ima-replicate-proxy.web.app/api/create_n_get"
    model_name = "beautyyuyanli/multilingual-e5-large:a06276a89f1a902d5fc225a9ca32b6e8e6292b7f3b136518878da97c458e2bad"

    # Clean texts - remove brackets, quotes, backslashes.
    # Kept so the text that gets embedded is the same as before.
    cleaned_texts = [re.sub(r"[\[\]\\\'\"]", '', text) for text in texts]

    data = {
        "version": model_name,
        "input": {
            # The list is sent as a JSON-encoded string. json.dumps escapes
            # control and non-printable characters correctly, whereas the
            # previous str(list) + quote replacement emitted Python-only
            # escapes such as \xa0 that are not valid JSON.
            "texts": json.dumps(cleaned_texts, ensure_ascii=False),
            "batch_size": batch_size,
            "normalize_embeddings": False
        }
    }

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {auth_token}"
    }

    # A timeout prevents one stalled request from blocking the whole run.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    payload = response.json()
    output = payload.get('output') if isinstance(payload, dict) else None
    if output is None:
        # A 200 response without embeddings is still a failure for the caller.
        print(f"Error: response contained no 'output' field - {response.text}")
        return None

    return np.array(output)

# Embed every text in new_ls, one batch at a time
print(f"Processing {len(new_ls)} texts...")
batch_size = 16
all_embeddings = []
failed_batches = []  # 1-based numbers of batches that yielded no embeddings

# Split into batches
for i in tqdm(range(0, len(new_ls), batch_size), desc="Processing batches"):
    batch_texts = new_ls[i:i + batch_size]
    batch_number = i // batch_size + 1

    try:
        batch_embeddings = get_embeddings_batch(batch_texts, batch_size)

        if batch_embeddings is not None:
            all_embeddings.extend(batch_embeddings)
            print(f"Processed batch {batch_number}, got {len(batch_embeddings)} embeddings")
        else:
            print(f"Failed to get embeddings for batch {batch_number}")
            failed_batches.append(batch_number)

    except Exception as e:
        # Best effort: keep going so one bad batch does not lose the others.
        print(f"Error processing batch {batch_number}: {e}")
        failed_batches.append(batch_number)

    # Sleep to avoid rate limiting
    time.sleep(1.0)

print(f"Total embeddings collected: {len(all_embeddings)}")

if failed_batches:
    # Skipped batches shift every later row, so row k of the embeddings no
    # longer corresponds to new_ls[k]. Make that visible instead of silent.
    print(
        f"WARNING: batches {failed_batches} failed; "
        "embedding rows are no longer aligned with new_ls"
    )

# Convert to numpy array
if all_embeddings:
    embeddings = np.array(all_embeddings)
    print(f"Embeddings shape: {embeddings.shape}")

    # L2 normalization (an all-zero row stays zero instead of becoming NaN)
    row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    l2_normalized_embeddings = embeddings / np.where(row_norms == 0, 1.0, row_norms)

    # MinMax normalization (a constant column maps to 0 instead of NaN/inf)
    col_min = embeddings.min(axis=0)
    col_range = embeddings.max(axis=0) - col_min
    minmax_normalized_embeddings = (embeddings - col_min) / np.where(col_range == 0, 1.0, col_range)

    print("Embeddings processed successfully!")
    print(f"L2 normalized shape: {l2_normalized_embeddings.shape}")
    print(f"MinMax normalized shape: {minmax_normalized_embeddings.shape}")
else:
    print("No embeddings were successfully collected.")

Processing 413 texts...


Processing batches:   0%|          | 0/26 [00:00<?, ?it/s]

Processed batch 1, got 16 embeddings


Processing batches:   4%|▍         | 1/26 [00:05<02:23,  5.75s/it]

Processed batch 2, got 16 embeddings


Processing batches:   8%|▊         | 2/26 [00:08<01:29,  3.72s/it]

Processed batch 3, got 16 embeddings


Processing batches:  12%|█▏        | 3/26 [00:10<01:09,  3.02s/it]

Processed batch 4, got 16 embeddings


Processing batches:  15%|█▌        | 4/26 [00:12<01:00,  2.77s/it]

Processed batch 5, got 16 embeddings


Processing batches:  19%|█▉        | 5/26 [00:15<00:55,  2.63s/it]

Processed batch 6, got 16 embeddings


Processing batches:  23%|██▎       | 6/26 [00:17<00:50,  2.53s/it]

Processed batch 7, got 16 embeddings


Processing batches:  27%|██▋       | 7/26 [00:19<00:46,  2.42s/it]

Processed batch 8, got 16 embeddings


Processing batches:  31%|███       | 8/26 [00:22<00:44,  2.49s/it]

Processed batch 9, got 16 embeddings


Processing batches:  35%|███▍      | 9/26 [00:24<00:41,  2.42s/it]

Processed batch 10, got 16 embeddings


Processing batches:  38%|███▊      | 10/26 [00:26<00:37,  2.37s/it]

Processed batch 11, got 16 embeddings


Processing batches:  42%|████▏     | 11/26 [00:29<00:35,  2.39s/it]

Processed batch 12, got 16 embeddings


Processing batches:  46%|████▌     | 12/26 [00:31<00:33,  2.37s/it]

Processed batch 13, got 16 embeddings


Processing batches:  50%|█████     | 13/26 [00:33<00:30,  2.35s/it]

Processed batch 14, got 16 embeddings


Processing batches:  54%|█████▍    | 14/26 [00:36<00:28,  2.34s/it]

Processed batch 15, got 16 embeddings


Processing batches:  58%|█████▊    | 15/26 [00:38<00:25,  2.35s/it]

Processed batch 16, got 16 embeddings


Processing batches:  62%|██████▏   | 16/26 [00:40<00:24,  2.41s/it]

Processed batch 17, got 16 embeddings


Processing batches:  65%|██████▌   | 17/26 [00:43<00:21,  2.38s/it]

Processed batch 18, got 16 embeddings


Processing batches:  69%|██████▉   | 18/26 [00:45<00:19,  2.39s/it]

Processed batch 19, got 16 embeddings


Processing batches:  73%|███████▎  | 19/26 [00:48<00:16,  2.40s/it]

Processed batch 20, got 16 embeddings


Processing batches:  77%|███████▋  | 20/26 [00:50<00:14,  2.41s/it]

Processed batch 21, got 16 embeddings


Processing batches:  81%|████████  | 21/26 [00:52<00:11,  2.39s/it]

Processed batch 22, got 16 embeddings


Processing batches:  85%|████████▍ | 22/26 [00:55<00:09,  2.36s/it]

Processed batch 23, got 16 embeddings


Processing batches:  88%|████████▊ | 23/26 [00:57<00:06,  2.33s/it]

Processed batch 24, got 16 embeddings


Processing batches:  92%|█████████▏| 24/26 [00:59<00:04,  2.35s/it]

Processed batch 25, got 16 embeddings


Processing batches:  96%|█████████▌| 25/26 [01:02<00:02,  2.32s/it]

Processed batch 26, got 13 embeddings


Processing batches: 100%|██████████| 26/26 [01:04<00:00,  2.48s/it]

Total embeddings collected: 413
Embeddings shape: (413, 1024)
Embeddings processed successfully!
L2 normalized shape: (413, 1024)
MinMax normalized shape: (413, 1024)





In [19]:
# Export embeddings to JavaScript format
import json

# Option 1: Create a JavaScript file with a variable (Recommended)
def export_to_js_file(embeddings, filename="embeddings.js", var_name="embeddings"):
    """Write the embeddings to ``filename`` as a JavaScript ``const`` declaration.

    The generated file starts with two header comments (including the array
    shape), declares ``var_name`` as the nested-list form of ``embeddings``,
    and ends with a guarded ``module.exports`` assignment. The target path and
    the size of the generated text in KB are printed afterwards.

    Args:
        embeddings: Array-like object providing ``.tolist()`` and ``.shape``.
        filename: Path of the ``.js`` file to create or overwrite.
        var_name: Name of the JavaScript constant.
    """
    serialized = json.dumps(embeddings.tolist())

    source_lines = [
        "// Generated embeddings data",
        f"// Shape: {embeddings.shape}",
        f"const {var_name} = {serialized};",
        "",
        "// Export for use in other files",
        "if (typeof module !== 'undefined' && module.exports) {",
        f"    module.exports = {var_name};",
        "}",
    ]
    js_content = "\n".join(source_lines) + "\n"

    with open(filename, 'w') as js_file:
        js_file.write(js_content)

    size_kb = len(js_content) / 1024
    print(f"Embeddings exported to {filename}")
    print(f"File size: {size_kb:.1f} KB")

# Option 2: Create JSON file (for fetch/import)
def export_to_json(embeddings, filename="embeddings.json"):
    """Dump the embeddings plus basic shape metadata to a compact JSON file.

    The written object has the keys ``embeddings`` (nested lists), ``shape``,
    ``length`` (``len(embeddings)``) and ``embedding_dim`` (second dimension,
    or 0 when the array has fewer than two dimensions).

    Args:
        embeddings: Array-like object providing ``.tolist()`` and ``.shape``.
        filename: Path of the JSON file to create or overwrite.
    """
    shape = embeddings.shape
    if len(shape) > 1:
        embedding_dim = shape[1]
    else:
        embedding_dim = 0

    payload = {
        "embeddings": embeddings.tolist(),
        "shape": shape,
        "length": len(embeddings),
        "embedding_dim": embedding_dim,
    }

    # No whitespace after separators keeps the file as small as possible.
    with open(filename, 'w') as json_file:
        json.dump(payload, json_file, separators=(',', ':'))

    print(f"Embeddings exported to {filename}")

# Option 3: Create HTML script tag (for direct inclusion)
def export_to_html_script(embeddings, filename="embeddings_script.html", var_name="embeddings"):
    """Write the embeddings as an inline ``<script>`` snippet.

    The snippet declares ``var_name`` as a JavaScript constant holding the
    nested-list form of ``embeddings`` and logs its length to the console.

    Args:
        embeddings: Array-like object providing ``.tolist()``.
        filename: Path of the HTML fragment to create or overwrite.
        var_name: Name of the JavaScript constant.
    """
    serialized = json.dumps(embeddings.tolist())

    fragment_lines = (
        "<!-- Include this in your HTML file -->",
        "<script>",
        f"const {var_name} = {serialized};",
        f"console.log('Loaded embeddings:', {var_name}.length);",
        "</script>",
    )
    html_content = "".join(line + "\n" for line in fragment_lines)

    with open(filename, 'w') as html_file:
        html_file.write(html_content)

    print(f"HTML script exported to {filename}")

# Export using all methods if embeddings exist
# Run the exports only when an `embeddings` array is actually available.
if 'embeddings' not in locals() or embeddings is None:
    print("❌ No embeddings found. Run the previous cell first.")
else:
    print("Exporting embeddings to JavaScript formats...")

    # Stand-alone JavaScript file (most flexible).
    export_to_js_file(embeddings, filename="embeddings.js", var_name="wordEmbeddings")

    # The JSON export (export_to_json, for loading via fetch) is not enabled here.

    # HTML <script> fragment for direct inclusion in a page.
    export_to_html_script(embeddings, filename="embeddings_script.html", var_name="wordEmbeddings")

    print("\n✅ All exports completed!")
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Total data points: {embeddings.size}")

Exporting embeddings to JavaScript formats...
Embeddings exported to embeddings.js
File size: 9148.6 KB
HTML script exported to embeddings_script.html

✅ All exports completed!
Embeddings shape: (413, 1024)
Total data points: 422912
HTML script exported to embeddings_script.html

✅ All exports completed!
Embeddings shape: (413, 1024)
Total data points: 422912
