In [46]:
from dotenv import load_dotenv
from openai import OpenAI, AsyncOpenAI
import lexiconia.system_messages as sm
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import asyncio
import time
import pickle
import json
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import string

In [None]:
# Set up the OpenAI client and define some helper functions.
load_dotenv()

# The client is built without an explicit key, so it
# defaults to os.environ.get("OPENAI_API_KEY").
# Otherwise use: OpenAI(api_key="Your_API_Key")
client = OpenAI()

def get_completion(comment, system_message, model_class='gpt-3.5', temperature=0, frequency_penalty=1, force_json:bool=True):
    """Request a single chat completion from the synchronous client and price it.

    Args:
        comment: Text sent as the user message.
        system_message: Text sent as the system message.
        model_class: 'gpt-4' or 'gpt-3.5'; selects the concrete model name and its
            prices. Any other value raises KeyError.
        temperature: Forwarded to the API unchanged.
        frequency_penalty: Forwarded to the API unchanged.
        force_json: When True the request asks for a JSON-object response format;
            when False, response_format is passed as None.

    Returns:
        A 3-tuple (content, tool_calls, cost) taken from the first choice's message,
        where cost is the input-token cost plus the output-token cost computed from
        the hard-coded per-1k-token prices below.
    """
    # model_class -> (model name, input price per 1k tokens, output price per 1k tokens)
    catalog = {
        'gpt-4': ('gpt-4-0125-preview', 0.01, 0.03),
        'gpt-3.5': ('gpt-3.5-turbo-0125', 0.0005, 0.0015),
    }
    model, input_price, output_price = catalog[model_class]

    response_format = { "type": "json_object" } if force_json else None
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": comment},
    ]
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        messages=conversation,
        response_format=response_format,
    )

    usage = completion.usage
    reply = completion.choices[0].message
    input_cost = input_price * usage.prompt_tokens / 1000
    output_cost = output_price * usage.completion_tokens / 1000
    return reply.content, reply.tool_calls, input_cost + output_cost

# Asynchronous client, constructed with no arguments; get_completion_async awaits its chat completions.
async_client = AsyncOpenAI()

async def get_completion_async(comment, system_message, model_class='gpt-3.5', temperature=0, frequency_penalty=1, force_json:bool=True):
    """Await a single chat completion from the asynchronous client and price it.

    Args:
        comment: Text sent as the user message.
        system_message: Text sent as the system message.
        model_class: 'gpt-4' or 'gpt-3.5'; selects the concrete model name and its
            prices. Any other value raises KeyError.
        temperature: Forwarded to the API unchanged.
        frequency_penalty: Forwarded to the API unchanged.
        force_json: When True the request asks for a JSON-object response format;
            when False, response_format is passed as None.

    Returns:
        A 3-tuple (content, tool_calls, cost) taken from the first choice's message,
        where cost is the input-token cost plus the output-token cost computed from
        the hard-coded per-1k-token prices below.
    """
    # model_class -> (model name, input price per 1k tokens, output price per 1k tokens)
    catalog = {
        'gpt-4': ('gpt-4-0125-preview', 0.01, 0.03),
        'gpt-3.5': ('gpt-3.5-turbo-0125', 0.0005, 0.0015),
    }
    model, input_price, output_price = catalog[model_class]

    response_format = { "type": "json_object" } if force_json else None
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": comment},
    ]
    completion = await async_client.chat.completions.create(
        model=model,
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        messages=conversation,
        response_format=response_format,
    )

    usage = completion.usage
    reply = completion.choices[0].message
    input_cost = input_price * usage.prompt_tokens / 1000
    output_cost = output_price * usage.completion_tokens / 1000
    return reply.content, reply.tool_calls, input_cost + output_cost

def convert_entry_to_dataframe(entry):
    """Turn one JSON dictionary entry into a dataframe with one row per definition.

    Args:
        entry: JSON text. A non-empty entry must provide the keys "definitions",
            "variants", "root_word" and "term".

    Returns:
        None when the parsed JSON is empty (falsy). Otherwise a DataFrame built from
        "definitions", with "term", "root_word" and "variants" (joined with ', ')
        as its first three columns, repeated on every row.
    """
    parsed = json.loads(entry)
    if not parsed:
        return None

    table = pd.DataFrame(parsed["definitions"])
    # Each column is pushed onto the front, so the last one inserted ends up leftmost.
    for column in ("variants", "root_word", "term"):
        value = parsed[column]
        if column == "variants":
            value = ', '.join(value)
        table.insert(0, column, value)
    return table

def get_chunks(lst, n):
    """Generate consecutive slices of lst, each n items long (the last may be shorter)."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Number of words to take from the word list.
# Use all 20k words to reproduce the published results.
n_words_to_use = 20_000

In [None]:
# Load the word list and keep only the first n_words_to_use entries
with open('20k.txt', 'r') as file:
    text = file.read()
words = text.split()[:n_words_to_use]
print(f'There are {len(words):,} words.')

In [None]:
# Test out getting defintions for a single word
# Here you can compare defintions provided by gpt-3.5 and gpt-4. gpt-3.5 provides satisfactory defintions (maybe even superior defintions because gpt-4 can get a little too creative/detailed)
test_word = 'tree'
model = 'gpt-3.5' # 'gpt-4'
entry, _, cost = await get_completion_async(comment=test_word, system_message=sm.samuel_johnson, frequency_penalty=0.1, model_class=model)
print(f'{model} entry (cost: ${cost:0.4f}):')
convert_entry_to_dataframe(entry)

In [None]:
# Get definitons for every word in the words list
# Each get takes awhile, but mulitple can be requested at once by using async operations, so long as we don't hit our rate limit.
# Your rate limit may differ depending on how much you spend on the OpenAI API per month (https://platform.openai.com/docs/guides/rate-limits/usage-tiers)

async def gather_results(my_list):
    # WARNING: Using gpt-4 here to get defintions for all 20k words could cost hundreds of dollars (and will take days)!
    # gpt-3.5 is much cheaper and its definitions are satisfactory
    tasks = [get_completion_async(comment=v, system_message=sm.samuel_johnson, frequency_penalty=0.1, model_class='gpt-3.5') for v in my_list]
    return await asyncio.gather(*tasks)

# batch_size=200 and sleep_time=20 was chosen to not hit the rate limit.
# If you hit your rate limit, either decrease batch_size or increase sleep_time
batch_size = 200
sleep_time = 20
results = []
for chunk in get_chunks(words, batch_size):
    print('getting another batch...')
    result = await gather_results(chunk)
    print(f'got a batch of {len(result)} results!')
    time.sleep(sleep_time)
    results += result

In [None]:
# Convert the JSON entries to dataframes and concatenate them into one df.
# Also sum up the total cost.
df_list = []
total_cost = 0
for entry, _, cost in results:
    total_cost += cost
    try:
        entry_df = convert_entry_to_dataframe(entry)
    except Exception as error:
        # Catch Exception (not a bare except) so that interrupting the kernel still stops the loop
        print(f'There was a failure for:\n{entry}')
        print(f'{type(error).__name__}: {error}')
        continue
    # convert_entry_to_dataframe returns None for an empty entry; leave those out
    if entry_df is not None:
        df_list.append(entry_df)

print(f'Total cost: ${total_cost:0.4f}:')
df = pd.concat(df_list).reset_index(drop=True)

print(f'There are {len(df.term.unique())} unique terms.')

In [None]:
# Get embeddings for each word-definition pair.
# Each embedded text has the form:
# <WORD>: <DEFINITION>
text_list = (
    df[['term', 'definition']]
    .apply(lambda pair: f'{pair.term}: {pair.definition}', axis='columns')
    .to_list()
)

chunk_size = 2048 # 2048 is max batch size of text-embedding-3-small model
embeddings = []
for chunk in get_chunks(text_list, chunk_size):
    response = client.embeddings.create(model="text-embedding-3-small", input=chunk)
    # Collect the vectors in the order the response lists them
    embeddings.extend(item.embedding for item in response.data)

df['embedding'] = embeddings

In [None]:
# Save the dataframe to a parquet file
df.to_parquet('semantics.parquet')

### Everything above here is time-consuming and expensive. If you have a precalculated dataset, just load it (next cell) and only modify the UMAP projection and clustering.

In [None]:
# Load a precalculated dataset. The dated file is the one read here; switch to the
# commented-out line to read 'semantics.parquet' instead.
df = pd.read_parquet('semantics_2024-02-07.parquet')
# df = pd.read_parquet('semantics.parquet')

In [None]:
# UMAP projection
# Projecting 41207 word definition embeddings takes approximately 30s on Macbook Air/M2 SoC.

reducer = umap.UMAP(random_state=42, min_dist=0.0, n_neighbors=50)
embedding = reducer.fit_transform(df['embedding'].to_list())

# Columns 0 and 1 of the projection become the two plot coordinates
df['umap_1'], df['umap_2'] = embedding[:, 0], embedding[:, 1]

In [None]:
# Save projection (the fitted reducer) to disk
with open('umap.pkl', 'wb') as pickle_file:
    pickle.dump(reducer, pickle_file)

In [None]:
# Load projection
# NOTE: unpickling can execute arbitrary code, so only load a umap.pkl that you trust.
with open('umap.pkl', 'rb') as pickle_file:
    reducer = pickle.load(pickle_file)

In [None]:
# Cluster the points, in the 2D umap projection space, using DBSCAN.
# These clusters will be the 'Lands'.
# The parameters of DBSCAN were chosen by hand to approximately maximize the number of clusters.
# DBSCAN does not necessarily cluster all points. It finds 'high density' groups of points
# where 'high' means at least min_samples points are within eps distance of each other.
# DBSCAN is deterministic so no random seed/state needs to be specified to get reproducible results.

# hand-tuned values to ~maximize number of clusters when n_words_to_use is 20k
eps = 0.07
if n_words_to_use == 20000:
    min_samples = 27
else: # pick your own values
    # Scale with the square root of the vocabulary size, but never drop below 1:
    # for a small n_words_to_use the truncated value would be 0, and DBSCAN expects min_samples >= 1.
    min_samples = max(1, int(27 * (n_words_to_use/20000)**0.5))

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
df['cluster'] = dbscan.fit_predict(df[['umap_1', 'umap_2']].to_numpy())
# Count only real clusters (label >= 0). Label -1 marks unclustered points and is not
# guaranteed to be present, so "number of unique labels minus one" can undercount.
n_lands = int(df.loc[df['cluster'] >= 0, 'cluster'].nunique())
print(f"There are {n_lands} 'Land' clusters.")

In [None]:
# Use kmeans to cluster the points, again in the 2D umap projection space, that were not clustered by dbscan (cluster ID was -1).
# These clusters will be the 'Seas'.
# To distinguish these sea clusters from land clusters, we give them negative cluster IDs: kmeans labels
# start at zero, so negate them and subtract one so the sea IDs start at -1.
# The number of clusters was picked by trial and error such that there is roughly one sea between each group of lands, so roughly n_lands/3
unclustered = df['cluster']==-1
n_unclustered = int(unclustered.sum())
# KMeans cannot produce more clusters than it has points, so cap n_seas by the number of unclustered points
n_seas = min(max(1, int(n_lands/3)), n_unclustered)
if n_seas > 0:
    kmeans = KMeans(n_clusters=n_seas, random_state=42)
    sea_labels = kmeans.fit_predict(df.loc[unclustered, ['umap_1', 'umap_2']].to_numpy())
    df.loc[unclustered, 'cluster'] = -sea_labels - 1
# When dbscan clustered every point there is nothing to fit, and no seas are created
print(f"There are {len(df[df['cluster']<0]['cluster'].unique())} 'Sea' clusters.")

In [None]:
# Convert clusters to categorical so plotly uses a qualitative colormap
df["cluster"] = df["cluster"].astype('category')

In [None]:
# Plot Land clusters
# Temporary hover-text column; it is removed again at the end of this cell
df['annotation'] = df[['term', 'definition']].apply(lambda pair: f'{pair.term}: {pair.definition}', axis='columns')

# Cycle through the qualitative palette, but paint every negative cluster ID grey
palette = px.colors.qualitative.Plotly
color_map = {
    cluster: 'grey' if cluster < 0 else palette[i % len(palette)]
    for i, cluster in enumerate(df.cluster.unique())
}

fig = px.scatter(
    df,
    x='umap_1',
    y='umap_2',
    color='cluster',
    color_discrete_map=color_map,
    hover_name='annotation',
    hover_data={'umap_1': False, 'umap_2': False},
    title='Land Clusters',
)

fig.update_traces(marker=dict(size=2.5))
fig.update_layout(showlegend=False, plot_bgcolor='white', width=900, height=700)

fig.show()
df.drop(columns='annotation', inplace=True)

In [None]:
# Plot Sea clusters
# Temporary hover-text column; it is removed again at the end of this cell
df['annotation'] = df[['term', 'definition']].apply(lambda pair: f'{pair.term}: {pair.definition}', axis='columns')

# Cycle through the qualitative palette, but paint every non-negative cluster ID grey
palette = px.colors.qualitative.Plotly
color_map = {
    cluster: 'grey' if cluster >= 0 else palette[i % len(palette)]
    for i, cluster in enumerate(df.cluster.unique())
}

fig = px.scatter(
    df,
    x='umap_1',
    y='umap_2',
    color='cluster',
    color_discrete_map=color_map,
    hover_name='annotation',
    hover_data={'umap_1': False, 'umap_2': False},
    title='Sea Clusters',
)

fig.update_traces(marker=dict(size=2.5))
fig.update_layout(showlegend=False, plot_bgcolor='white', width=900, height=700)

fig.show()
df.drop(columns='annotation', inplace=True)

In [None]:
# Get the topic of each cluster from gpt
# We will use gpt-3.5 here because it is faster and cheaper than gpt-4 and results are satisfactory
# This will take ~5 minutes because we are not using the async option.
def get_topics(group):
    """Label every row of one cluster with a topic obtained from gpt-3.5.

    Args:
        group: DataFrame holding the rows of a single cluster; must have a 'term' column.

    Returns:
        The same DataFrame object with a 'cluster_topic' column added, set to the
        text returned for the comma-separated list of the group's unique terms.
    """
    terms = ', '.join(group['term'].unique())
    topic, _, _ = get_completion(comment=terms, system_message=sm.identify_topic, force_json=False, model_class='gpt-3.5')
    group['cluster_topic'] = topic
    return group

# Iterate over the groups explicitly instead of calling groupby(...).apply(get_topics).
# Iteration always yields the full sub-frame including the 'cluster' column, whereas
# apply's handling of grouping columns is deprecated in newer pandas. The rows come
# out grouped by cluster, in cluster order, with a fresh 0..n-1 index.
df = pd.concat(
    [get_topics(group.copy()) for _, group in df.groupby('cluster', observed=True)]
).reset_index(drop=True)

In [None]:
# Get scores for the emotional valence, physicality, and humanity (see 'words_valence' prompt for definitions of these terms) of each cluster from gpt
# We will use gpt-4 here because gpt-3.5 results were not satisfactory
# This will take ~3 minutes because we are not using the async option.
def get_scores(group):
    """Attach 'valence', 'physicality' and 'humanity' scores to every row of one cluster.

    Land clusters (cluster ID >= 0) are scored by gpt-4 from the group's unique terms:
      * a reply that is not valid JSON, is not a JSON object, or contains keys other
        than the three expected ones sets all three scores to -1;
      * a reply that lacks an expected key, or carries a list/object as a score,
        sets all three scores to 3.
    Sea clusters (cluster ID < 0) are not sent to gpt and always get 3 for all scores.

    Args:
        group: DataFrame holding the rows of a single cluster; must have 'term',
            'cluster' and 'cluster_topic' columns.

    Returns:
        The same DataFrame object with the three score columns added.
    """
    expected_keys = {'valence', 'physicality', 'humanity'}
    neutral = {'valence': 3, 'physicality': 3, 'humanity': 3}

    terms = ', '.join(group['term'].unique())
    if group['cluster'].iloc[0] >= 0:
        score, _, _ = get_completion(comment=terms, system_message=sm.words_valence, force_json=True, model_class='gpt-4')
        try:
            score = json.loads(score)
            # Explicit check instead of assert, which is skipped when Python runs with -O
            if not isinstance(score, dict) or not set(score).issubset(expected_keys):
                raise ValueError('unexpected score structure')
        except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
            print(f"gpt gave us a bad score json for the topic {group['cluster_topic'].to_list()[0]}. It gave us:")
            print(score)
            print('The scores for this topic will all be set to -1.')
            score = {'valence': -1, 'physicality': -1, 'humanity': -1}
        else:
            # gpt can fail to return json with all the expected keys (or with usable values).
            # Decide before touching the group so no column is ever left half-assigned.
            complete = expected_keys.issubset(score) and not any(
                isinstance(value, (list, dict)) for value in score.values()
            )
            if not complete:
                print(f"gpt gave us incomplete scores for the topic {group['cluster_topic'].to_list()[0]}: {score}")
                print('The scores for this topic will all be set to 3.')
                score = neutral
    else: # we won't get valences for the seas
        score = neutral

    group['valence'] = score['valence']
    group['physicality'] = score['physicality']
    group['humanity'] = score['humanity']
    return group

# Iterate over the groups explicitly instead of calling groupby(...).apply(get_scores).
# Iteration always yields the full sub-frame including the 'cluster' column (which
# get_scores reads), whereas apply's handling of grouping columns is deprecated in
# newer pandas. The rows come out grouped by cluster with a fresh 0..n-1 index.
df = pd.concat(
    [get_scores(group.copy()) for _, group in df.groupby('cluster', observed=True)]
).reset_index(drop=True)

In [None]:
# Get a fun name for the Land and Sea clusters from gpt.
# This will take ~3 minutes because we are not using the async option.
def _name_territories(topics, system_message):
    """Ask gpt-3.5 for one name per topic and return a {topic: name} dict."""
    names = {}
    for topic in topics:
        name, _, _ = get_completion(comment=topic, system_message=system_message, force_json=False, temperature=0.2, model_class='gpt-3.5')
        names[topic] = name
    return names

# Lands have non-negative cluster IDs
is_land = df.cluster.astype(int)>=0
land_topics = df.loc[is_land, 'cluster_topic'].unique()
topic_to_land_name = _name_territories(land_topics, sm.land_name)

# Seas have negative cluster IDs
is_sea = df.cluster.astype(int)<0
sea_topics = df.loc[is_sea, 'cluster_topic'].unique()
topic_to_sea_name = _name_territories(sea_topics, sm.sea_name)

for mask, names in ((is_land, topic_to_land_name), (is_sea, topic_to_sea_name)):
    df.loc[mask, 'territory_name'] = df.loc[mask, 'cluster_topic'].map(names)

In [None]:
# Optionally rename a cluster's territory name by hand here
# df.loc[df.cluster==147, 'territory_name'] = 'The Land of Crisis'

In [49]:
# Finally, filter out NSFW words
with open('dirty-words-en.txt', 'r') as file:
    text = file.read()
# One entry per line. Strip surrounding whitespace and drop blank lines: a trailing
# newline would otherwise add an empty-string entry, which matches any token that
# reduces to '' once its punctuation is stripped. A set gives fast membership tests.
dirty_words = {line.strip() for line in text.split('\n') if line.strip()}

def is_dirty(word):
    """Report whether word, or word minus a trailing 's', is in dirty_words.

    The word is lower-cased and stripped of leading/trailing punctuation first.
    A token with nothing left after that (e.g. '-' or '...'), or one that is only
    's', is never reported as dirty, even if dirty_words contains an empty entry.
    """
    word = word.lower().strip(string.punctuation)
    if not word:
        return False
    if word in dirty_words:
        return True
    word_no_s = word[:-1] if word.endswith('s') else word # make the word singular
    # Guard against the empty string left over when the word was just 's'
    return bool(word_no_s) and word_no_s in dirty_words

def replace_vowels_with_star(word):
    """Return word with each of a, e, i, o, u, y (either case) replaced by '*'."""
    masked = []
    for character in word:
        masked.append('*' if character in 'aeiouyAEIOUY' else character)
    return ''.join(masked)

def replace_center_letters(word):
    """Mask everything between the first and last character of word with '*'.

    Words of two characters or fewer have no center and are returned unchanged
    (previously a one-character word came back doubled and '' raised IndexError).
    """
    if len(word) <= 2:
        return word
    return word[0] + '*' * (len(word)-2) + word[-1]

def clean_word(word):
    """Return (word, False) for a clean word, or (masked word, True) for a dirty one.

    Dirtiness is decided by is_dirty; masking is done by replace_center_letters.
    """
    if not is_dirty(word):
        return word, False
    return replace_center_letters(word), True

def make_sfw(row):
    """Build safe-for-work versions of a row's term and definition.

    Args:
        row: Object with string attributes `term` and `definition`.

    Returns:
        (term_is_nsfw, definition_is_nsfw, sfw_term, sfw_definition). The definition
        is split on whitespace, each word is cleaned with clean_word, and the words
        are re-joined with single spaces.
    """
    sfw_word, word_is_nsfw = clean_word(row.term)
    # Clean each word once and keep its flag
    cleaned = [clean_word(word) for word in row.definition.split()]
    sfw_definition = ' '.join(sfw for sfw, _ in cleaned)
    # Take the flag from the per-word results. Comparing the joined text with the
    # original would also flag definitions whose only difference is whitespace
    # (double spaces, newlines, leading/trailing blanks) normalised by split/join.
    definition_is_nsfw = any(flagged for _, flagged in cleaned)
    return word_is_nsfw, definition_is_nsfw, sfw_word, sfw_definition

# make_sfw returns four values per row; result_type='expand' spreads them over the four new columns
df[['term_is_nsfw', 'definition_is_nsfw', 'sfw_term', 'sfw_definition']] = (
    df[['term', 'definition']]
    .apply(make_sfw, axis=1, result_type='expand')
)

In [50]:
# Store the cluster ID and the three scores as categoricals, then write the final dataset
for column in ("cluster", "valence", "physicality", "humanity"):
    df[column] = df[column].astype('category')

df.to_parquet('semantics.parquet')