In [None]:
from dotenv import load_dotenv
from openai import OpenAI, AsyncOpenAI
import chat_system_messages as sm
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import asyncio
import time
import pickle
import json
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import random

In [None]:

# Load environment variables from a local .env file (if one exists) before the client is created.
load_dotenv()

# Synchronous OpenAI client; used by get_completion and by the embeddings request further down.
client = OpenAI(
    # Defaults to os.environ.get("OPENAI_API_KEY")
    # Otherwise use: api_key="Your_API_Key",
)

def get_completion(comment, system_message, model_class='gpt-3.5', temperature=0, frequency_penalty=1, force_json:bool=True):
    """Request one chat completion from the synchronous client and estimate its cost.

    Parameters
    ----------
    comment : str
        Text sent as the user message.
    system_message : str
        Text sent as the system message.
    model_class : str
        'gpt-4' or 'gpt-3.5'; looked up in the model table below. Any other
        value raises KeyError.
    temperature, frequency_penalty
        Forwarded unchanged to the chat completions request.
    force_json : bool
        When True the request asks for a JSON-object response format. When
        False the response_format argument is left out of the request entirely
        (rather than being sent as None).

    Returns
    -------
    tuple
        (content, tool_calls, cost) where content and tool_calls come from the
        first choice's message and cost is computed from the hard-coded
        per-1k-token prices and the token usage reported with the completion.
    """
    models = {'gpt-4': 'gpt-4-0125-preview',
              'gpt-3.5': 'gpt-3.5-turbo-0125'}

    input_cost_per_1k_tokens = {'gpt-4-0125-preview': 0.01,
                                'gpt-3.5-turbo-0125': 0.0005}
    output_cost_per_1k_tokens = {'gpt-4-0125-preview': 0.03,
                                 'gpt-3.5-turbo-0125': 0.0015}

    model = models[model_class]
    input_price = input_cost_per_1k_tokens[model]
    output_price = output_cost_per_1k_tokens[model]

    request = {
        'model': model,
        'temperature': temperature,
        'frequency_penalty': frequency_penalty,
        'messages': [
            {"role": "system", "content": system_message},
            {"role": "user", "content": comment},
        ],
    }
    # Only send response_format when JSON is actually requested; omitting the
    # key lets the client fall back to its own default instead of receiving None.
    if force_json:
        request['response_format'] = {"type": "json_object"}

    completion = client.chat.completions.create(**request)

    input_cost = input_price * completion.usage.prompt_tokens / 1000
    output_cost = output_price * completion.usage.completion_tokens / 1000
    message = completion.choices[0].message
    return message.content, message.tool_calls, input_cost + output_cost

# Asynchronous OpenAI client used by get_completion_async; created with no explicit arguments.
async_client = AsyncOpenAI()

async def get_completion_async(comment, system_message, model_class='gpt-3.5', temperature=0, frequency_penalty=1, force_json:bool=True):
    """Await one chat completion from the asynchronous client and estimate its cost.

    Parameters
    ----------
    comment : str
        Text sent as the user message.
    system_message : str
        Text sent as the system message.
    model_class : str
        'gpt-4' or 'gpt-3.5'; looked up in the model table below. Any other
        value raises KeyError.
    temperature, frequency_penalty
        Forwarded unchanged to the chat completions request.
    force_json : bool
        When True the request asks for a JSON-object response format. When
        False the response_format argument is left out of the request entirely
        (rather than being sent as None).

    Returns
    -------
    tuple
        (content, tool_calls, cost) where content and tool_calls come from the
        first choice's message and cost is computed from the hard-coded
        per-1k-token prices and the token usage reported with the completion.
    """
    models = {'gpt-4': 'gpt-4-0125-preview',
              'gpt-3.5': 'gpt-3.5-turbo-0125'}

    input_cost_per_1k_tokens = {'gpt-4-0125-preview': 0.01,
                                'gpt-3.5-turbo-0125': 0.0005}
    output_cost_per_1k_tokens = {'gpt-4-0125-preview': 0.03,
                                 'gpt-3.5-turbo-0125': 0.0015}

    model = models[model_class]
    input_price = input_cost_per_1k_tokens[model]
    output_price = output_cost_per_1k_tokens[model]

    request = {
        'model': model,
        'temperature': temperature,
        'frequency_penalty': frequency_penalty,
        'messages': [
            {"role": "system", "content": system_message},
            {"role": "user", "content": comment},
        ],
    }
    # Only send response_format when JSON is actually requested; omitting the
    # key lets the client fall back to its own default instead of receiving None.
    if force_json:
        request['response_format'] = {"type": "json_object"}

    completion = await async_client.chat.completions.create(**request)

    input_cost = input_price * completion.usage.prompt_tokens / 1000
    output_cost = output_price * completion.usage.completion_tokens / 1000
    message = completion.choices[0].message
    return message.content, message.tool_calls, input_cost + output_cost

def convert_entry_to_dataframe(entry):
    """Turn one JSON dictionary entry into a dataframe with one row per definition.

    The JSON object is expected to carry "term", "root_word", "variants" and
    "definitions" keys. The definitions become the rows; term, root_word and the
    comma-joined variants are repeated on every row as the leading columns.
    Returns None when the decoded JSON is empty.
    """
    parsed = json.loads(entry)
    if not parsed:
        return None

    frame = pd.DataFrame(parsed["definitions"])
    # Every insert targets position 0, so the column inserted last ends up first:
    # the final column order is term, root_word, variants, <definition fields>.
    for column in ("variants", "root_word", "term"):
        value = parsed[column]
        if column == "variants":
            value = ', '.join(value)
        frame.insert(0, column, value)
    return frame

def get_chunks(lst, n):
    """Generate consecutive slices of lst, each holding at most n items."""
    chunk_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in chunk_starts)

In [None]:
# Read the word list from disk and split it on whitespace into individual words.
with open('../google-10000-english-master/20k.txt', mode='r') as word_file:
    text = word_file.read()
words = text.split()
word_count = len(words)
print(f'There are {word_count:,} words.')

In [None]:
test_word = 'bowl'
model = 'gpt-3.5'
entry, _, cost = await get_completion_async(comment=test_word, system_message=sm.samuel_johnson_2, frequency_penalty=0.1, model_class=model)
print(f'{model} entry (cost: ${cost:0.4f}):')
convert_entry_to_dataframe(entry)

In [None]:
# Get definitons for every word in the words list
# Each get takes awhile, but mulitple can be requested at once, so long as we don't hit the rate limit
async def gather_results(my_list):
    tasks = [get_completion_async(comment=v, system_message=sm.samuel_johnson_2, frequency_penalty=0.1) for v in my_list]
    return await asyncio.gather(*tasks)

batch_size = 200
results = []
for chunk in get_chunks(words, batch_size):
    print('getting another batch...')
    result = await gather_results(chunk)
    print(f'got a batch of {len(result)} results!')
    time.sleep(20)
    results += result

In [None]:
# Convert JSON entries to dataframes and concatenate them into one df.
# Also sum up the total cost.
df_list = []
total_cost = 0
for result in results:
    entry, _, cost = result
    total_cost += cost
    try:
        entry_df = convert_entry_to_dataframe(entry)
    except Exception:
        # Best effort: report the entry that could not be converted and move on.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f'There was a failure for:\n{entry}')
        continue
    # convert_entry_to_dataframe returns None for an empty entry; leave those out explicitly.
    if entry_df is not None:
        df_list.append(entry_df)

print(f'total cost: ${total_cost:0.4f}:')
df = pd.concat(df_list).reset_index(drop=True)

print(f'There are {len(df.term.unique())} unique terms.')

In [None]:
# Build one "term: definition" string per row; these strings are the embedding inputs.
term_definition_rows = df[['term','definition']]
text_list = term_definition_rows.apply(lambda row: f'{row.term}: {row.definition}', axis=1).to_list()

chunk_size = 2048 #2048 is max batch size of text-embedding-3-small model
embeddings = []
# Request embeddings one chunk at a time and collect them in row order.
for chunk in get_chunks(text_list, chunk_size):
    response = client.embeddings.create(input=chunk, model="text-embedding-3-small")
    embeddings.extend([item.embedding for item in response.data])

df['embedding'] = embeddings

### Everything above here is time-consuming and expensive. If you have a precalculated dataset, just load it below and go straight to modifying the UMAP projection and clustering.

In [None]:
# Load a previously saved dataset and store its label columns as pandas categoricals.
df = pd.read_parquet('../semantics_2024-02-07.parquet')
for column in ("cluster", "valence", "physicality", "humanity"):
    df[column] = df[column].astype('category')

In [None]:
# UMAP projection
# random.seed only seeds Python's `random` module. UMAP takes its seed through the
# random_state argument, so it is passed explicitly to make the projection repeatable.
random.seed(42)
reducer = umap.UMAP(n_neighbors=50, min_dist=0.0, random_state=42)
embedding = reducer.fit_transform(df['embedding'].to_list())

# Keep the two projected coordinates as separate columns for clustering and plotting.
df['umap_1'] = embedding[:, 0]
df['umap_2'] = embedding[:, 1]

In [None]:
# Save the fitted UMAP reducer to disk so it can be loaded again later.
with open('../data/umap.pkl', mode='wb') as umap_file:
    pickle.dump(reducer, umap_file)

In [None]:
# Load the previously saved UMAP reducer.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created yourself.
with open('../data/umap.pkl', mode='rb') as umap_file:
    reducer = pickle.load(umap_file)

In [None]:
# Run DBSCAN on the two UMAP coordinates. The resulting clusters will be the Lands.
random.seed(42)
dbscan = DBSCAN(eps=0.08, min_samples=30)  # Adjust eps and min_samples as needed
umap_coordinates = df[['umap_1', 'umap_2']].to_numpy()
df['cluster'] = dbscan.fit_predict(umap_coordinates)

# Number of distinct cluster labels, shown as the cell output.
len(df["cluster"].unique())

In [None]:
# Points that DBSCAN left unclustered (cluster ID -1) are grouped with KMeans. These will be the Seas.
# Sea clusters get negative IDs so they can be told apart from land clusters:
# KMeans labels start at zero, so each label k is stored as -k - 1 (i.e. -1, -2, ...).
# Number of clusters picked by trial and error.
unclustered = df['cluster']==-1

kmeans = KMeans(n_clusters=75, random_state=0)
sea_points = df.loc[unclustered, ['umap_1', 'umap_2']].to_numpy()
sea_labels = kmeans.fit_predict(sea_points)
df.loc[unclustered, 'cluster'] = -sea_labels - 1

df["cluster"] = df["cluster"].astype('category')
# Number of distinct cluster labels (lands and seas), shown as the cell output.
len(df["cluster"].unique())

In [None]:
# Plot Land clusters 
df['annotation'] = df[['term','definition']].apply(lambda row: f'{row.term}: {row.definition}', axis=1)

color_map = {cluster: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, cluster in enumerate(df.cluster.unique())}
for key in color_map:
    if key < 0:
        color_map[key] = 'grey'

fig = px.scatter(df, x='umap_1', y='umap_2',
                 color='cluster',
                 hover_name='annotation',
                 hover_data={'umap_1': False, 'umap_2': False},
                 title='Land Clusters',
                 color_discrete_map=color_map)

fig.update_traces(marker={'size': 2.5})
fig.update_layout(width=900,
                  height=700,
                  plot_bgcolor='white',
                  showlegend=False)

fig.show()

In [None]:
# Plot Sea clusters 
df['annotation'] = df[['term','definition']].apply(lambda row: f'{row.term}: {row.definition}', axis=1)

color_map = {cluster: px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)] for i, cluster in enumerate(df.cluster.unique())}
for key in color_map:
    if key >= 0:
        color_map[key] = 'grey'

fig = px.scatter(df, x='umap_1', y='umap_2',
                 color='cluster',
                 hover_name='annotation',
                 hover_data={'umap_1': False, 'umap_2': False},
                 title='Sea Clusters',
                 color_discrete_map=color_map)

fig.update_traces(marker={'size': 2.5})
fig.update_layout(width=900,
                  height=700,
                  plot_bgcolor='white',
                  showlegend=False)

fig.show()

In [None]:
# Ask gpt for the topic of each cluster
def custom_function(group):
    """Label one cluster: send its unique terms to gpt and store the reply on every row."""
    unique_terms = group['term'].unique()
    terms = ', '.join(unique_terms)
    completion = get_completion(comment=terms,
                                system_message=sm.identify_topic,
                                model_class='gpt-3.5',
                                force_json=False)
    # First element of the returned tuple is the message content, i.e. the topic text.
    group['cluster_topic'] = completion[0]
    return group

df = df.groupby('cluster', observed=True).apply(custom_function).reset_index(drop=True)

In [None]:
# get the valence of each cluster from gpt
def custom_function(group):
    """Attach valence, physicality and humanity scores to every row of one cluster.

    Land clusters (cluster id >= 0) are scored by gpt-4 from their unique terms.
    If the reply is not valid JSON, all three scores are set to -1. Sea clusters
    (negative ids) are not sent to the model and get 3 for every score. If the
    parsed reply cannot be used (for example an expected key is missing), all
    three scores fall back to 3.
    """
    score_columns = ('valence', 'physicality', 'humanity')
    terms = ', '.join(group['term'].unique())

    if group['cluster'].iloc[0] >= 0:
        score, _, cost = get_completion(comment=terms, system_message=sm.words_valence, force_json=True, model_class='gpt-4')
        print(f'cost: {cost}')
        try:
            score = json.loads(score)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a non-string reply.
            print(score)
            score = {'valence': -1, 'physicality': -1, 'humanity': -1}
        else:
            # Progress log only; kept outside the try so a problem here is not
            # mistaken for a JSON parsing failure.
            print(group['cluster_topic'].iloc[0])
    else: # we won't get valences for the seas
        score = {'valence': 3, 'physicality': 3, 'humanity': 3}

    try: #gpt can fail to return json with the expected keys
        for column in score_columns:
            group[column] = score[column]
    except Exception:
        # Best effort: any unusable reply resets all three scores to 3, while
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        for column in score_columns:
            group[column] = 3

    return group

df = df.groupby('cluster', observed=True).apply(custom_function)
df.reset_index(inplace=True, drop=True)

for column in ("cluster", "valence", "physicality", "humanity"):
    df[column] = df[column].astype('category')

In [None]:
# Ask gpt for a fun region name for each cluster topic: lands and seas use different prompts.
cluster_ids = df.cluster.astype(int)

# Lands: non-negative cluster ids.
is_land = cluster_ids>=0
land_topics = df[is_land].cluster_topic.unique()
topic_to_land_name = {}
for topic in land_topics:
    land_reply = get_completion(comment=topic, system_message=sm.land_name, force_json=False, temperature=0.2, model_class='gpt-3.5')
    topic_to_land_name[topic] = land_reply[0]

# Seas: negative cluster ids.
is_sea = cluster_ids<0
sea_topics = df[is_sea].cluster_topic.unique()
topic_to_sea_name = {}
for topic in sea_topics:
    sea_reply = get_completion(comment=topic, system_message=sm.sea_name, force_json=False, temperature=0.2, model_class='gpt-3.5')
    topic_to_sea_name[topic] = sea_reply[0]

# Write the generated names back onto the rows of the matching kind.
df.loc[is_land, 'territory_name'] = df.loc[is_land, 'cluster_topic'].map(topic_to_land_name)
df.loc[is_sea, 'territory_name'] = df.loc[is_sea, 'cluster_topic'].map(topic_to_sea_name)

In [None]:
# Rename a cluster's territory
# df.loc[df.cluster==147, 'territory_name'] = 'The Land of Crisis'

In [None]:
# Write the dataframe, including the columns added above, to a parquet file.
df.to_parquet('../data/semantics_2024-02-07-2.parquet')