### Imports

In [1]:
from utils_art import *
import openai
import os
from openai import OpenAI
import tiktoken
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
import hashlib

imported 'utils_art'


## Functions

In [2]:
# LLM Queries

def apply_completions(input_dict,display=False):
    """Send one single-message chat completion request described by ``input_dict``.

    ``input_dict`` must provide the ``i_role``, ``i_content``, ``i_name``,
    ``i_model``, ``i_temperature``, ``i_token_max``, ``i_n``, ``i_seed``,
    ``i_frequency_penalty`` and ``i_presence_penalty`` keys (the shape built
    by ``llmInputConfCompletion``). The request goes through the module-level
    ``llm_client``.

    When ``display`` is true the raw response is printed. The raw response
    object is returned unchanged.
    """
    # The conversation always consists of exactly one message.
    single_message = {
        "role": input_dict["i_role"],
        "content": input_dict["i_content"],
        "name": input_dict["i_name"],
    }
    request_options = {
        "messages": [single_message],
        "model": input_dict["i_model"],
        "temperature": input_dict["i_temperature"],
        "max_tokens": input_dict["i_token_max"],
        "n": input_dict["i_n"],
        "seed": input_dict["i_seed"],
        "frequency_penalty": input_dict["i_frequency_penalty"],
        "presence_penalty": input_dict["i_presence_penalty"],
    }
    response = llm_client.chat.completions.create(**request_options)
    if display:
        print(response)
    return response

def apply_embeddings(input_dict,display=False):
    """Request an embedding for the text described by ``input_dict``.

    ``input_dict`` must provide the ``i_text``, ``i_model``,
    ``i_encoding_format``, ``i_dimensions`` and ``i_user`` keys (the shape
    built by ``llmInputConfEmbeddings``). The request goes through the
    module-level ``llm_client``.

    When ``display`` is true the raw response is printed. The raw response
    object is returned unchanged.
    """
    request_options = {
        "input": input_dict["i_text"],
        "model": input_dict["i_model"],
        "encoding_format": input_dict["i_encoding_format"],
        "dimensions": input_dict["i_dimensions"],
        "user": input_dict["i_user"],
    }
    response = llm_client.embeddings.create(**request_options)
    if display:
        print(response)
    return response

## Create Input Conf

def llmInputConfCompletion(content,role_num=0,model_num=0,temperature=1,max_tokens=2000,num_answer=1,seed=0, hash_key=None) :
    """Build the input dictionary consumed by ``apply_completions``.

    ``role_num`` and ``model_num`` are indexes into the module-level
    ``role_list`` and ``model_list``. The message name and both penalties are
    fixed ("name_test", 0 and 0). ``hash_key`` is carried along untouched so
    the result row can be traced back to its source text.
    """
    conf = {"i_content": content}
    conf["i_role"] = role_list[role_num]
    conf["i_model"] = model_list[model_num]
    conf["i_temperature"] = temperature
    conf["i_token_max"] = max_tokens
    conf["i_n"] = num_answer
    conf["i_seed"] = seed
    conf["i_name"] = "name_test"
    conf["i_frequency_penalty"] = 0
    conf["i_presence_penalty"] = 0
    conf["hash_key"] = hash_key
    return conf

def llmInputConfEmbeddings(content, model="text-embedding-3-small", encoding_format="float", dimensions=10, hash_key=None) :
    """Build the input dictionary consumed by ``apply_embeddings``.

    The text to embed is stored under ``i_text``; the user tag is fixed to
    "name_test". ``hash_key`` is carried along untouched so the result row can
    be traced back to its source text.
    """
    conf = dict(
        i_text=content,
        i_model=model,
        i_encoding_format=encoding_format,
        i_dimensions=dimensions,
        i_user="name_test",
        hash_key=hash_key,
    )
    return conf

## LLM Query Output Parsing

def outputDictParseCompletion(output,display=False) :
    """Flatten a chat completion response into a dictionary of ``o_*`` fields.

    ``output`` and its nested ``choices[0]``, ``choices[0]["message"]`` and
    ``usage`` parts are each converted with ``dict(...)``. Only the first
    choice is read. When ``display`` is true the flattened dictionary is
    printed before being returned.
    """
    response = dict(output)
    response_id = response["id"]
    fingerprint = response["system_fingerprint"]
    first_choice = dict(response["choices"][0])
    message = dict(first_choice["message"])
    usage = dict(response["usage"])

    out_dict = {
        "o_id": response_id,
        "o_system_fingerprint": fingerprint,
        "o_logprobs": first_choice["logprobs"],
        "o_model": response["model"],
        "o_object": response["object"],
        "o_created": response["created"],
        "o_finish_reason": first_choice["finish_reason"],
        "o_index": first_choice["index"],
        "o_content": message["content"],
        "o_role": message["role"],
        "o_token_output": usage["completion_tokens"],
        "o_token_input": usage["prompt_tokens"],
        "o_token_total": usage["total_tokens"],
    }
    if display:
        print(out_dict)
    return out_dict
        
def outputDictParseEmbeddings(output,display=False) :
    """Flatten an embeddings response into a dictionary of ``o_*`` fields.

    ``output``, its first ``data`` entry and its ``usage`` part are each
    converted with ``dict(...)``. Only the first embedding is read; the
    vector ends up under ``o_data``. When ``display`` is true the flattened
    dictionary is printed before being returned.
    """
    response = dict(output)
    first_item = dict(response["data"][0])
    out_dict = {
        "o_data": first_item["embedding"],
        "o_index": first_item["index"],
        "o_object": first_item["object"],
        "o_model": response["model"],
        "o_object_list": response["object"],
    }
    usage = dict(response["usage"])
    out_dict["o_token_input"] = usage["prompt_tokens"]
    out_dict["o_token_total"] = usage["total_tokens"]
    if display:
        print(out_dict)
    return out_dict

def parseList(list_par) :
    """Return ``list_par`` as a single string.

    * a list: the ``str()`` of every element, concatenated without separator;
    * a string: the string itself;
    * anything else (including tuples and ``None``): the empty string.

    ``isinstance`` is used so that subclasses of ``list`` / ``str`` are
    handled like their base type, and ``join`` avoids rebuilding the string
    on every element.
    """
    if isinstance(list_par, list):
        return "".join(str(item) for item in list_par)
    if isinstance(list_par, str):
        return str(list_par)
    return ""


def textListToText(text_list) :
    """Concatenate every string of ``text_list`` (no separator) and return it.

    An empty iterable gives ``""``; a plain string is returned unchanged
    (it is joined character by character). Non-string items raise
    ``TypeError``. ``str.join`` is used instead of repeated ``+`` so the cost
    stays linear on long articles.
    """
    return "".join(text_list)

def llmInputConfArticle(article_text,llm_prompt) :
    """Build an LLM input configuration asking ``llm_prompt`` about an article.

    The final prompt is ``llm_prompt``, then the line "Here is the article :",
    then ``article_text``.
    """
    context_prompt = "\nHere is the article :\n"
    prompt_parts = [str(llm_prompt), str(context_prompt), article_text]
    # NOTE(review): llmInputConf is not defined in this file; presumably it
    # comes from the `utils_art` star import (llmInputConfCompletion here has
    # a compatible first argument) - confirm.
    return llmInputConf("".join(prompt_parts))

    
def num_tokens_from_string(text="", encoding_name="cl100k_base"):
    """Return how many tokens ``text`` takes under the tiktoken encoding ``encoding_name``."""
    tokens = tiktoken.get_encoding(encoding_name).encode(text)
    return len(tokens)


def saveNP(data,fmt='%f',path="C:/Users/User/OneDrive/Desktop/article/file_2/test_llm_output/test_save.txt"):
    """Write the array ``data`` as text with ``np.savetxt``.

    ``fmt`` is the per-value format passed to ``np.savetxt``. ``path`` is the
    destination file; it defaults to the location that used to be hard-coded,
    so existing calls keep writing to the same place.
    """
    np.savetxt(path, data, fmt=fmt)

def loadNP(path='C:/Users/User/OneDrive/Desktop/article/file_2/test_llm_output/test_save.txt'):
    """Read a float array from the text file ``path`` with ``np.loadtxt``.

    ``path`` defaults to the location that used to be hard-coded (the file
    ``saveNP`` writes by default), so existing ``loadNP()`` calls are unchanged.
    """
    return np.loadtxt(path, dtype=float)

def plot3Dpn(np_data):
    """Show a 3-D scatter plot of ``np_data``.

    Columns 0, 1 and 2 of ``np_data`` are used as x, y and z; column 3 drives
    the colour. The figure is displayed, nothing is returned.
    """
    xs, ys, zs, colours = (np_data[:, col] for col in range(4))
    figure = px.scatter_3d(x=xs, y=ys, z=zs, color=colours, opacity=0.8)
    figure.show()

def plotTSNE(data,n_components=2,perplexity=3,random_state=10):
    """Fit a t-SNE projection of ``data`` and show its first two components.

    The fitted model's ``kl_divergence_`` is printed, then a 2-D scatter plot
    of the first two projected columns is displayed. Nothing is returned.
    """
    model = TSNE(n_components=n_components,perplexity=perplexity,random_state=random_state)
    projected = model.fit_transform(data)
    print(model.kl_divergence_)
    layout_options = {
        "title": "t-SNE visualization of Custom Classification dataset",
        "xaxis_title": "First t-SNE",
        "yaxis_title": "Second t-SNE",
    }
    figure = px.scatter(x=projected[:, 0], y=projected[:, 1])
    figure.update_layout(**layout_options)
    figure.show()
    
def getDataToQuerryListLLM(max_prompt=5,articleTrueQuestionsFalse=True) :
    """Return the texts to send to the LLM as ``{"hash_key", "text"}`` dictionaries.

    * ``articleTrueQuestionsFalse`` true: the first ``max_prompt`` files listed
      in ``open_path`` are loaded through ``loadListArticleHash``.
    * otherwise: up to ``max_prompt`` prompts are fetched with ``cfn_field``
      and each is keyed by the 20-byte SHAKE-256 hex digest of its text.
    """
    if articleTrueQuestionsFalse :
        getNumberOfArticles(open_path)
        article_files = loadArticleFolderList(open_path,max_prompt)
        return loadListArticleHash(open_path,article_files)

    questions = cfn_field("prompts","prompt_type","content","prompt_value",max_prompt)
    return [
        {"hash_key": hashlib.shake_256(str(question).encode()).hexdigest(20), "text": question}
        for question in questions
    ]

def testQuestionsBatchCompletion():
    """Run a batch of LLM requests over a slice of the loaded texts and save the results.

    All settings are local constants. With the values below the function works
    on articles (``articleTrueQuestionsFalse``) and requests embeddings
    (``completionTrueEmbeddingFalse`` is False) for entries 9000..44999 of the
    loaded list. One row per request is accumulated in a DataFrame, which is
    saved through ``saveDFcsv`` every ``save_every`` rows and once at the end,
    then returned.

    NOTE(review): a function with this same name is defined again later in
    this file with a different slice; whichever definition is executed last
    is the one that gets called.
    NOTE(review): if the slice is empty ``df`` is still None when the final
    ``df.set_index`` is reached.
    """
    articleTrueQuestionsFalse = True
    completionTrueEmbeddingFalse = False
    model_list = [0] # [0,1,2]
    temperature_list = [0.5] # [0,0.25,0.5,0.75,1]
    max_prompt = 45000# 51500 #  #100
    save_every = 100
    # Texts above token_max_emb tokens are cut down to cara_max_emb characters.
    token_max_emb = 7500
    cara_max_emb = 100
    dim=50 # 100
    df=None
    set_index_key = "hash_key" #'o_created' #"hash_key"
    prompt_list = getDataToQuerryListLLM(max_prompt,articleTrueQuestionsFalse)
    prompt_list = prompt_list[9000:45000]
    # prompt_list = cfn_field("prompts","prompt_type","content","prompt_value",max_prompt) #
    count = 0
    for prompt in prompt_list:
        for model_n in model_list:
            for temperature_n in temperature_list :
                # Every row starts as VALID; it is downgraded to WARNING when the text is truncated.
                valid_dict = {"valid":"VALID"}
                if completionTrueEmbeddingFalse :
                    
                    input_dict = llmInputConfCompletion(prompt["text"],model_num=model_n,temperature=temperature_n,hash_key=prompt["hash_key"])
                    out_raw = apply_completions(input_dict)
                    out_dict = outputDictParseCompletion(out_raw)
                    selected_fields = selected_fields_comp
                else :
                    num_tokens = num_tokens_from_string(prompt["text"])
                    if num_tokens > token_max_emb :
                        valid_dict = {"valid":"WARNING"}
                        # The truncation overwrites the text stored in the prompt dictionary itself.
                        prompt["text"] = prompt["text"][0:cara_max_emb]
                    print(" - #"+str(count),"- ",valid_dict,"-",num_tokens,"-",len(prompt["text"]),"-",prompt["hash_key"])
                    input_dict = llmInputConfEmbeddings(prompt["text"],dimensions=dim,hash_key=prompt["hash_key"])
                    out_raw = apply_embeddings(input_dict)
                    out_dict = outputDictParseEmbeddings(out_raw)
                    selected_fields = selected_fields_emp
                # Later dictionaries win on duplicate keys: input, then output, then validity flag.
                final_dict = input_dict | out_dict | valid_dict
                df = addDictToDF(df,final_dict,selected_fields)
                # Periodic checkpoint, skipped for the very first row.
                if count%save_every == 0  and count != 0:
                    saveDFcsv(df.set_index(set_index_key), save_path, filename_save+str(count),True)
                count = count + 1
    saveDFcsv(df.set_index(set_index_key), save_path, filename_save+"final",True)
    return df

## Article Functions

In [4]:
def loadArticleFolderList(folder_path="",cutoff=99999999) :
    """Return at most ``cutoff`` entry names found in ``folder_path``.

    The names come straight from ``os.listdir``, so their order (and therefore
    which entries survive the cutoff) is whatever the operating system returns.
    """
    entries = os.listdir(Path(folder_path))
    return entries[:cutoff]

def getNumberOfArticles(folder_path="", display=True) :
    """Count the entries of ``folder_path`` and return that count.

    When ``display`` is true the count is also printed. The function used to
    compute the count and then return ``None``; returning it lets callers use
    the value, while callers that ignore the result are unaffected.
    """
    file_list_len = len(os.listdir(Path(folder_path)))
    if display :
        print("In folder : ",folder_path," found ",file_list_len," article files.")
    return file_list_len

def loadListArticleHash(folder_path="",list_hash=None) :
    """Load the article files named in ``list_hash`` from ``folder_path``.

    Returns one ``{"hash_key": <name without ".txt">, "text": <article text>}``
    dictionary per file name, in the same order. ``list_hash`` defaults to no
    files (``None`` replaces the former shared mutable ``[]`` default).
    """
    if list_hash is None:
        list_hash = []
    text_dict_list = []
    for file_name in list_hash:
        # Strip only the trailing extension; ``replace`` would also remove a
        # ".txt" appearing in the middle of the name.
        hash_name = file_name.removesuffix(".txt")
        # presumably openSTRtxt re-appends the extension and returns the
        # file's lines - verify against utils_art.
        text = openSTRtxt(folder_path+"/",hash_name)
        text_dict_list.append({"hash_key":hash_name,"text":str(textListToText(text))})
    return text_dict_list

def getStatsOnArticleText(article_text_list) :
    """Return simple counts describing an article given as a list of text chunks.

    The result holds ``line_num`` (number of chunks), ``char_num`` (length of
    the concatenated text) and, for each tracked punctuation mark or
    substring, how many times it occurs in the concatenated text.
    """
    stats = {"line_num": len(article_text_list)}
    article_text = textListToText(article_text_list)
    stats["char_num"] = len(article_text)
    tracked_tokens = ["\n", ".","?","!",'"',",","“","”",":","–","-",";","http://","https://","$","€","|"]
    stats.update({token: article_text.count(token) for token in tracked_tokens})
    return stats

def getDataToQuerryListLLM(max_prompt=5,articleTrueQuestionsFalse=True) :
    """Collect the texts to query the LLM with, as ``{"hash_key", "text"}`` dictionaries.

    Articles (``articleTrueQuestionsFalse`` true) are read from ``open_path``,
    limited to the first ``max_prompt`` listed files. Questions are fetched
    with ``cfn_field`` and keyed by the 20-byte SHAKE-256 hex digest of their
    text.
    """
    if not articleTrueQuestionsFalse :
        entries = []
        for question in cfn_field("prompts","prompt_type","content","prompt_value",max_prompt):
            digest = hashlib.shake_256(str(question).encode()).hexdigest(20)
            entries.append({"hash_key":digest,"text":question})
        return entries

    getNumberOfArticles(open_path)
    selected_files = loadArticleFolderList(open_path,max_prompt)
    return loadListArticleHash(open_path,selected_files)

def testQuestionsBatchCompletion(start=45000, stop=51500):
    """Run a batch of LLM requests over ``prompt_list[start:stop]`` and save the results.

    The first ``stop`` texts are loaded with ``getDataToQuerryListLLM`` and the
    slice ``[start:stop]`` is processed. The defaults reproduce the bounds that
    used to be hard-coded, so ``testQuestionsBatchCompletion()`` behaves as
    before; other ranges no longer require editing the function.

    With the settings below the texts are articles and an embedding is
    requested for each one. Texts longer than ``token_max_emb`` tokens are cut
    to their first ``cara_max_emb`` characters and flagged ``WARNING``.
    One row per request is accumulated, saved through ``saveDFcsv`` every
    ``save_every`` rows and once at the end.

    Returns the accumulated DataFrame, or ``None`` when the selected range
    contains no text (previously this case crashed on ``df.set_index``).
    """
    articleTrueQuestionsFalse = True
    completionTrueEmbeddingFalse = False
    model_list = [0] # [0,1,2]
    temperature_list = [0.5] # [0,0.25,0.5,0.75,1]
    max_prompt = stop
    save_every = 100
    token_max_emb = 7500
    cara_max_emb = 100
    dim=50 # 100
    df=None
    set_index_key = "hash_key" #'o_created' #"hash_key"
    prompt_list = getDataToQuerryListLLM(max_prompt,articleTrueQuestionsFalse)
    prompt_list = prompt_list[start:stop]
    count = 0
    for prompt in prompt_list:
        for model_n in model_list:
            for temperature_n in temperature_list :
                valid_dict = {"valid":"VALID"}
                if completionTrueEmbeddingFalse :
                    input_dict = llmInputConfCompletion(prompt["text"],model_num=model_n,temperature=temperature_n,hash_key=prompt["hash_key"])
                    out_raw = apply_completions(input_dict)
                    out_dict = outputDictParseCompletion(out_raw)
                    selected_fields = selected_fields_comp
                else :
                    num_tokens = num_tokens_from_string(prompt["text"])
                    if num_tokens > token_max_emb :
                        valid_dict = {"valid":"WARNING"}
                        # The truncation overwrites the text stored in the prompt dictionary itself.
                        prompt["text"] = prompt["text"][0:cara_max_emb]
                    print(" - #"+str(count),"- ",valid_dict,"-",num_tokens,"-",len(prompt["text"]),"-",prompt["hash_key"])
                    input_dict = llmInputConfEmbeddings(prompt["text"],dimensions=dim,hash_key=prompt["hash_key"])
                    out_raw = apply_embeddings(input_dict)
                    out_dict = outputDictParseEmbeddings(out_raw)
                    selected_fields = selected_fields_emp
                # Later dictionaries win on duplicate keys: input, then output, then validity flag.
                final_dict = input_dict | out_dict | valid_dict
                df = addDictToDF(df,final_dict,selected_fields)
                # Periodic checkpoint, skipped for the very first row.
                if count%save_every == 0  and count != 0:
                    saveDFcsv(df.set_index(set_index_key), save_path, filename_save+str(count),True)
                count = count + 1
    if df is None:
        # Nothing was processed: there is no frame to index or save.
        print("WARNING : no text found in the selected range, nothing was saved")
        return None
    saveDFcsv(df.set_index(set_index_key), save_path, filename_save+"final",True)
    return df

def dictSelectKeyList(input_dict,selected_fields) :
    """Return a new dictionary with only the entries of ``input_dict`` whose key is in ``selected_fields``.

    Entries keep the order they have in ``input_dict``; names of
    ``selected_fields`` that are absent from ``input_dict`` are ignored.
    """
    return {
        field: content
        for field, content in input_dict.items()
        if field in selected_fields
    }

def getStandardDfnumColumn(num=10):
    """Return an empty DataFrame whose columns are named "0", "1", ... up to ``num - 1``."""
    column_names = list(map(str, range(num)))
    return pd.DataFrame([], columns=column_names)

def addDictToDF(df=None, ar_dict=None, selected_fields=None):
    """Append ``ar_dict`` as one row to ``df`` and return the resulting DataFrame.

    * ``selected_fields`` empty or omitted: every key of ``ar_dict`` becomes
      a column.
    * otherwise: only the keys listed in ``selected_fields`` are kept, and the
      columns follow the order of ``selected_fields`` (names missing from
      ``ar_dict`` give empty cells).
    * ``df`` omitted: a new one-row frame is returned.

    The returned frame always has a fresh 0..n-1 index.
    """
    if ar_dict is None:
        ar_dict = {}
    if selected_fields is None or selected_fields == [] :
        selected_fields = list(ar_dict.keys())
    else :
        ar_dict = {key: value for key, value in ar_dict.items() if key in selected_fields}
    df_add = pd.DataFrame([ar_dict], columns = selected_fields)
    if df is None :
        # Nothing to append to: concatenating with an empty placeholder frame
        # would only trigger pandas' empty-entry concat deprecation.
        return df_add.reset_index(drop=True)
    return pd.concat([df,df_add]).reset_index(drop=True)

## Variables

In [5]:
# Chat GPT
# The API key is read from the OPENAI_API_KEY environment variable instead of
# being committed in the notebook. The key that used to be written here was
# exposed in source control and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# model_list = ["gpt-3.5-turbo-16k","gpt-4","gpt-3.5-turbo-16k","gpt-3.5-turbo-0125","gpt-4-0125-preview","gpt-3.5-turbo","gpt-4-turbo-preview","text-embedding-3-small","gpt-4","gpt-3.5-turbo-16k-1106"]
# model_list / role_list are indexed by the model_num / role_num arguments of llmInputConfCompletion.
model_list = ["gpt-3.5-turbo-0125", "gpt-3.5-turbo-16k","gpt-4-0125-preview"]
role_list = ["user","system", "assistant", "tool"]
max_token=100

# Shared client used by apply_completions and apply_embeddings.
llm_client = OpenAI(api_key=openai.api_key)

# Paths
open_path = "C:/Users/User/OneDrive/Desktop/article/file_2/article_download_main/"
# path_model_list = "C:/Users/User/OneDrive/Desktop/article/file_2/.code_control/"
# filename_model_lis = "code_conf_excel"

save_path = "C:/Users/User/OneDrive/Desktop/article/file_2/test_llm_output/"
filename_save = "llm_output_"
# Column selections passed to addDictToDF. Two families coexist: the
# `select_fields_*` lists and the `selected_fields_*` lists (the latter start
# with "hash_key"); both names are referenced elsewhere in this file.
# select_fields = ['content', 'role', 'model', 'temperature', 'max_tokens', 'n', 'seed', 'id', 'model_o', 'object', 'finish_reason', 'index', 'content_o', 'role_o', 'token_c', 'token_p', 'token_t']
select_fields_comp = ['o_id','i_content', 'o_content', 'i_role', 'i_model', 'i_temperature', 'i_token_max', 'i_n', 'i_seed', 'i_name', 'i_frequency_penalty', 'i_presence_penalty','o_system_fingerprint', 'o_logprobs', 'o_model', 'o_object', 'o_created', 'o_finish_reason', 'o_index', 'o_role', 'o_token_output', 'o_token_input', 'o_token_total', "valid"]
select_fields_emb = ['o_index','i_text', 'i_model', 'o_object', 'o_object_list', 'i_encoding_format', 'i_dimensions', 'o_data', 'o_token_input', 'o_token_total', "valid"]
# select_fields_emb = ['hash_key', 'i_model', 'i_dimensions','i_encoding_format','o_object', 'o_object_list','o_token_input', 'o_token_total','o_data']


#selected_fields_comp = ["hash_key",'i_content', 'i_role', 'i_model', 'i_temperature', 'i_token_max', 'i_n', 'i_seed', 'i_name', 'i_frequency_penalty', 'i_presence_penalty', 'o_id', 'o_system_fingerprint', 'o_logprobs', 'o_model', 'o_object', 'o_created', 'o_finish_reason', 'o_index', 'o_content', 'o_role', 'o_token_output', 'o_token_input', 'o_token_total']
selected_fields_comp = ["hash_key",'o_id', 'i_role', 'i_model', 'i_temperature', 'i_token_max', 'i_frequency_penalty', 'i_presence_penalty', 'o_created','i_content','o_content', 'o_token_output', 'o_token_input', 'o_token_total',"valid"]
#selected_fields_emp = ["hash_key",'i_text', 'i_model', 'i_encoding_format', 'i_dimensions', 'i_user', 'o_data', 'o_index', 'o_object', 'o_model', 'o_object_list', 'o_token_input', 'o_token_total']
selected_fields_emp = ["hash_key",'i_model', 'i_dimensions', 'i_encoding_format', 'i_user', 'o_data', 'o_token_input', 'o_token_total','valid']

In [None]:
# Run the batch defined by testQuestionsBatchCompletion (network calls and
# file saves happen inside it) and show the resulting frame.
df = testQuestionsBatchCompletion()
display(df)

In [None]:
# Scratch expression: evaluates to a tuple of one name and four strings.
# NOTE(review): `h_data` is not assigned anywhere in the visible part of this
# file, so this cell presumably raises NameError - confirm or delete.
h_data,"text","h_name","h_data","c_data"

In [None]:
# Scratch mapping of plot-option names to values; most entries hold the
# placeholder "&". NOTE(review): `di` is not read by any code visible in this
# file - presumably a draft of plotting options, confirm before relying on it.
di = {"x":0,"y":1,'c':"category",'size':'text_len','symbol':"&","h_name":"&","h_data":"&","c_data":"&","text":"&","facet_row":"&","facet_col":"&","facet_col_wrap":"&","facet_row_spacing":"&","facet_col_spacing":"&","error_x":"&","error_x_minus":"&","error_y":"&","error_y_minus":"&","animation_frame":"&","animation_group":"&","range_color":"&","opacity":"&","size_max":"&","marginal_x":"&","marginal_y":"&","log_x":"&","log_y":"&","range_x":"&","range_y":"&","render_mode":"&","title":"&","xtitle":"&","ytitle":"&","width":"&","height":"&"}
                                                                                                        

In [None]:
# Reload the matrix written by saveNP, inspect it, plot it in 3-D, then fit a
# 2-component t-SNE whose result (X_tsne) is plotted by the next cell.
data = loadNP()
print(type(data))
print(data)
print(len(data))
# plot3Dpn reads columns 0-3 of `data`.
plot3Dpn(data)
tsne = TSNE(n_components=2,perplexity=2) # , random_state=100
X_tsne = tsne.fit_transform(data)
# Last expression of the cell: the notebook echoes the KL divergence.
tsne.kl_divergence_

In [None]:
# Scatter the two t-SNE components computed in the previous cell, colouring
# each point by its row index. The colour vector is sized from X_tsne itself
# (it used to be fixed at 69 entries, which only fits a 69-row matrix).
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=np.arange(len(X_tsne)))
fig.update_layout(
    title="t-SNE visualization of Custom Classification dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [None]:
# Scratch cell: send one stored prompt to the chat endpoint and print the
# prompt, the answer and their token counts.
# prompt = cfn_field("prompts","prompt_name","email","prompt_value")
prompt = cfn_index("prompts",16,"prompt_value")
# prompt = prompt_story
# input_dict = {"role":role_list[0],"model":"gpt-3.5-turbo-16k","temperature":0.5,"max_tokens":10000 ,"n":1,"seed":0,"content":prompt}
# NOTE(review): llmInputConf is not defined in this file; presumably it comes
# from the `utils_art` star import - confirm (llmInputConfCompletion builds
# the "i_content" key read below).
input_dict = llmInputConf(prompt)
out_raw = apply_completions(input_dict)
out_dict = outputDictParseCompletion(out_raw) #model_list[0]
final_dict = input_dict | out_dict

print(final_dict["i_content"])
print(num_tokens_from_string(final_dict["i_content"]))

print(final_dict["o_content"])
print(num_tokens_from_string(final_dict["o_content"]))

# Blank the two long text fields so the remaining metadata prints compactly.
final_dict["i_content"] = ""
final_dict["o_content"] = ""
print(final_dict.keys())

print(final_dict)

import os
from openai import OpenAI

# Minimal smoke test of the chat endpoint.
client = OpenAI(
    # Read the key from the environment; a literal key must never be
    # committed in the notebook.
    api_key=os.environ.get("OPENAI_API_KEY"),
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user", #role_list[0],
            "content": "Say this is a test"
        }
    ],
    model=model_list[0],
    temperature=0.5,
    max_tokens=max_token,
    n=1,
    seed=0
)

print(str(chat_completion))




def chat_with_chatgpt(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the stripped answer text.

    The request goes through the module-level ``llm_client`` and the chat
    completions endpoint: the legacy ``openai.Completion`` interface used
    before is not available in the 1.x client this file relies on, and chat
    models are served by the chat endpoint. An answer without text content
    gives ``""``.
    """
    response = llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.5,
    )

    content = response.choices[0].message.content
    return (content or "").strip()

# Example call of chat_with_chatgpt with a fixed prompt; prints the answer.
user_prompt = "Write a summary of the benefits of exercise."
chatbot_response = chat_with_chatgpt(user_prompt)
print(chatbot_response)


from openai import OpenAI

# Rebinds `client` to a client created with an explicit organization id and
# no explicit api_key argument.
# NOTE(review): the organization id is hard-coded; consider moving it to
# configuration.
client = OpenAI(
  organization='org-JC0VmXs611nLY2FmI4JVhO5k',
)

        # print(dict_entry)
#     hash_list = []
#     for i in range(n) :
#         text = openSTRtxt(folder_path+"/",file_list[index].replace(".txt",""))
#     print(text)
#     return text
# for i in range(10) :
#     loadArticleFolder(i)


def loadArticleFolder(length=0) :
    """Print how many entries the article download folder contains.

    ``length`` is accepted but not used. Nothing is returned.
    """
    folder_path = "C:/Users/User/OneDrive/Desktop/article/file_2/article_download_main"
    entries = os.listdir(Path(folder_path))
    print(len(entries))
    # text = openSTRtxt(folder_path,file_list[index])
# for i in range(10) :
#     prompt = str(loadArticleFolder(i))+"\n"+parseList(text)
#     print(prompt)
loadArticleFolder(0)


# Scratch cell: send up to 10 stored prompts to the chat endpoint, collect one
# row per answer and save the frame on every second prompt (counts 0, 2, 4...).
prompt_list = cfn_field("prompts","prompt_type","content","prompt_value",10) #
# NOTE(review): getStandardDfComp and llmInputConf are not defined in this
# file; presumably they come from the `utils_art` star import - confirm.
df = getStandardDfComp()
count = 0
for prompt in prompt_list :
    input_dict = llmInputConf(prompt)
    out_raw = apply_completions(input_dict)
    out_dict = outputDictParseCompletion(out_raw) #model_list[0]
    final_dict = input_dict | out_dict
    df = addDictToDF(df,final_dict)
    if count%2 == 0 :
        saveDFcsv(df, save_path, filename_save+str(count),True)
    count = count + 1
# prompt = cfn_field("prompts","prompt_type","prompt_name")
#print(prompt)



## Prompts

In [None]:
# Prompt asking for short answers, in CSV form, to a list of trivia questions.
# The text is sent to the model as written, so it must not be reworded casually.
prompt_small_answer ="""Answer all the following questions with a short answer.
Your answer should be in csv format with a comma as separator and quotes around each fields.
Here is the list of questions :
What occasion corresponds to the longest day of the year?
What is the distance from earth to the sun?
What sport was featured on the first curved U.S. coin in 2014?
Which country is the largest in the world?
M&M’S Fruit Chews would eventually become what popular candy?
According to Guinness World Records, what's the best-selling book of all time?
What U.S. state is home to Acadia National Park?
What is the only food that can never go bad?
What was the first animal to ever be cloned?
What is the name of the pet dinosaur on the TV cartoon 'The Flintstones'?
What identity document is required to travel to different countries around the world?
Who is considered the 'Father of Relativity?'
Edie Falco and James Gandolfini star in what series about the life of a New Jersey mob boss?
Nearly all fossils are preserved in what type of rock?
What guitarist notably performed on the Michael Jackson song 'Beat It'?
What is August’s birthstone?
What is Prince Harry’s official first name?
What is the fifth sign of the zodiac?
Which branch of the U.S. armed forces used the slogan 'It’s not just a job, it’s an adventure'?
By U.S. law, exit signs must be one of what two colors?
What is an eight-sided shape called?
"""

# Prompt asking for "True"/"False" answers, in CSV form, to a list of statements.
prompt_yes_no ="""Answer all the following questions either "True" or "False".
Your answer should be in csv format with a comma as separator and quotes around each fields.
Here is the list of questions :
Sharks are mammals.
Sea otters have a favorite rock they use to break open food.
The blue whale is the biggest animal to have ever lived.
The hummingbird egg is the world's smallest bird egg.
Pigs roll in the mud because they don't like being clean.
Bats are blind.
New York City is composed of between 36 and 42 islands.
South Africa has one capital.
The Atlantic Ocean is the biggest ocean on Earth.
Mount Everest is the tallest mountain in the world.
You can find the 'Desert of Death' in California.
The total length of the Great Wall of China adds up to 13,171 miles.
"""

# Prompt asking for open answers (at most 10 sentences each), in CSV form,
# to a list of cooking-related questions.
prompt_open ="""Answer all the following questions with maximum of 10 sentences per answer.
Your answer should be in csv format with a comma as separator and quotes around each fields.
Here is the list of questions :
Can you help me plan a week's worth of dinner for two adults?
Generate a meal plan for two days and give me the shopping list?
I have tomato, lettuce, and broccoli. What can I prepare with them for a vegan lunch?
What is an easy way to make a pasta recipe that features white sauce and mushroom?
What would be a good bottle of wine to serve with Chicken roast dinner?
I have only three ingredients - Onion, tomato, and spinach. Can you show me 3 meals that I can cook with these ingredients?
What is a good food suggestion for someone who has had a bad day?
I am a vegan and I am looking for healthy dinner ideas.
Can you give a dessert suggestion on a stressful day?
Suggest a multi-course dinner party menu with winter ingredients?
Write a persuasive message to a potential employer explaining my relocation for a chef role?
"""

# Free-form writing prompts: an e-mail request and a short story.
prompt_email = """Write an email to a supervisor requesting time off for a vacation, including the dates of the requested time off and a plan for ensuring that work will be covered during your absence."""

prompt_story = """Write a story about a young girl who discovers she has the power to control fire, but struggles with the responsibility that comes with it She must navigate the challenges of her new abilities while trying to keep her secret hidden from the world"""


# Closed-answer prompts about an article: each one asks for a constrained
# answer (a date, 'True'/'False', a number, 'Left'/'Right', ...).
# llmInputConfArticle places such a prompt in front of the article text.
prompt_date = "Guess the exact date this article was written. I want you to give me an date with a specific year, month and day. The answer should only be this date in this format 'yyyy-mm-dd'."
prompt_politics = "Tell me if this article includes political topics. Answer with 'True' or 'False'."
prompt_pos = "Tell me if this article has a positive outlook. Answer with 'True' or 'False'."
prompt_neg = "Tell me if this article has a negative outlook. Answer with 'True' or 'False'."
prompt_facts = "Tell me if this article looks to be factual. Answer with 'True' or 'False'."
prompt_sources = "Tell me if this article includes sources on what is written. Answer with 'True' or 'False'."
prompt_date_num = "How many dates or time periods are referenced in this article. Answer with only the number of dates or time periods you found."
prompt_spectrum="Guess on wich end of the political spectrum is the person who wrote this article situated.Answer with 'Left' or 'Right'."
prompt_occurence="Tel me the percentage of appearence of this article's topîc."
prompt_words="Tell me the 5 most cited words in the articles, except words like 'the', 'he', 'she', 'it', 'or', 'and', etc"


In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib

# df = pd.read_csv('output/embedded_1k_reviews.csv')
# matrix = df.ada_embedding.apply(eval).to_list()
from ast import literal_eval

# Reload the final saved run and rebuild the embedding matrix from its
# "o_data" column. The previous version discarded the loaded frame and called
# the pandas `.str` accessor on individual elements, which fails.
# assumes openDFcsv returns a DataFrame that has an "o_data" column - confirm.
loaded_df = openDFcsv(save_path,filename_save+"final")
# After a CSV round trip each vector is the text form of a list/tuple of
# numbers; literal_eval parses it safely. Already-parsed sequences are kept.
val = np.array(
    [literal_eval(ent) if isinstance(ent, str) else ent for ent in loaded_df["o_data"]],
    dtype=float,
)
# np_mat = df["embedding_list"]
# print(np_mat)
# df["embedding_list"] = df["o_data"] .apply(np.array)
# df = df[["embedding_list"]].to_numpy()
# matrix = df["o_data"].apply(np.array)

#.str.strip('()').str.split(',')
# print(val.dtypes)
# display(val)

print(type(val))
print(val.shape)
print(val)


# # Create a t-SNE model and transform the data
# tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
# vis_dims = tsne.fit_transform(matrix)

# colors = ["red", "darkorange", "gold", "turquiose", "darkgreen"]
# x = [x for x,y in vis_dims]
# y = [y for x,y in vis_dims]
# color_indices = df.Score.values - 1

# colormap = matplotlib.colors.ListedColormap(colors)
# plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3)
# plt.title("Amazon ratings visualized in language using t-SNE")

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Reload the final saved run and turn every stored vector into a numpy array.
# The frames saved by this notebook keep the vectors in the "o_data" column
# (there is no "embedding" column in any of the field lists used for saving).
df = openDFcsv(save_path,filename_save+"final")
new = df["o_data"].apply(literal_eval).apply(np.array)

In [None]:
from utils.embeddings_utils import get_embedding

In [None]:
# `ast` is a plain module, so `import ast.literal_eval` cannot work; import the
# module and reach the function as an attribute.
import ast
val = openDFcsv(save_path,filename_save+"final")
# NOTE(review): the frame loaded just above is bound to `val`, but the
# conversion below is applied to `df` - confirm which frame is intended.
df.o_data = df.o_data.apply(ast.literal_eval)

In [None]:
# Scratch check: a list of two 3-element lists becomes a (2, 3) array.
li = [[0.1,0.2,0.3],[0.1,0.2,0.3]]
li2 = np.array(li)
print(li2.shape)

In [None]:
# Scratch check: the low-level ndarray constructor allocates a 10x10 array
# without initialising it, so the printed values are arbitrary.
li1 = np.ndarray((10,10))
print(li1)

In [None]:
# Templates

In [None]:
# Scratch cell: embed up to 5000 articles (100 dimensions each), collect one
# row per article and save the frame every 5 articles and once at the end.
# df = getStandardDfComp()
# NOTE(review): getStandardDfEmb is not defined in this file; presumably it
# comes from the `utils_art` star import - confirm.
df = getStandardDfEmb()
getNumberOfArticles(open_path)
filename_list = loadArticleFolderList(open_path,5000) # ["fa897c02295f34ce2e15f602769edf204ea00be7.txt"]
article_dict_list = loadListArticleHash(open_path,filename_list)
count = 0
for article_dict in article_dict_list :
    # stats_dict = getStatsOnArticleText(article_dict["text"])
    # loadListArticleHash already stores "text" as one string, so this call
    # re-joins its characters and leaves it unchanged.
    article_dict["text"] = textListToText(article_dict["text"])
    # print(article_dict["hash"])
    # print(stats_dict)
        # llm_input_dict = llmInputConfArticle(str(article_dict["text"]),prompt_spectrum)
        # out_raw = apply_completions(llm_input_dict)
        # out_dict = outputDictParseCompletion(out_raw) #model_list[0]
    input_dict = llmInputConfEmbeddings(article_dict["text"],dimensions=100)
    out_raw = apply_embeddings(input_dict)
    out_dict = outputDictParseEmbeddings(out_raw) #model_list[0]
    # mat.append(out_dict["o_data"])
    # Drop the article text from both dictionaries before building the row.
    del article_dict["text"]
    del input_dict["i_text"]
    # NOTE(review): input_dict is merged last and carries "hash_key": None
    # (no hash_key was passed above), so it overrides the article's hash_key;
    # select_fields_emb does not keep "hash_key" anyway, and its "i_text"
    # column stays empty because the key was deleted just above.
    final_dict = article_dict | out_dict | input_dict
    # final_dict["content"] = ""
    # final_dict["hash_key"] = article_dict["hash"]
    df = addDictToDF(df,final_dict,select_fields_emb)
    
    # Checkpoint on every 5th article, including the very first one (count 0).
    if count%5 == 0 :
        saveDFcsv(df, save_path, filename_save+str(count),True)
    count = count + 1
    # print(final_dict.keys())
    # print(final_dict)
saveDFcsv(df, save_path, filename_save+"final",True)
display(df)


In [None]:
# def testQuestionsBatchEmbedding():
#     max_prompt=10 # 100
#     dim=10 # 100
#     save_every = 1000
#     df = None
#     # mat = []
#     prompt_list = getDataToQuerryListLLM(max_prompt,True)
#     # prompt_list = cfn_field("prompts","prompt_type","content","prompt_value",max_prompt) #
#     # dfemb = getStandardDfnumColumn(dim)
#     count = 0
#     for prompt in prompt_list :
#         print(prompt["hash_key"])
#         input_dict = llmInputConfEmbeddings(prompt["text"],dimensions=dim,hash_key=prompt["hash_key"])
#         out_raw = apply_embeddings(input_dict)
#         out_dict = outputDictParseEmbeddings(out_raw)
#         # mat.append(out_dict["o_data"])
#         final_dict = input_dict | out_dict
#         df = addDictToDF(df,final_dict,selected_fields_emp)
#         if count%save_every == 0 and count != 0:
#             saveDFcsv(df.set_index('hash_key'), save_path, filename_save+str(count),True)
#         count = count + 1
#     saveDFcsv(df.set_index('hash_key'), save_path, filename_save+"final_test_emb",True)
#     return df

# df = testQuestionsBatchEmbedding()
# display(df)


def testEmbeddings():
    """Embed up to 100 stored prompts (100 dimensions) and return the list of vectors.

    A result frame is also accumulated row by row, but only the list of
    ``o_data`` vectors is returned.
    """
    questions = cfn_field("prompts","prompt_type","content","prompt_value",100) #
    # NOTE(review): getStandardDfEmb is not defined in this file; presumably it
    # comes from the `utils_art` star import - confirm.
    df = getStandardDfEmb()
    vectors = []
    for question in questions :
        request = llmInputConfEmbeddings(question,dimensions=100)
        parsed = outputDictParseEmbeddings(apply_embeddings(request))
        vectors.append(parsed["o_data"])
        df = addDictToDF(df,request | parsed)
    return vectors
# Run the embedding test, stack the returned vectors into an array and write
# it with saveNP (the file loadNP reads back by default).
main_out = testEmbeddings()
np_out = np.array(main_out)
saveNP(np_out)

def testQuestionsDifferentParameters():
    """Query every prompt with each model/temperature pair and checkpoint to CSV.

    For each prompt returned by ``cfn_field`` the completion endpoint is
    called once per (model number, temperature) combination. The merged
    request/response dict is appended to a results frame, which is written
    with ``saveDFcsv`` every ``save_every`` calls (including the very first)
    and once more under the ``"final"`` suffix after the loop.

    Returns:
        None. Results are only written through ``saveDFcsv``.
    """
    model_nums = [0,1,2]
    temperatures = [0,0.25,0.5,0.75,1]
    max_prompt = 100
    save_every = 5
    prompts = cfn_field("prompts","prompt_type","content","prompt_value",max_prompt)
    df = getStandardDfComp()
    # Same visiting order as three nested loops: prompt, then model, then
    # temperature.
    runs = (
        (prompt, model_n, temperature_n)
        for prompt in prompts
        for model_n in model_nums
        for temperature_n in temperatures
    )
    for count, (prompt, model_n, temperature_n) in enumerate(runs):
        # NOTE(review): `llmInputConf` is not defined in the visible part of
        # this file; the visible builder with this signature is
        # `llmInputConfCompletion` - confirm which one is intended.
        input_dict = llmInputConf(prompt,model_num=model_n,temperature=temperature_n)
        out_dict = outputDictParseCompletion(apply_completions(input_dict))
        df = addDictToDF(df, input_dict | out_dict)
        if count % save_every == 0:
            saveDFcsv(df, save_path, filename_save+str(count),True)
    saveDFcsv(df, save_path, filename_save+"final",True)
# testQuestionsDifferentParameters()

# df = openDFcsv(path_model_list,filename_model_lis)
# display(df)

# Load up to 20 article files from open_path and print, for each one, its
# hash key followed by the statistics computed from its text.
getNumberOfArticles(open_path)
filename_list = loadArticleFolderList(open_path, 20)
article_dict_list = loadListArticleHash(open_path, filename_list)
for article_dict in article_dict_list:
    stats_dict = getStatsOnArticleText(article_dict["text"])
    print(article_dict["hash_key"], stats_dict, sep="\n")
    
    
# df = getStandardDfComp()
# Embed every article found under open_path and collect one row per article
# in `df`, checkpointing the frame to CSV every 5 articles and once at the end.
df = getStandardDfEmb()
getNumberOfArticles(open_path)
filename_list = loadArticleFolderList(open_path,5000) # ["fa897c02295f34ce2e15f602769edf204ea00be7.txt"]
article_dict_list = loadListArticleHash(open_path,filename_list)
count = 0
for article_dict in article_dict_list :
    article_dict["text"] = textListToText(article_dict["text"])
    # Forward the article's own hash. `input_dict` is merged last below, so if
    # the conf dict carries a "hash_key" entry (as llmInputConfCompletion's
    # does) the default None would otherwise overwrite the article's key.
    # `.get` keeps the previous default when an article has no "hash_key".
    input_dict = llmInputConfEmbeddings(
        article_dict["text"],
        dimensions=100,
        hash_key=article_dict.get("hash_key"),
    )
    out_raw = apply_embeddings(input_dict)
    out_dict = outputDictParseEmbeddings(out_raw)
    # Drop the full text from both dicts before merging so it is not stored
    # in the result row.
    del article_dict["text"]
    del input_dict["i_text"]
    final_dict = article_dict | out_dict | input_dict
    df = addDictToDF(df,final_dict,select_fields_emb)

    # Periodic checkpoint (also fires on the first article, count == 0).
    if count%5 == 0 :
        saveDFcsv(df, save_path, filename_save+str(count),True)
    count = count + 1
saveDFcsv(df, save_path, filename_save+"final",True)
display(df)

# 
# out_dict = outputDictParseCompletion(out_raw) #model_list[0]
# final_dict = input_dict | out_dict


# out_raw = outputDictParseCompletion(apply_completions(input_dict))
# out_dict = outputDictParseCompletion(out_raw) #model_list[0]
# final_dict = input_dict | out_dict 
# One completion request and one embedding request, each printed merged with
# the configuration that produced it.
# NOTE(review): `llmInputConf` is not defined in the visible part of this
# file; the visible builder is `llmInputConfCompletion` - confirm.
input_comp = llmInputConf("By U.S. law, exit signs must be one of what two colors?")
raw_comp = apply_completions(input_comp)
out_dict_comp = outputDictParseCompletion(raw_comp)

input_embed = llmInputConfEmbeddings("Your text string goes here")
raw_embed = apply_embeddings(input_embed)
out_dict_embed = outputDictParseEmbeddings(raw_embed)

print(out_dict_comp | input_comp)
print(out_dict_embed | input_embed)
print(type(out_dict_embed["o_data"]))
# response = llm_client.embeddings.create(
#     input="Your text string goes here",
#     model="text-embedding-3-small"
# )
# print(response)

# Read back the CSV stored under save_path with the "final" suffix
# (presumably the file written by the saveDFcsv(..., "final") calls) and
# inspect it.
df = openDFcsv(save_path,filename_save+"final")
display(df)
# Show the dtype each column has after being loaded.
print(df.dtypes)

C:\Users\User\OneDrive\Desktop\article\file_2\.bin\amazon
    
import plotly.express as px
def testEmbeddings():
    """Embed the stored prompts and return the parsed embedding payloads.

    Prompts come from ``cfn_field`` (``max_prompt`` is passed as its last
    argument, presumably as an upper bound on how many are returned). Each
    prompt is wrapped by ``llmInputConfEmbeddings`` with ``dimensions=100``,
    sent through ``apply_embeddings`` and parsed by
    ``outputDictParseEmbeddings``.

    Returns:
        list: the ``"o_data"`` entry of every parsed response, in the order
        the prompts were iterated.
    """
    max_prompt = 100
    dim = 100
    embeddings = []
    prompts = cfn_field("prompts","prompt_type","content","prompt_value",max_prompt)
    # The frame collects the merged request/response records while the loop
    # runs; it is neither saved nor returned by this function.
    df = getStandardDfEmb()
    for prompt in prompts:
        request = llmInputConfEmbeddings(prompt, dimensions=dim)
        parsed = outputDictParseEmbeddings(apply_embeddings(request))
        embeddings.append(parsed["o_data"])
        df = addDictToDF(df, request | parsed)
    return embeddings
# Run the embedding test, stack the returned list into one NumPy array and
# hand it to saveNP.
main_out = testEmbeddings()
np_out = np.asarray(main_out)
saveNP(np_out)

# Inspect the raw list returned by testEmbeddings().
print(main_out)
print(type(main_out))

# Inspect the same data after conversion with np.array.
print(np_out)
print(type(np_out))
print(np_out.shape)

# NOTE(review): `b` is not assigned anywhere in the visible part of this
# file - confirm it still exists before running this cell.
print(b)


# NOTE(review): `X_tsne` is not assigned in the visible part of this file;
# presumably the output of a t-SNE projection - confirm.
print(type(X_tsne))
print(X_tsne.shape)



# Toy t-SNE run: project four 3-D points down to 2-D.
X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# TSNE requires perplexity < n_samples; its default of 30 cannot be used with
# four points, so cap the value just below the sample count.
model = TSNE(n_components=2, random_state=0, perplexity=min(30.0, X.shape[0] - 1))
model.fit_transform(X) 


# Build a prompt from one downloaded article: prompt_date on the first line,
# followed by the article text after parseList.
loadArticleFolder()
open_path = "C:/Users/User/OneDrive/Desktop/article/file_2/article_download_main/"
filename = "4d54d5722e8e03a1e76159c2594ab4c83327a749"
text = openSTRtxt(open_path, filename)
prompt = "\n".join((str(prompt_date), parseList(text)))
print(prompt)

# Dump the "content_o" entry, then the token_* entries (presumably
# prompt/completion/total token counts - confirm), then the whole record.
print(final_dict["content_o"])
print("\n\n\n",final_dict["token_p"])
for token_field in ("token_c", "token_t"):
    print(final_dict[token_field])
print(final_dict)

# Build a prompt from one downloaded article: prompt_date on the first line,
# followed by the article text after parseList.
# NOTE(review): this open_path has no trailing "/" while another cell in this
# file sets the same folder with one - confirm how openSTRtxt joins the path
# and the filename.
open_path = "C:/Users/User/OneDrive/Desktop/article/file_2/article_download_main"
filename = "4d54d5722e8e03a1e76159c2594ab4c83327a749"
text = openSTRtxt(open_path, filename)
prompt = "\n".join((str(prompt_date), parseList(text)))
print(prompt)