In [None]:
# Name of the OpenAI chat model used for every request; it is also embedded in the log file name.
use_model = "gpt-4-0613"

# Import

In [None]:
import tiktoken 
import openai
import numpy as np
import tqdm
import random
import pandas as pd
import json
import glob
import os
import datetime

########################
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the placeholder is only used when it is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR-KEY")
########################
# NOTE: despite its name, `model` is the tiktoken encoder that check_tokens()
# uses for counting tokens, not the chat model (that one is `use_model`).
model = tiktoken.get_encoding("cl100k_base")


# ChatGPT API

In [None]:
# Build a per-run log file name from the model name and the start time:
# log/finance_api_<model>_<YYYY-MM-DDTHH_MM_SS>.log
now = datetime.datetime.now()
LOG_FILE = os.path.join("log", "finance_api_" + use_model + "_" + now.strftime('%Y-%m-%dT%H_%M_%S') + ".log")

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs("log", exist_ok=True)

print(LOG_FILE)

def get_completion(messages, model=use_model):
    """Send a chat request and return the text of the first returned choice.

    Args:
        messages: Chat messages passed unchanged to the API call.
        model: Name of the chat model; defaults to the value ``use_model``
            had when this function was defined.

    Returns:
        The ``"content"`` entry of the first choice's message.
    """
    request_options = {
        "model": model,
        "messages": messages,
        # Plain-text responses; the alternative {"type": "json_object"} is left disabled.
        "response_format": {"type": "text"},
        "temperature": 0,
    }
    completion = openai.ChatCompletion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

# Price of one token in US dollars.
# NOTE(review): confirm this rate matches the pricing of the model in use.
DOLLAR_PER_TOKEN = 0.002/1000
# Exchange rate used to convert the dollar cost into yen.
YEN_PER_DOLLAR = 139.69


# Compute the number of tokens and the cost.
def check_tokens(prompt, model):
    """Count the tokens of ``prompt`` and convert the count into a cost.

    Args:
        prompt: Text to measure.
        model: Object whose ``encode(prompt)`` method returns the tokens of
            the text (a tokenizer/encoder, not a chat model name).

    Returns:
        A tuple ``(num_tokens, cost_dollar, cost_yen)``.
    """
    num_tokens = len(model.encode(prompt))
    cost_dollar = num_tokens * DOLLAR_PER_TOKEN
    # Same left-to-right product as count * dollar rate * yen rate.
    cost_yen = cost_dollar * YEN_PER_DOLLAR
    return num_tokens, cost_dollar, cost_yen

def load_record(path, record2hantei):
    """Mark every record found in a JSON-lines log file as already seen.

    Each non-blank line of ``path`` is parsed as a JSON object and the key
    ``"<date>;<type>;<input>"`` built from it is set to 1 in
    ``record2hantei``.

    Args:
        path: Path of the JSON-lines file to read.
        record2hantei: Dictionary that is updated in place.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the "date", "type" or "input" field.
    """
    with open(path, "r") as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            # Blank lines (for example an empty line at the end of the file)
            # carry no record and must not be handed to the JSON parser.
            if not line.strip():
                continue
            data = json.loads(line)
            # NOTE(review): the records written by the main loop of this
            # notebook contain no "type" field - confirm which log format
            # this loader is meant to read.
            record = data["date"] + ";" + data["type"] + ";" + data["input"]
            record2hantei[record] = 1

# Prompt

In [None]:
# Instruction text placed in front of every request. The main loop appends
# "<prediction>;<reason>" directly after the final "=====" separator line.
prompt_template = """
The given text contains information about whether the price of a 40% Stock, 60% Bond Portfolio is expected to move by more than 2% within 5 days, where class 1 denotes a fall, class 2 denotes a rise, and class 0 implies no change (tie), along with the underlying brief reasoning that led the AI to make that decision.
Based on this information, analyze the cause-and-effect relationships underlying the AI's decision and explain them in detail. It is possible that multiple cause-and-effect relationships exist within the sentences providing the reasoning. Extract all of these, and create a node list and an edge list. When connecting causes and effects, use "--*" for paradoxical connections like "despite" or "but," and use "-->" for logical connections like "therefore." Also, connect the final formed cause-and-effect relationships with the decision using either "-->" or "--*" depending on the connection. For the decision nodes, indicate only the numbers representing the decisions (1, 2, or 0).
Be sure to output only the answer. Since the output will be processed mechanically, strictly follow the format shown in the example, and do not forget to include connection types ("-->" or "--*") as well as the nodes representing causes and effects.
Additionally, ensure that no phrases that do not exist in the original text are included in the edge list and confirm that the occurrence of phrases not present in the original text is 0.
Finally, output the result in JSON format.
Example: 
Input
0;Given the mixed market sentiment, slight volatility in interest rates, and decreasing market volatility, the portfolio is likely to experience minor fluctuations.; 
Output
{
  "nodes": ["mixed market sentiment", "slight volatility in interest rates", "decreasing market volatility", "portfolio is likely to experience minor fluctuations", "0"],
  "edges": [
    ["mixed market sentiment", "-->", "portfolio is likely to experience minor fluctuations"],
    ["slight volatility in interest rates", "-->", "portfolio is likely to experience minor fluctuations"],
    ["decreasing market volatility", "-->", "portfolio is likely to experience minor fluctuations"],
    ["portfolio is likely to experience minor fluctuations", "-->", "0"]
  ]
}
=====
"""

# Load

In [None]:
def find_csv_files(directory, filename="df_output.csv"):
    """Collect the paths of all files with a given name below a directory.

    Args:
        directory: Root directory; it and all of its subdirectories are searched.
        filename: Exact file name to look for.

    Returns:
        List of matching paths (parent directory joined with the file name),
        in the order ``os.walk`` visits them.
    """
    return [
        os.path.join(parent, entry)
        for parent, _subdirs, entries in os.walk(directory)
        for entry in entries
        if entry == filename
    ]

###########
# Root directory that is searched recursively for the CSV files to process.
directory_path = '/your/path/gpt-finance'  # Change this to your directory path
###########
# Paths of every "df_output.csv" (the default file name) found below directory_path.
csv_files = find_csv_files(directory_path)

In [None]:
# Read every discovered CSV and tag its rows with the persona it belongs to.
out_list = []
for csv_path in csv_files:
    # The persona name is the 10th "/"-separated component of the file path.
    # NOTE(review): index 9 is tied to the depth of the path - confirm it
    # still selects the persona directory after changing directory_path.
    path_parts = csv_path.split("/")
    persona = path_parts[9]
    frame = pd.read_csv(csv_path)
    frame["persona"] = persona
    out_list.append(frame)

# Stack the per-file frames into one table; each file keeps its own row labels.
df = pd.concat(out_list)

# Run settings

In [None]:
################################################
# Resume settings: START_FROM_SCRATCH == 1 skips loading earlier log files,
# START_INDEX is the first row position the run loop processes.
START_FROM_SCRATCH = 1
START_INDEX = 0
print(START_INDEX)
################################################

# Records collected from earlier log files (record key -> 1).
record2hantei = {}
# Earlier log files to load when not starting from scratch.
file_list = []

if START_FROM_SCRATCH != 1:
    for record_path in file_list:
        load_record(record_path, record2hantei)

print(LOG_FILE)

# JSON serialization helper

In [None]:
# A function that will convert numpy types to native types
def convert_to_python_type(obj):
    """Convert a NumPy value into a built-in type that ``json`` can serialize.

    Intended to be passed as the ``default`` argument of ``json.dumps``.

    Args:
        obj: Value that the JSON encoder could not serialize by itself.

    Returns:
        ``bool`` for NumPy booleans, ``int`` for NumPy integers, ``float``
        for NumPy floating-point numbers, and a list for NumPy arrays.

    Raises:
        TypeError: If ``obj`` is none of the supported NumPy types.
    """
    # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()  # Convert numpy array to list
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")


# Run

In [None]:
%%time
import time

# Make at most 5 passes over the rows. Every pass resumes at START_INDEX, the
# position of the first row that has not been written to the log yet.
for kkk in range(5):
    print("TRY: " + str(kkk) + ", Start " + str(START_INDEX))
    try:
        for i in tqdm.tqdm(range(START_INDEX,len(df))):

            prediction = df["prediction"].iloc[i]
            text = df["reason"].iloc[i]
            target = df["TargetPortDailyRtn"].iloc[i]
            date = df["Dates"].iloc[i]
            persona  = df["persona"].iloc[i]
            # The model input is "<prediction>;<reason>", appended to the prompt.
            t = str(prediction) + ";" + text
            prompt = prompt_template + t
            messages = [
                {
                    "role" : "user",
                    "content" : prompt
                }
            ]
            tmp_response1 = get_completion(messages)
            # Token count and cost are measured on the response text only.
            num_tokens1, cost_dollar1, cost_yen1 = check_tokens(tmp_response1, model)

            result = {
                "index": i,
                "input": t,
                "prediction":prediction,
                "target":target,
                "date": date,
                "persona": persona,
                "cost":{
                    1:{
                        "num_tokens1":num_tokens1,
                        "cost_dollar1":cost_dollar1,
                        "cost_yen1":cost_yen1
                    }
                },
                "answer":tmp_response1
            }

            # One JSON object per line; NumPy values are converted on the fly.
            with open(LOG_FILE, "a") as f:
                print(json.dumps(result,default=convert_to_python_type), file=f)
            # Row i is logged now, so any retry (or a re-run of this cell)
            # must continue with the next row instead of repeating this one.
            START_INDEX = i + 1

    except Exception as e:
        # START_INDEX still points at the row that failed, so the next pass
        # retries exactly that row. Catching Exception (not a bare except)
        # lets a keyboard interrupt stop the run.
        print("ERROR at index " + str(START_INDEX) + ": " + repr(e))
        # Short pause before retrying - presumably to ride out transient API errors.
        time.sleep(5.4)
    else:
        # The pass reached the end without an error: every row is logged, so
        # stop here instead of re-sending the last row on the remaining tries.
        break
        