In [4]:
import os
import re
import csv
import math
import time
import json
import random
import finnhub
import datasets
import pandas as pd
import yfinance as yf
from datetime import datetime
from collections import defaultdict
from datasets import Dataset
from openai import OpenAI

In [11]:
# 必要なライブラリのインポート
import yfinance as yf
from datetime import date, timedelta
import pandas as pd

# Ticker symbol and date-range settings
ticker = "NVDA"  # NVIDIA's ticker symbol
end_date = date.today()  # today's date
start_date = end_date - timedelta(days=365)  # 365 days before end_date (about one year)

# Helper that downloads stock price data
def get_stock_data(ticker, start_date, end_date):
    """
    Download price history for ``ticker`` through ``yf.download``.

    Args:
        ticker: Ticker symbol passed straight to ``yf.download``.
        start_date: Object with ``strftime`` (e.g. ``datetime.date``); sent as
            the ``start`` argument formatted ``YYYY-MM-DD``.
        end_date: Object with ``strftime``; sent as the ``end`` argument
            formatted ``YYYY-MM-DD``.

    Returns:
        Whatever ``yf.download`` returns for the requested range.
    """
    # Progress message is emitted before any date formatting happens.
    print(f"{ticker} の株価データを取得中: {start_date} から {end_date}")
    range_start = start_date.strftime("%Y-%m-%d")
    range_end = end_date.strftime("%Y-%m-%d")
    return yf.download(ticker, start=range_start, end=range_end)

# Fetch the stock price data
stock_data = get_stock_data(ticker, start_date, end_date)

# Print the data (for inspection)
print(stock_data)

# Save to CSV (optional)
output_file = f"{ticker}_1year_stock_data.csv"
stock_data.to_csv(output_file)
print(f"データを {output_file} に保存しました。")


NVDA の株価データを取得中: 2023-12-05 から 2024-12-04


[*********************100%***********************]  1 of 1 completed

Price        Adj Close       Close        High         Low        Open  \
Ticker            NVDA        NVDA        NVDA        NVDA        NVDA   
Date                                                                     
2023-12-05   46.556007   46.566002   46.599998   45.271000   45.466000   
2023-12-06   45.493237   45.502998   47.387001   45.411999   47.215000   
2023-12-07   46.586002   46.596001   46.629002   45.604000   45.700001   
2023-12-08   47.495808   47.506001   47.741001   46.549999   46.595001   
2023-12-11   46.616997   46.626999   47.530998   45.830002   47.491001   
...                ...         ...         ...         ...         ...   
2024-11-26  136.919998  136.919998  139.300003  135.669998  137.699997   
2024-11-27  135.339996  135.339996  137.220001  131.800003  135.009995   
2024-11-29  138.250000  138.250000  139.350006  136.050003  136.779999   
2024-12-02  138.630005  138.630005  140.449997  137.820007  138.830002   
2024-12-03  140.259995  140.259995  14




# Raw Financial Data Acquisition

In [4]:
def bin_mapping(ret):
    """Turn a fractional return into a discrete bin label.

    The label is a direction letter ('U' for ``ret >= 0``, 'D' otherwise)
    followed by the return magnitude in percent, rounded up to a whole number
    ('1' .. '5'), or '5+' when that number exceeds 5.

    Args:
        ret: Return expressed as a fraction (0.013 means 1.3%).

    Returns:
        A label such as 'U2', 'D3' or 'U5+'.
    """
    up_down = 'U' if ret >= 0 else 'D'
    # ceil() places e.g. 1.3% into bucket 2. A return of exactly 0 would ceil
    # to 0 and produce the out-of-range label 'U0', so clamp it into bucket 1.
    integer = max(1, math.ceil(abs(100 * ret)))

    return up_down + (str(integer) if integer <= 5 else '5+')


def get_returns(stock_symbol):
    """Download prices for ``stock_symbol`` and build a table of weekly returns.

    Prices between the module-level ``START_DATE`` and ``END_DATE`` are
    downloaded with yfinance, the 'Adj Close' series is resampled to weekly
    frequency with forward fill, and consecutive weekly values are paired up.

    Args:
        stock_symbol: Ticker symbol passed to ``yf.download``.

    Returns:
        A ``pandas.DataFrame`` with the columns 'Start Date', 'Start Price',
        'End Date', 'End Price', 'Weekly Returns' and 'Bin Label'
        (the latter produced by ``bin_mapping``).
    """
    # Ask for unadjusted OHLC explicitly so that the 'Adj Close' column is
    # returned; yfinance releases that default to auto_adjust=True omit it.
    stock_data = yf.download(stock_symbol, start=START_DATE, end=END_DATE, auto_adjust=False)

    # Resample to weekly frequency
    weekly_data = stock_data['Adj Close'].resample('W').ffill()
    # The first pct_change entry has no predecessor, so it is dropped.
    weekly_returns = weekly_data.pct_change()[1:]
    weekly_start_prices = weekly_data[:-1]
    weekly_end_prices = weekly_data[1:]

    # Build the DataFrame; flatten() turns the values into 1-D arrays
    # (the selection may be a single-column frame rather than a series).
    weekly_data = pd.DataFrame({
        'Start Date': weekly_start_prices.index,
        'Start Price': weekly_start_prices.values.flatten(),
        'End Date': weekly_end_prices.index,
        'End Price': weekly_end_prices.values.flatten(),
        'Weekly Returns': weekly_returns.values.flatten()
    })

    # Map each weekly return to its bin label
    weekly_data['Bin Label'] = weekly_data['Weekly Returns'].map(bin_mapping)

    return weekly_data



def get_news(symbol, data):
    """Attach one JSON-encoded list of company news to every row of ``data``.

    For each row, news between the row's 'Start Date' and 'End Date' is
    requested from the module-level ``finnhub_client``. Each item is reduced to
    'date' (``%Y%m%d%H%M%S``, built with ``datetime.fromtimestamp``, i.e. in
    the local timezone), 'headline' and 'summary', and the items are sorted by
    'date'.

    Args:
        symbol: Ticker symbol passed to ``finnhub_client.company_news``.
        data: DataFrame whose 'Start Date' / 'End Date' cells support
            ``strftime``. It is modified in place.

    Returns:
        The same ``data`` object with an added 'News' column of JSON strings.
    """
    encoded_weeks = []

    for _, week in data.iterrows():
        start_date = week['Start Date'].strftime('%Y-%m-%d')
        end_date = week['End Date'].strftime('%Y-%m-%d')
        print(symbol, ': ', start_date, ' - ', end_date)
        time.sleep(1)  # throttle to stay within the API's per-minute quota

        fetched = finnhub_client.company_news(symbol, _from=start_date, to=end_date)

        items = []
        for article in fetched:
            stamp = datetime.fromtimestamp(article['datetime'])
            items.append({
                "date": stamp.strftime('%Y%m%d%H%M%S'),
                "headline": article['headline'],
                "summary": article['summary'],
            })
        items.sort(key=lambda entry: entry['date'])

        encoded_weeks.append(json.dumps(items))

    data['News'] = encoded_weeks

    return data


def get_basics(symbol, data, always=False):
    """Attach a JSON-encoded quarterly financial report to every row of ``data``.

    The quarterly series returned by ``finnhub_client.company_basic_financials``
    are regrouped into one record per reporting period. For each row, the most
    recent record whose period is strictly before the row's 'End Date' is used,
    provided that either ``always`` is true or the period is not earlier than
    the 'Start Date' two rows back (``START_DATE`` for the first two rows).
    Rows without a matching record get an empty JSON object.

    Args:
        symbol: Ticker symbol passed to the Finnhub client.
        data: DataFrame with 'Start Date' and 'End Date' columns; modified in
            place. The ``i - 2`` lookup assumes integer row labels
            (presumably a default RangeIndex -- confirm against the caller).
        always: When true, ignore the lower bound of the window.

    Returns:
        The same ``data`` object with an added 'Basics' column of JSON strings.
    """
    basic_financials = finnhub_client.company_basic_financials(symbol, 'all')

    # Pivot {metric: [{period, v}, ...]} into {period: {metric: v, ...}}.
    by_period = defaultdict(dict)
    for metric, points in basic_financials['series']['quarterly'].items():
        for point in points:
            by_period[point['period']][metric] = point['v']

    reports = []
    for period, record in by_period.items():
        record['period'] = period
        reports.append(record)
    reports.sort(key=lambda rec: rec['period'])

    per_row = []
    for i, row in data.iterrows():
        # Upper bound (exclusive) of the window is the row's End Date.
        start_date = row['End Date'].strftime('%Y-%m-%d')
        if i < 2:
            last_start_date = START_DATE
        else:
            last_start_date = data.loc[i-2, 'Start Date'].strftime('%Y-%m-%d')

        # Walk newest-to-oldest and keep the first report inside the window.
        chosen = next(
            (
                report for report in reversed(reports)
                if (always and report['period'] < start_date)
                or (last_start_date <= report['period'] < start_date)
            ),
            {},
        )
        per_row.append(json.dumps(chosen))

    data['Basics'] = per_row

    return data
    

def prepare_data_for_company(symbol, with_basics=True):
    """Build the weekly dataset for ``symbol`` and write it to a CSV file.

    Weekly returns come from ``get_returns`` and news from ``get_news``. With
    ``with_basics`` the 'Basics' column is filled by ``get_basics``; otherwise
    every row gets an empty JSON object and the file name carries a
    ``_nobasics`` suffix. Files are written under the module-level ``DATA_DIR``.

    Args:
        symbol: Ticker symbol to process.
        with_basics: Whether to fetch basic financials.

    Returns:
        The assembled DataFrame.
    """
    data = get_news(symbol, get_returns(symbol))

    if not with_basics:
        data['Basics'] = [json.dumps({})] * len(data)
        data.to_csv(f"{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics.csv")
        return data

    data = get_basics(symbol, data)
    data.to_csv(f"{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}.csv")
    return data


In [5]:
# Ticker symbols to process (presumably the Dow Jones Industrial Average
# constituents at the time of writing -- verify before reuse).
DOW_30 = [
    "AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO",
    "CVX", "GS", "HD", "HON", "IBM", "INTC",
    "JNJ", "KO", "JPM", "MCD", "MMM", "MRK",
    "MSFT", "NKE", "PG", "TRV", "UNH", "CRM",
    "VZ", "V", "WBA", "WMT", "DIS", "DOW",
]

# prepare_data_for_company("DOW", False)

In [6]:
# Build and save the weekly dataset (with basic financials) for every symbol
# in DOW_30; the commented-out call below is the variant without basics.
for symbol in DOW_30:
    prepare_data_for_company(symbol)
#     prepare_data_for_company(symbol, False)

[*********************100%***********************]  1 of 1 completed


AXP :  2023-01-08  -  2023-01-15
AXP :  2023-01-15  -  2023-01-22
AXP :  2023-01-22  -  2023-01-29
AXP :  2023-01-29  -  2023-02-05
AXP :  2023-02-05  -  2023-02-12
AXP :  2023-02-12  -  2023-02-19
AXP :  2023-02-19  -  2023-02-26
AXP :  2023-02-26  -  2023-03-05
AXP :  2023-03-05  -  2023-03-12
AXP :  2023-03-12  -  2023-03-19
AXP :  2023-03-19  -  2023-03-26
AXP :  2023-03-26  -  2023-04-02
AXP :  2023-04-02  -  2023-04-09
AXP :  2023-04-09  -  2023-04-16
AXP :  2023-04-16  -  2023-04-23
AXP :  2023-04-23  -  2023-04-30
AXP :  2023-04-30  -  2023-05-07
AXP :  2023-05-07  -  2023-05-14
AXP :  2023-05-14  -  2023-05-21
AXP :  2023-05-21  -  2023-05-28
AXP :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


AMGN :  2023-01-08  -  2023-01-15
AMGN :  2023-01-15  -  2023-01-22
AMGN :  2023-01-22  -  2023-01-29
AMGN :  2023-01-29  -  2023-02-05
AMGN :  2023-02-05  -  2023-02-12
AMGN :  2023-02-12  -  2023-02-19
AMGN :  2023-02-19  -  2023-02-26
AMGN :  2023-02-26  -  2023-03-05
AMGN :  2023-03-05  -  2023-03-12
AMGN :  2023-03-12  -  2023-03-19
AMGN :  2023-03-19  -  2023-03-26
AMGN :  2023-03-26  -  2023-04-02
AMGN :  2023-04-02  -  2023-04-09
AMGN :  2023-04-09  -  2023-04-16
AMGN :  2023-04-16  -  2023-04-23
AMGN :  2023-04-23  -  2023-04-30
AMGN :  2023-04-30  -  2023-05-07
AMGN :  2023-05-07  -  2023-05-14
AMGN :  2023-05-14  -  2023-05-21
AMGN :  2023-05-21  -  2023-05-28
AMGN :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


AAPL :  2023-01-08  -  2023-01-15
AAPL :  2023-01-15  -  2023-01-22
AAPL :  2023-01-22  -  2023-01-29
AAPL :  2023-01-29  -  2023-02-05
AAPL :  2023-02-05  -  2023-02-12
AAPL :  2023-02-12  -  2023-02-19
AAPL :  2023-02-19  -  2023-02-26
AAPL :  2023-02-26  -  2023-03-05
AAPL :  2023-03-05  -  2023-03-12
AAPL :  2023-03-12  -  2023-03-19
AAPL :  2023-03-19  -  2023-03-26
AAPL :  2023-03-26  -  2023-04-02
AAPL :  2023-04-02  -  2023-04-09
AAPL :  2023-04-09  -  2023-04-16
AAPL :  2023-04-16  -  2023-04-23
AAPL :  2023-04-23  -  2023-04-30
AAPL :  2023-04-30  -  2023-05-07
AAPL :  2023-05-07  -  2023-05-14
AAPL :  2023-05-14  -  2023-05-21
AAPL :  2023-05-21  -  2023-05-28
AAPL :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


BA :  2023-01-08  -  2023-01-15
BA :  2023-01-15  -  2023-01-22
BA :  2023-01-22  -  2023-01-29
BA :  2023-01-29  -  2023-02-05
BA :  2023-02-05  -  2023-02-12
BA :  2023-02-12  -  2023-02-19
BA :  2023-02-19  -  2023-02-26
BA :  2023-02-26  -  2023-03-05
BA :  2023-03-05  -  2023-03-12
BA :  2023-03-12  -  2023-03-19
BA :  2023-03-19  -  2023-03-26
BA :  2023-03-26  -  2023-04-02
BA :  2023-04-02  -  2023-04-09
BA :  2023-04-09  -  2023-04-16
BA :  2023-04-16  -  2023-04-23
BA :  2023-04-23  -  2023-04-30
BA :  2023-04-30  -  2023-05-07
BA :  2023-05-07  -  2023-05-14
BA :  2023-05-14  -  2023-05-21
BA :  2023-05-21  -  2023-05-28
BA :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


CAT :  2023-01-08  -  2023-01-15
CAT :  2023-01-15  -  2023-01-22
CAT :  2023-01-22  -  2023-01-29
CAT :  2023-01-29  -  2023-02-05
CAT :  2023-02-05  -  2023-02-12
CAT :  2023-02-12  -  2023-02-19
CAT :  2023-02-19  -  2023-02-26
CAT :  2023-02-26  -  2023-03-05
CAT :  2023-03-05  -  2023-03-12
CAT :  2023-03-12  -  2023-03-19
CAT :  2023-03-19  -  2023-03-26
CAT :  2023-03-26  -  2023-04-02
CAT :  2023-04-02  -  2023-04-09
CAT :  2023-04-09  -  2023-04-16
CAT :  2023-04-16  -  2023-04-23
CAT :  2023-04-23  -  2023-04-30
CAT :  2023-04-30  -  2023-05-07
CAT :  2023-05-07  -  2023-05-14
CAT :  2023-05-14  -  2023-05-21
CAT :  2023-05-21  -  2023-05-28
CAT :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


CSCO :  2023-01-08  -  2023-01-15
CSCO :  2023-01-15  -  2023-01-22
CSCO :  2023-01-22  -  2023-01-29
CSCO :  2023-01-29  -  2023-02-05
CSCO :  2023-02-05  -  2023-02-12
CSCO :  2023-02-12  -  2023-02-19
CSCO :  2023-02-19  -  2023-02-26
CSCO :  2023-02-26  -  2023-03-05
CSCO :  2023-03-05  -  2023-03-12
CSCO :  2023-03-12  -  2023-03-19
CSCO :  2023-03-19  -  2023-03-26
CSCO :  2023-03-26  -  2023-04-02
CSCO :  2023-04-02  -  2023-04-09
CSCO :  2023-04-09  -  2023-04-16
CSCO :  2023-04-16  -  2023-04-23
CSCO :  2023-04-23  -  2023-04-30
CSCO :  2023-04-30  -  2023-05-07
CSCO :  2023-05-07  -  2023-05-14
CSCO :  2023-05-14  -  2023-05-21
CSCO :  2023-05-21  -  2023-05-28
CSCO :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


CVX :  2023-01-08  -  2023-01-15
CVX :  2023-01-15  -  2023-01-22
CVX :  2023-01-22  -  2023-01-29
CVX :  2023-01-29  -  2023-02-05
CVX :  2023-02-05  -  2023-02-12
CVX :  2023-02-12  -  2023-02-19
CVX :  2023-02-19  -  2023-02-26
CVX :  2023-02-26  -  2023-03-05
CVX :  2023-03-05  -  2023-03-12
CVX :  2023-03-12  -  2023-03-19
CVX :  2023-03-19  -  2023-03-26
CVX :  2023-03-26  -  2023-04-02
CVX :  2023-04-02  -  2023-04-09
CVX :  2023-04-09  -  2023-04-16
CVX :  2023-04-16  -  2023-04-23
CVX :  2023-04-23  -  2023-04-30
CVX :  2023-04-30  -  2023-05-07
CVX :  2023-05-07  -  2023-05-14
CVX :  2023-05-14  -  2023-05-21
CVX :  2023-05-21  -  2023-05-28
CVX :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


GS :  2023-01-08  -  2023-01-15
GS :  2023-01-15  -  2023-01-22
GS :  2023-01-22  -  2023-01-29
GS :  2023-01-29  -  2023-02-05
GS :  2023-02-05  -  2023-02-12
GS :  2023-02-12  -  2023-02-19
GS :  2023-02-19  -  2023-02-26
GS :  2023-02-26  -  2023-03-05
GS :  2023-03-05  -  2023-03-12
GS :  2023-03-12  -  2023-03-19
GS :  2023-03-19  -  2023-03-26
GS :  2023-03-26  -  2023-04-02
GS :  2023-04-02  -  2023-04-09
GS :  2023-04-09  -  2023-04-16
GS :  2023-04-16  -  2023-04-23
GS :  2023-04-23  -  2023-04-30
GS :  2023-04-30  -  2023-05-07
GS :  2023-05-07  -  2023-05-14
GS :  2023-05-14  -  2023-05-21
GS :  2023-05-21  -  2023-05-28
GS :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


HD :  2023-01-08  -  2023-01-15
HD :  2023-01-15  -  2023-01-22
HD :  2023-01-22  -  2023-01-29
HD :  2023-01-29  -  2023-02-05
HD :  2023-02-05  -  2023-02-12
HD :  2023-02-12  -  2023-02-19
HD :  2023-02-19  -  2023-02-26
HD :  2023-02-26  -  2023-03-05
HD :  2023-03-05  -  2023-03-12
HD :  2023-03-12  -  2023-03-19
HD :  2023-03-19  -  2023-03-26
HD :  2023-03-26  -  2023-04-02
HD :  2023-04-02  -  2023-04-09
HD :  2023-04-09  -  2023-04-16
HD :  2023-04-16  -  2023-04-23
HD :  2023-04-23  -  2023-04-30
HD :  2023-04-30  -  2023-05-07
HD :  2023-05-07  -  2023-05-14
HD :  2023-05-14  -  2023-05-21
HD :  2023-05-21  -  2023-05-28
HD :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


HON :  2023-01-08  -  2023-01-15
HON :  2023-01-15  -  2023-01-22
HON :  2023-01-22  -  2023-01-29
HON :  2023-01-29  -  2023-02-05
HON :  2023-02-05  -  2023-02-12
HON :  2023-02-12  -  2023-02-19
HON :  2023-02-19  -  2023-02-26
HON :  2023-02-26  -  2023-03-05
HON :  2023-03-05  -  2023-03-12
HON :  2023-03-12  -  2023-03-19
HON :  2023-03-19  -  2023-03-26
HON :  2023-03-26  -  2023-04-02
HON :  2023-04-02  -  2023-04-09
HON :  2023-04-09  -  2023-04-16
HON :  2023-04-16  -  2023-04-23
HON :  2023-04-23  -  2023-04-30
HON :  2023-04-30  -  2023-05-07
HON :  2023-05-07  -  2023-05-14
HON :  2023-05-14  -  2023-05-21
HON :  2023-05-21  -  2023-05-28
HON :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


IBM :  2023-01-08  -  2023-01-15
IBM :  2023-01-15  -  2023-01-22
IBM :  2023-01-22  -  2023-01-29
IBM :  2023-01-29  -  2023-02-05
IBM :  2023-02-05  -  2023-02-12
IBM :  2023-02-12  -  2023-02-19
IBM :  2023-02-19  -  2023-02-26
IBM :  2023-02-26  -  2023-03-05
IBM :  2023-03-05  -  2023-03-12
IBM :  2023-03-12  -  2023-03-19
IBM :  2023-03-19  -  2023-03-26
IBM :  2023-03-26  -  2023-04-02
IBM :  2023-04-02  -  2023-04-09
IBM :  2023-04-09  -  2023-04-16
IBM :  2023-04-16  -  2023-04-23
IBM :  2023-04-23  -  2023-04-30
IBM :  2023-04-30  -  2023-05-07
IBM :  2023-05-07  -  2023-05-14
IBM :  2023-05-14  -  2023-05-21
IBM :  2023-05-21  -  2023-05-28
IBM :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


INTC :  2023-01-08  -  2023-01-15
INTC :  2023-01-15  -  2023-01-22
INTC :  2023-01-22  -  2023-01-29
INTC :  2023-01-29  -  2023-02-05
INTC :  2023-02-05  -  2023-02-12
INTC :  2023-02-12  -  2023-02-19
INTC :  2023-02-19  -  2023-02-26
INTC :  2023-02-26  -  2023-03-05
INTC :  2023-03-05  -  2023-03-12
INTC :  2023-03-12  -  2023-03-19
INTC :  2023-03-19  -  2023-03-26
INTC :  2023-03-26  -  2023-04-02
INTC :  2023-04-02  -  2023-04-09
INTC :  2023-04-09  -  2023-04-16
INTC :  2023-04-16  -  2023-04-23
INTC :  2023-04-23  -  2023-04-30
INTC :  2023-04-30  -  2023-05-07
INTC :  2023-05-07  -  2023-05-14
INTC :  2023-05-14  -  2023-05-21
INTC :  2023-05-21  -  2023-05-28
INTC :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


JNJ :  2023-01-08  -  2023-01-15
JNJ :  2023-01-15  -  2023-01-22
JNJ :  2023-01-22  -  2023-01-29
JNJ :  2023-01-29  -  2023-02-05
JNJ :  2023-02-05  -  2023-02-12
JNJ :  2023-02-12  -  2023-02-19
JNJ :  2023-02-19  -  2023-02-26
JNJ :  2023-02-26  -  2023-03-05
JNJ :  2023-03-05  -  2023-03-12
JNJ :  2023-03-12  -  2023-03-19
JNJ :  2023-03-19  -  2023-03-26
JNJ :  2023-03-26  -  2023-04-02
JNJ :  2023-04-02  -  2023-04-09
JNJ :  2023-04-09  -  2023-04-16
JNJ :  2023-04-16  -  2023-04-23
JNJ :  2023-04-23  -  2023-04-30
JNJ :  2023-04-30  -  2023-05-07
JNJ :  2023-05-07  -  2023-05-14
JNJ :  2023-05-14  -  2023-05-21
JNJ :  2023-05-21  -  2023-05-28
JNJ :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


KO :  2023-01-08  -  2023-01-15
KO :  2023-01-15  -  2023-01-22
KO :  2023-01-22  -  2023-01-29
KO :  2023-01-29  -  2023-02-05
KO :  2023-02-05  -  2023-02-12
KO :  2023-02-12  -  2023-02-19
KO :  2023-02-19  -  2023-02-26
KO :  2023-02-26  -  2023-03-05
KO :  2023-03-05  -  2023-03-12
KO :  2023-03-12  -  2023-03-19
KO :  2023-03-19  -  2023-03-26
KO :  2023-03-26  -  2023-04-02
KO :  2023-04-02  -  2023-04-09
KO :  2023-04-09  -  2023-04-16
KO :  2023-04-16  -  2023-04-23
KO :  2023-04-23  -  2023-04-30
KO :  2023-04-30  -  2023-05-07
KO :  2023-05-07  -  2023-05-14
KO :  2023-05-14  -  2023-05-21
KO :  2023-05-21  -  2023-05-28
KO :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


JPM :  2023-01-08  -  2023-01-15
JPM :  2023-01-15  -  2023-01-22
JPM :  2023-01-22  -  2023-01-29
JPM :  2023-01-29  -  2023-02-05
JPM :  2023-02-05  -  2023-02-12
JPM :  2023-02-12  -  2023-02-19
JPM :  2023-02-19  -  2023-02-26
JPM :  2023-02-26  -  2023-03-05
JPM :  2023-03-05  -  2023-03-12
JPM :  2023-03-12  -  2023-03-19
JPM :  2023-03-19  -  2023-03-26
JPM :  2023-03-26  -  2023-04-02
JPM :  2023-04-02  -  2023-04-09
JPM :  2023-04-09  -  2023-04-16
JPM :  2023-04-16  -  2023-04-23
JPM :  2023-04-23  -  2023-04-30
JPM :  2023-04-30  -  2023-05-07
JPM :  2023-05-07  -  2023-05-14
JPM :  2023-05-14  -  2023-05-21
JPM :  2023-05-21  -  2023-05-28
JPM :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


MCD :  2023-01-08  -  2023-01-15
MCD :  2023-01-15  -  2023-01-22
MCD :  2023-01-22  -  2023-01-29
MCD :  2023-01-29  -  2023-02-05
MCD :  2023-02-05  -  2023-02-12
MCD :  2023-02-12  -  2023-02-19
MCD :  2023-02-19  -  2023-02-26
MCD :  2023-02-26  -  2023-03-05
MCD :  2023-03-05  -  2023-03-12
MCD :  2023-03-12  -  2023-03-19
MCD :  2023-03-19  -  2023-03-26
MCD :  2023-03-26  -  2023-04-02
MCD :  2023-04-02  -  2023-04-09
MCD :  2023-04-09  -  2023-04-16
MCD :  2023-04-16  -  2023-04-23
MCD :  2023-04-23  -  2023-04-30
MCD :  2023-04-30  -  2023-05-07
MCD :  2023-05-07  -  2023-05-14
MCD :  2023-05-14  -  2023-05-21
MCD :  2023-05-21  -  2023-05-28
MCD :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


MMM :  2023-01-08  -  2023-01-15
MMM :  2023-01-15  -  2023-01-22
MMM :  2023-01-22  -  2023-01-29
MMM :  2023-01-29  -  2023-02-05
MMM :  2023-02-05  -  2023-02-12
MMM :  2023-02-12  -  2023-02-19
MMM :  2023-02-19  -  2023-02-26
MMM :  2023-02-26  -  2023-03-05
MMM :  2023-03-05  -  2023-03-12
MMM :  2023-03-12  -  2023-03-19
MMM :  2023-03-19  -  2023-03-26
MMM :  2023-03-26  -  2023-04-02
MMM :  2023-04-02  -  2023-04-09
MMM :  2023-04-09  -  2023-04-16
MMM :  2023-04-16  -  2023-04-23
MMM :  2023-04-23  -  2023-04-30
MMM :  2023-04-30  -  2023-05-07
MMM :  2023-05-07  -  2023-05-14
MMM :  2023-05-14  -  2023-05-21
MMM :  2023-05-21  -  2023-05-28
MMM :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


MRK :  2023-01-08  -  2023-01-15
MRK :  2023-01-15  -  2023-01-22
MRK :  2023-01-22  -  2023-01-29
MRK :  2023-01-29  -  2023-02-05
MRK :  2023-02-05  -  2023-02-12
MRK :  2023-02-12  -  2023-02-19
MRK :  2023-02-19  -  2023-02-26
MRK :  2023-02-26  -  2023-03-05
MRK :  2023-03-05  -  2023-03-12
MRK :  2023-03-12  -  2023-03-19
MRK :  2023-03-19  -  2023-03-26
MRK :  2023-03-26  -  2023-04-02
MRK :  2023-04-02  -  2023-04-09
MRK :  2023-04-09  -  2023-04-16
MRK :  2023-04-16  -  2023-04-23
MRK :  2023-04-23  -  2023-04-30
MRK :  2023-04-30  -  2023-05-07
MRK :  2023-05-07  -  2023-05-14
MRK :  2023-05-14  -  2023-05-21
MRK :  2023-05-21  -  2023-05-28
MRK :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


MSFT :  2023-01-08  -  2023-01-15
MSFT :  2023-01-15  -  2023-01-22
MSFT :  2023-01-22  -  2023-01-29
MSFT :  2023-01-29  -  2023-02-05
MSFT :  2023-02-05  -  2023-02-12
MSFT :  2023-02-12  -  2023-02-19
MSFT :  2023-02-19  -  2023-02-26
MSFT :  2023-02-26  -  2023-03-05
MSFT :  2023-03-05  -  2023-03-12
MSFT :  2023-03-12  -  2023-03-19
MSFT :  2023-03-19  -  2023-03-26
MSFT :  2023-03-26  -  2023-04-02
MSFT :  2023-04-02  -  2023-04-09
MSFT :  2023-04-09  -  2023-04-16
MSFT :  2023-04-16  -  2023-04-23
MSFT :  2023-04-23  -  2023-04-30
MSFT :  2023-04-30  -  2023-05-07
MSFT :  2023-05-07  -  2023-05-14
MSFT :  2023-05-14  -  2023-05-21
MSFT :  2023-05-21  -  2023-05-28
MSFT :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


NKE :  2023-01-08  -  2023-01-15
NKE :  2023-01-15  -  2023-01-22
NKE :  2023-01-22  -  2023-01-29
NKE :  2023-01-29  -  2023-02-05
NKE :  2023-02-05  -  2023-02-12
NKE :  2023-02-12  -  2023-02-19
NKE :  2023-02-19  -  2023-02-26
NKE :  2023-02-26  -  2023-03-05
NKE :  2023-03-05  -  2023-03-12
NKE :  2023-03-12  -  2023-03-19
NKE :  2023-03-19  -  2023-03-26
NKE :  2023-03-26  -  2023-04-02
NKE :  2023-04-02  -  2023-04-09
NKE :  2023-04-09  -  2023-04-16
NKE :  2023-04-16  -  2023-04-23
NKE :  2023-04-23  -  2023-04-30
NKE :  2023-04-30  -  2023-05-07
NKE :  2023-05-07  -  2023-05-14
NKE :  2023-05-14  -  2023-05-21
NKE :  2023-05-21  -  2023-05-28
NKE :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


PG :  2023-01-08  -  2023-01-15
PG :  2023-01-15  -  2023-01-22
PG :  2023-01-22  -  2023-01-29
PG :  2023-01-29  -  2023-02-05
PG :  2023-02-05  -  2023-02-12
PG :  2023-02-12  -  2023-02-19
PG :  2023-02-19  -  2023-02-26
PG :  2023-02-26  -  2023-03-05
PG :  2023-03-05  -  2023-03-12
PG :  2023-03-12  -  2023-03-19
PG :  2023-03-19  -  2023-03-26
PG :  2023-03-26  -  2023-04-02
PG :  2023-04-02  -  2023-04-09
PG :  2023-04-09  -  2023-04-16
PG :  2023-04-16  -  2023-04-23
PG :  2023-04-23  -  2023-04-30
PG :  2023-04-30  -  2023-05-07
PG :  2023-05-07  -  2023-05-14
PG :  2023-05-14  -  2023-05-21
PG :  2023-05-21  -  2023-05-28
PG :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


TRV :  2023-01-08  -  2023-01-15
TRV :  2023-01-15  -  2023-01-22
TRV :  2023-01-22  -  2023-01-29
TRV :  2023-01-29  -  2023-02-05
TRV :  2023-02-05  -  2023-02-12
TRV :  2023-02-12  -  2023-02-19
TRV :  2023-02-19  -  2023-02-26
TRV :  2023-02-26  -  2023-03-05
TRV :  2023-03-05  -  2023-03-12
TRV :  2023-03-12  -  2023-03-19
TRV :  2023-03-19  -  2023-03-26
TRV :  2023-03-26  -  2023-04-02
TRV :  2023-04-02  -  2023-04-09
TRV :  2023-04-09  -  2023-04-16
TRV :  2023-04-16  -  2023-04-23
TRV :  2023-04-23  -  2023-04-30
TRV :  2023-04-30  -  2023-05-07
TRV :  2023-05-07  -  2023-05-14
TRV :  2023-05-14  -  2023-05-21
TRV :  2023-05-21  -  2023-05-28
TRV :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


UNH :  2023-01-08  -  2023-01-15
UNH :  2023-01-15  -  2023-01-22
UNH :  2023-01-22  -  2023-01-29
UNH :  2023-01-29  -  2023-02-05
UNH :  2023-02-05  -  2023-02-12
UNH :  2023-02-12  -  2023-02-19
UNH :  2023-02-19  -  2023-02-26
UNH :  2023-02-26  -  2023-03-05
UNH :  2023-03-05  -  2023-03-12
UNH :  2023-03-12  -  2023-03-19
UNH :  2023-03-19  -  2023-03-26
UNH :  2023-03-26  -  2023-04-02
UNH :  2023-04-02  -  2023-04-09
UNH :  2023-04-09  -  2023-04-16
UNH :  2023-04-16  -  2023-04-23
UNH :  2023-04-23  -  2023-04-30
UNH :  2023-04-30  -  2023-05-07
UNH :  2023-05-07  -  2023-05-14
UNH :  2023-05-14  -  2023-05-21
UNH :  2023-05-21  -  2023-05-28
UNH :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


CRM :  2023-01-08  -  2023-01-15
CRM :  2023-01-15  -  2023-01-22
CRM :  2023-01-22  -  2023-01-29
CRM :  2023-01-29  -  2023-02-05
CRM :  2023-02-05  -  2023-02-12
CRM :  2023-02-12  -  2023-02-19
CRM :  2023-02-19  -  2023-02-26
CRM :  2023-02-26  -  2023-03-05
CRM :  2023-03-05  -  2023-03-12
CRM :  2023-03-12  -  2023-03-19
CRM :  2023-03-19  -  2023-03-26
CRM :  2023-03-26  -  2023-04-02
CRM :  2023-04-02  -  2023-04-09
CRM :  2023-04-09  -  2023-04-16
CRM :  2023-04-16  -  2023-04-23
CRM :  2023-04-23  -  2023-04-30
CRM :  2023-04-30  -  2023-05-07
CRM :  2023-05-07  -  2023-05-14
CRM :  2023-05-14  -  2023-05-21
CRM :  2023-05-21  -  2023-05-28
CRM :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


VZ :  2023-01-08  -  2023-01-15
VZ :  2023-01-15  -  2023-01-22
VZ :  2023-01-22  -  2023-01-29
VZ :  2023-01-29  -  2023-02-05
VZ :  2023-02-05  -  2023-02-12
VZ :  2023-02-12  -  2023-02-19
VZ :  2023-02-19  -  2023-02-26
VZ :  2023-02-26  -  2023-03-05
VZ :  2023-03-05  -  2023-03-12
VZ :  2023-03-12  -  2023-03-19
VZ :  2023-03-19  -  2023-03-26
VZ :  2023-03-26  -  2023-04-02
VZ :  2023-04-02  -  2023-04-09
VZ :  2023-04-09  -  2023-04-16
VZ :  2023-04-16  -  2023-04-23
VZ :  2023-04-23  -  2023-04-30
VZ :  2023-04-30  -  2023-05-07
VZ :  2023-05-07  -  2023-05-14
VZ :  2023-05-14  -  2023-05-21
VZ :  2023-05-21  -  2023-05-28
VZ :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


V :  2023-01-08  -  2023-01-15
V :  2023-01-15  -  2023-01-22
V :  2023-01-22  -  2023-01-29
V :  2023-01-29  -  2023-02-05
V :  2023-02-05  -  2023-02-12
V :  2023-02-12  -  2023-02-19
V :  2023-02-19  -  2023-02-26
V :  2023-02-26  -  2023-03-05
V :  2023-03-05  -  2023-03-12
V :  2023-03-12  -  2023-03-19
V :  2023-03-19  -  2023-03-26
V :  2023-03-26  -  2023-04-02
V :  2023-04-02  -  2023-04-09
V :  2023-04-09  -  2023-04-16
V :  2023-04-16  -  2023-04-23
V :  2023-04-23  -  2023-04-30
V :  2023-04-30  -  2023-05-07
V :  2023-05-07  -  2023-05-14
V :  2023-05-14  -  2023-05-21
V :  2023-05-21  -  2023-05-28
V :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


WBA :  2023-01-08  -  2023-01-15
WBA :  2023-01-15  -  2023-01-22
WBA :  2023-01-22  -  2023-01-29
WBA :  2023-01-29  -  2023-02-05
WBA :  2023-02-05  -  2023-02-12
WBA :  2023-02-12  -  2023-02-19
WBA :  2023-02-19  -  2023-02-26
WBA :  2023-02-26  -  2023-03-05
WBA :  2023-03-05  -  2023-03-12
WBA :  2023-03-12  -  2023-03-19
WBA :  2023-03-19  -  2023-03-26
WBA :  2023-03-26  -  2023-04-02
WBA :  2023-04-02  -  2023-04-09
WBA :  2023-04-09  -  2023-04-16
WBA :  2023-04-16  -  2023-04-23
WBA :  2023-04-23  -  2023-04-30
WBA :  2023-04-30  -  2023-05-07
WBA :  2023-05-07  -  2023-05-14
WBA :  2023-05-14  -  2023-05-21
WBA :  2023-05-21  -  2023-05-28
WBA :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


WMT :  2023-01-08  -  2023-01-15
WMT :  2023-01-15  -  2023-01-22
WMT :  2023-01-22  -  2023-01-29
WMT :  2023-01-29  -  2023-02-05
WMT :  2023-02-05  -  2023-02-12
WMT :  2023-02-12  -  2023-02-19
WMT :  2023-02-19  -  2023-02-26
WMT :  2023-02-26  -  2023-03-05
WMT :  2023-03-05  -  2023-03-12
WMT :  2023-03-12  -  2023-03-19
WMT :  2023-03-19  -  2023-03-26
WMT :  2023-03-26  -  2023-04-02
WMT :  2023-04-02  -  2023-04-09
WMT :  2023-04-09  -  2023-04-16
WMT :  2023-04-16  -  2023-04-23
WMT :  2023-04-23  -  2023-04-30
WMT :  2023-04-30  -  2023-05-07
WMT :  2023-05-07  -  2023-05-14
WMT :  2023-05-14  -  2023-05-21
WMT :  2023-05-21  -  2023-05-28
WMT :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


DIS :  2023-01-08  -  2023-01-15
DIS :  2023-01-15  -  2023-01-22
DIS :  2023-01-22  -  2023-01-29
DIS :  2023-01-29  -  2023-02-05
DIS :  2023-02-05  -  2023-02-12
DIS :  2023-02-12  -  2023-02-19
DIS :  2023-02-19  -  2023-02-26
DIS :  2023-02-26  -  2023-03-05
DIS :  2023-03-05  -  2023-03-12
DIS :  2023-03-12  -  2023-03-19
DIS :  2023-03-19  -  2023-03-26
DIS :  2023-03-26  -  2023-04-02
DIS :  2023-04-02  -  2023-04-09
DIS :  2023-04-09  -  2023-04-16
DIS :  2023-04-16  -  2023-04-23
DIS :  2023-04-23  -  2023-04-30
DIS :  2023-04-30  -  2023-05-07
DIS :  2023-05-07  -  2023-05-14
DIS :  2023-05-14  -  2023-05-21
DIS :  2023-05-21  -  2023-05-28
DIS :  2023-05-28  -  2023-06-04


[*********************100%***********************]  1 of 1 completed


DOW :  2023-01-08  -  2023-01-15
DOW :  2023-01-15  -  2023-01-22
DOW :  2023-01-22  -  2023-01-29
DOW :  2023-01-29  -  2023-02-05
DOW :  2023-02-05  -  2023-02-12
DOW :  2023-02-12  -  2023-02-19
DOW :  2023-02-19  -  2023-02-26
DOW :  2023-02-26  -  2023-03-05
DOW :  2023-03-05  -  2023-03-12
DOW :  2023-03-12  -  2023-03-19
DOW :  2023-03-19  -  2023-03-26
DOW :  2023-03-26  -  2023-04-02
DOW :  2023-04-02  -  2023-04-09
DOW :  2023-04-09  -  2023-04-16
DOW :  2023-04-16  -  2023-04-23
DOW :  2023-04-23  -  2023-04-30
DOW :  2023-04-30  -  2023-05-07
DOW :  2023-05-07  -  2023-05-14
DOW :  2023-05-14  -  2023-05-21
DOW :  2023-05-21  -  2023-05-28
DOW :  2023-05-28  -  2023-06-04


# Generate Prompt from Financial Data

In [7]:
def get_company_prompt(symbol):
    """Render the company-introduction paragraph for ``symbol``.

    The profile returned by ``finnhub_client.company_profile2`` is expanded
    into a fixed template. The template reads the keys ``name``,
    ``finnhubIndustry``, ``ipo``, ``marketCapitalization``, ``currency``,
    ``shareOutstanding``, ``country``, ``ticker`` and ``exchange``; a profile
    missing any of them makes ``str.format`` raise ``KeyError``.

    Args:
        symbol: Ticker symbol to look up.

    Returns:
        The formatted introduction text.
    """
    company_template = (
        "[Company Introduction]:\n\n{name} is a leading entity in the {finnhubIndustry} sector. Incorporated and publicly traded since {ipo}, the company has established its reputation as one of the key players in the market. As of today, {name} has a market capitalization of {marketCapitalization:.2f} in {currency}, with {shareOutstanding:.2f} shares outstanding."
        "\n\n{name} operates primarily in the {country}, trading under the ticker {ticker} on the {exchange}. As a dominant force in the {finnhubIndustry} space, the company continues to innovate and drive progress within the industry."
    )

    profile = finnhub_client.company_profile2(symbol=symbol)

    return company_template.format(**profile)


def get_prompt_by_row(symbol, row):
    """Build the three prompt fragments for one weekly row.

    Args:
        symbol: Ticker symbol used in the generated text.
        row: Mapping with 'Start Date' / 'End Date' (strings or objects with
            ``strftime``), 'Start Price', 'End Price', and JSON strings under
            'News' and 'Basics'.

    Returns:
        A tuple ``(head, news, basics)``: the price-movement sentence, a list
        of formatted news entries, and the basic-financials text.
    """
    def _as_text(value):
        # Dates may already be strings (e.g. after a CSV round trip).
        return value if isinstance(value, str) else value.strftime('%Y-%m-%d')

    start_date = _as_text(row['Start Date'])
    end_date = _as_text(row['End Date'])

    # Anything that is not a strict increase is worded as "decreased".
    if row['End Price'] > row['Start Price']:
        term = 'increased'
    else:
        term = 'decreased'
    head = "From {} to {}, {}'s stock price {} from {:.2f} to {:.2f}. Company news during this period are listed below:\n\n".format(
        start_date, end_date, symbol, term, row['Start Price'], row['End Price'])

    # News dates are stored as YYYYMMDDHHMMSS; compare only the day part.
    last_day = end_date.replace('-', '')
    news = []
    for item in json.loads(row["News"]):
        if item['date'][:8] > last_day:
            continue
        # Skip entries whose summary is a known boilerplate advertisement.
        if item['summary'].startswith("Looking for stock market analysis and research with proves results?"):
            continue
        news.append("[Headline]: {}\n[Summary]: {}\n".format(item['headline'], item['summary']))

    report = json.loads(row['Basics'])
    if not report:
        basics = "[Basic Financials]:\n\nNo basic financial reported."
    else:
        intro = "Some recent basic financials of {}, reported at {}, are presented below:\n\n[Basic Financials]:\n\n".format(
            symbol, report['period'])
        metric_lines = [f"{k}: {v}" for k, v in report.items() if k != 'period']
        basics = intro + "\n".join(metric_lines)

    return head, news, basics


def sample_news(news, k=5):
    """Randomly pick up to ``k`` items from ``news``, keeping their original order.

    Args:
        news: Sequence of news entries.
        k: Maximum number of entries to return. Values larger than
            ``len(news)`` are capped, so short or empty lists return all of
            their items instead of raising.

    Returns:
        A list of at most ``k`` entries in the order they appear in ``news``.

    Raises:
        ValueError: If ``k`` is negative (raised by ``random.sample``).
    """
    # random.sample raises ValueError when asked for more items than exist,
    # which the default k=5 would trigger for any list shorter than five.
    k = min(k, len(news))

    # Sample indices rather than items so the original ordering can be restored.
    return [news[i] for i in sorted(random.sample(range(len(news)), k))]


def map_bin_label(bin_lb):
    """Expand a bin label such as 'U2' or 'D5+' into a readable phrase.

    The direction letter becomes 'up by ' / 'down by ' and the bucket number
    becomes a percentage range ('1' -> '0-1%', ..., '5' -> '4-5%'); a label
    ending in '+' has '5+' rendered as 'more than 5%'.

    Args:
        bin_lb: Label string to expand.

    Returns:
        The expanded phrase, e.g. 'up by 1-2%'.
    """
    # Applied strictly in this order: each range text only contains digits
    # that have already been processed, so later steps never rewrite it.
    substitutions = (
        ('U', 'up by '),
        ('D', 'down by '),
        ('1', '0-1%'),
        ('2', '1-2%'),
        ('3', '2-3%'),
        ('4', '3-4%'),
    )
    lb = bin_lb
    for token, phrase in substitutions:
        lb = lb.replace(token, phrase)

    # The top bucket is open-ended when the label carries a trailing '+'.
    final_token, final_phrase = ('5+', 'more than 5%') if lb.endswith('+') else ('5', '4-5%')
    return lb.replace(final_token, final_phrase)


def get_all_prompts(symbol, min_past_weeks=1, max_past_weeks=3, with_basics=True):
    """Build one labelled prompt per weekly row of a symbol's prepared CSV.

    For every row that has at least ``min_past_weeks`` earlier rows, a random
    number of preceding weeks (between ``min_past_weeks`` and
    ``max_past_weeks``, capped by what is available) is rendered as context,
    followed by the row's basic financials and an instruction that states the
    row's mapped 'Bin Label' as the assumed prediction.

    Args:
        symbol: Ticker symbol; selects the CSV under ``DATA_DIR``.
        min_past_weeks: Minimum number of earlier weeks required.
        max_past_weeks: Maximum number of earlier weeks kept as context.
        with_basics: Read the regular CSV when True, the ``_nobasics`` one otherwise.

    Returns:
        List of prompt strings, in row order. Output is random (week count and
        news sampling) unless the ``random`` module is seeded.
    """
    file_suffix = '' if with_basics else '_nobasics'
    df = pd.read_csv(f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}{file_suffix}.csv')

    company_prompt = get_company_prompt(symbol)

    history = []        # (head, news, basics) of the most recent weeks, oldest first
    all_prompts = []

    for _, row in df.iterrows():

        context_parts = []
        if len(history) >= min_past_weeks:
            n_weeks = min(random.choice(range(min_past_weeks, max_past_weeks+1)), len(history))
            # Slice from an explicit start index so that n_weeks == 0 yields no weeks at all.
            for past_head, past_news, _past_basics in history[len(history) - n_weeks:]:
                # Price movement sentence of that week
                context_parts.append("\n" + past_head)
                # At most five news items of that week
                sampled_news = sample_news(past_news, min(5, len(past_news)))
                if sampled_news:
                    context_parts.append("\n".join(sampled_news))
                else:
                    context_parts.append("No relative news reported.")

        head, news, basics = get_prompt_by_row(symbol, row)

        # The current week becomes context for the following rows; keep the window bounded.
        history.append((head, news, basics))
        if len(history) > max_past_weeks:
            del history[0]

        context = "".join(context_parts)
        if not context:
            continue

        prediction = map_bin_label(row['Bin Label'])
        start_date, end_date = row['Start Date'], row['End Date']

        prompt = company_prompt + '\n' + context + '\n' + basics
        prompt += (
            f"\n\nBased on all the information before {start_date}, let's first analyze the positive developments and potential concerns for {symbol}. Come up with 2-4 most important factors respectively and keep them concise. Most factors should be inferred from company related news. "
            f"Then let's assume your prediction for next week ({start_date} to {end_date}) is {prediction}. Provide a summary analysis to support your prediction. The prediction result need to be inferred from your analysis at the end, and thus not appearing as a foundational factor of your analysis."
        )

        all_prompts.append(prompt.strip())

    return all_prompts

In [8]:
# Markers wrapped around the instruction and the system prompt when the
# Llama2-format training prompts are assembled.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"


# System prompt sent with every request: describes the analyst role and the
# three-section answer layout the model is expected to follow.
SYSTEM_PROMPT = (
    "You are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. "
    "Your answer format should be as follows:"
    "\n\n[Positive Developments]:\n1. ..."
    "\n\n[Potential Concerns]:\n1. ..."
    "\n\n[Prediction & Analysis]:\n...\n"
)

print(SYSTEM_PROMPT)

# Alternative runs kept for reference:
# prompts = get_all_prompts("AAPL", 1, 3)
# prompts = get_all_prompts("MSFT", 1, 3, False)
# Build the prompts for TRV using between 1 and 4 past weeks of context.
prompts = get_all_prompts("TRV", min_past_weeks=1, max_past_weeks=4)

# Show the first generated prompt as a sanity check.
print(prompts[0])


You are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. Your answer format should be as follows:

[Positive Developments]:
1. ...

[Potential Concerns]:
1. ...

[Prediction & Analysis]:
...

[Company Introduction]:

Travelers Companies Inc is a leading entity in the Insurance sector. Incorporated and publicly traded since 1980-03-17, the company has established its reputation as one of the key players in the market. As of today, Travelers Companies Inc has a market capitalization of 59158.87 in USD, with 227.02 shares outstanding.

Travelers Companies Inc operates primarily in the US, trading under the ticker TRV on the NEW YORK STOCK EXCHANGE, INC.. As a dominant force in the Insurance space, the company continues to innovate and drive progress within 

# Request to GPT-4 for Financial Analysis

In [9]:
def append_to_csv(filename, input_data, output_data):
    """Append one ``[input_data, output_data]`` row to the CSV at ``filename``.

    The file is written as UTF-8 so that it round-trips through
    ``pd.read_csv`` (which defaults to UTF-8) regardless of the platform's
    locale encoding; prompts and answers may contain non-ASCII text.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        input_data: Value for the first column (the prompt).
        output_data: Value for the second column (the answer).
    """
    # newline='' lets the csv module control line endings, including
    # newlines embedded inside quoted fields.
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([input_data, output_data])

        
def initialize_csv(filename):
    """Create (or truncate) the CSV at ``filename`` and write the ``prompt,answer`` header row."""
    header = ["prompt", "answer"]
    with open(filename, mode='w', newline='') as file:
        csv.writer(file).writerow(header)


def query_gpt4(symbol_list, min_past_weeks=1, max_past_weeks=3, with_basics=True):
    """Query GPT-4 for every prompt of every symbol and log the answers to CSV.

    For each symbol the result file under ``DATA_DIR`` is created if missing;
    otherwise the number of rows already present is used to skip that many
    leading prompts, so an interrupted run can be resumed. Each request is
    attempted up to five times; when all attempts fail an empty answer is
    written so that row positions stay aligned with prompt positions.

    Args:
        symbol_list: Iterable of ticker symbols to process.
        min_past_weeks: Forwarded to ``get_all_prompts``.
        max_past_weeks: Forwarded to ``get_all_prompts``.
        with_basics: Selects the regular or the ``_nobasics`` input/output files.
    """
    max_attempts = 5

    for symbol in symbol_list:

        csv_file = f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics_gpt-4.csv'

        if not os.path.exists(csv_file):
            initialize_csv(csv_file)
            pre_done = 0
        else:
            # Rows already on disk count as finished prompts (resume support).
            pre_done = len(pd.read_csv(csv_file))

        prompts = get_all_prompts(symbol, min_past_weeks, max_past_weeks, with_basics)

        for i, prompt in enumerate(prompts):

            if i < pre_done:
                continue

            print(f"{symbol} - {i}")

            completion = None
            for attempt in range(1, max_attempts + 1):
                try:
                    completion = client.chat.completions.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                        ]
                    )
                    break
                except Exception as exc:
                    # Report the cause: without it a misconfiguration (bad key,
                    # undefined client, quota) is indistinguishable from a
                    # transient network error.
                    print(f'retry cnt {attempt}: {type(exc).__name__}: {exc}')

            # Best effort: an empty answer marks a prompt whose attempts all failed.
            answer = completion.choices[0].message.content if completion is not None else ""
            append_to_csv(csv_file, prompt, answer)
      

In [10]:
# Earlier / alternative runs kept for reference:
# query_gpt4(DOW_30, 1, 3)
# query_gpt4(['WBA'], 1, 4)
# Query GPT-4 for every symbol in DOW_30 with 1 to 4 past weeks of context.
query_gpt4(DOW_30, min_past_weeks=1, max_past_weeks=4)

AXP - 0
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 1
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 2
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 3
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 4
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 5
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 6
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 7
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 8
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 9
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 10
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 11
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 12
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 13
retry cnt 1
retry cnt 2
retry cnt 3
retry cnt 4
retry cnt 5
AXP - 14
retry cnt 1
retry cnt 2
retry cnt 3

KeyboardInterrupt: 

# Transform into Llama2 Training Format

In [None]:
def gpt4_to_llama(symbol, with_basics=True):
    """Convert a symbol's GPT-4 prompt/answer CSV into Llama2-style training records.

    For every row the assumed prediction stated in the prompt is extracted as
    the label, the label-revealing instruction is replaced by a neutral
    "make your prediction" instruction, the label is inserted into the
    answer's "[Prediction & Analysis]" section, and the prompt is wrapped with
    the ``[INST]``/``<<SYS>>`` markers.

    Rows are skipped (with a printed message) when the prompt does not contain
    the expected instruction or when the answer is not text, e.g. the empty
    answer stored for a failed request, which pandas reads back as NaN.

    Args:
        symbol: Ticker symbol; selects the CSV under ``DATA_DIR``.
        with_basics: Read the regular file when True, the ``_nobasics`` one otherwise.

    Returns:
        Dict of equal-length lists under the keys ``prompt``, ``answer``,
        ``period`` and ``label``.
    """
    csv_file = f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{START_DATE}_{END_DATE}_nobasics_gpt-4.csv'

    df = pd.read_csv(csv_file)

    # Sentence stating the assumed prediction: group 1 = period, group 2 = label.
    assumed_prediction = r"Then let's assume your prediction for next week \((.*)\) is ((?:up|down) by .*%)\."
    # Remainder of the instruction that gets replaced together with that sentence.
    instruction_tail = (
        " Provide a summary analysis to support your prediction. The prediction result need to be "
        "inferred from your analysis at the end, and thus not appearing as a foundational factor of your analysis."
    )
    label_re = re.compile(assumed_prediction)
    instruction_re = re.compile(assumed_prediction + re.escape(instruction_tail))
    answer_header_re = re.compile(r"\[Prediction & Analysis\]:\s*")

    # Identical for every row, so build it once.
    # NOTE(review): this replacement also drops the colon after
    # "[Prediction & Analysis]" in the system prompt while the rewritten answers
    # keep it -- confirm this is intended before changing it.
    new_system_prompt = SYSTEM_PROMPT.replace(':\n...', '\nPrediction: ...\nAnalysis: ...')
#     new_system_prompt = SYSTEM_PROMPT.replace(':\n...', '\nPrediction: {Up|Down} by {1-2|2-3|3-4|4-5|5+}%\nAnalysis: ...')

    prompts, answers, periods, labels = [], [], [], []

    for i, row in df.iterrows():

        prompt, answer = row['prompt'], row['answer']

        res = label_re.search(prompt) if isinstance(prompt, str) else None
        if res is None:
            print(symbol, i)
            print("prompt does not contain the assumed-prediction instruction; row skipped")
            continue

        period, label = res.group(1), res.group(2)
#         label = label.replace('more than 5', '5+')

        if not isinstance(answer, str):
            print(symbol, i)
            print(label)
            print(answer)
            continue

        neutral_instruction = (
            f"Then make your prediction of the {symbol} stock price movement for next week ({period}). "
            "Provide a summary analysis to support your prediction."
        )
        # Callable replacements insert the text literally (no backslash/group expansion).
        prompt, n_replaced = instruction_re.subn(lambda _m: neutral_instruction, prompt)
        if n_replaced == 0:
            # The row is kept as before, but a prompt that still states the
            # assumed label would leak the answer into training data.
            print(symbol, i)
            print("warning: prediction instruction was not rewritten; prompt still reveals the label")

        labelled_header = f"[Prediction & Analysis]:\nPrediction: {label.capitalize()}\nAnalysis: "
        answer = answer_header_re.sub(lambda _m: labelled_header, answer)

        prompt = B_INST + B_SYS + new_system_prompt + E_SYS + prompt + E_INST

        prompts.append(prompt)
        answers.append(answer)
        periods.append(period)
        labels.append(label)

    return {
        "prompt": prompts,
        "answer": answers,
        "period": periods,
        "label": labels,
    }


def create_dataset(symbol_list, train_ratio=0.8, with_basics=True):
    """Assemble a train/test ``DatasetDict`` from the converted records of several symbols.

    Each symbol's records (from ``gpt4_to_llama``) are tagged with the symbol
    and split by position: the first ``round(train_ratio * n)`` rows go to
    train, the rest to test. The per-symbol splits are then concatenated.

    Args:
        symbol_list: Iterable of ticker symbols.
        train_ratio: Fraction of each symbol's rows assigned to the train split.
        with_basics: Forwarded to ``gpt4_to_llama``.

    Returns:
        ``datasets.DatasetDict`` with 'train' and 'test' splits.
    """
    train_parts, test_parts = [], []

    for symbol in symbol_list:

        records = gpt4_to_llama(symbol, with_basics)
        # One symbol entry per record so rows stay traceable after concatenation.
        records["symbol"] = [symbol] * len(records['label'])

        symbol_dataset = Dataset.from_dict(records)
        n_rows = len(symbol_dataset)
        train_size = round(train_ratio * n_rows)

        train_parts.append(symbol_dataset.select(range(train_size)))
        test_parts.append(symbol_dataset.select(range(train_size, n_rows)))

    train_dataset = datasets.concatenate_datasets(train_parts)
    test_dataset = datasets.concatenate_datasets(test_parts)

    return datasets.DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
   

In [None]:
# Earlier dataset variants, kept for reference:
# v1
# dow30_dataset = create_dataset(DOW30, True)
# v2
# dow30_nobasic_dataset = create_dataset(DOW_30, 0.8, False)
# v3: all DOW_30 symbols, 90% of each symbol's rows in the train split
dow30_v3_dataset = create_dataset(DOW_30, train_ratio=0.9)

In [None]:
# Persist the v3 dataset to a local directory; the commented lines are the
# corresponding calls for the earlier dataset variants.
# dow30_dataset.save_to_disk('fingpt-forecaster-dow30-20230601-20230930-llama')
# dow30_nobasics_dataset.save_to_disk('fingpt-forecaster-dow30nobasics-20230601-20230930-llama')
dow30_v3_dataset.save_to_disk('fingpt-forecaster-dow30v3-20221231-20230531-llama')

In [None]:
# Display the dataset object (bare last expression of the cell) to inspect its splits.
dow30_v3_dataset

# Test-time Information Fetching

In [None]:
import yfinance as yf
import pandas as pd
from datetime import date, datetime, timedelta


def get_curday():
    """Return today's date (local clock) formatted as ``YYYY-MM-DD``."""
    today = date.today()
    return f"{today:%Y-%m-%d}"


def n_weeks_before(date_string, n):
    """Shift a ``YYYY-MM-DD`` date string back by ``n`` weeks.

    A negative ``n`` moves the date forward. The result uses the same
    ``YYYY-MM-DD`` format.
    """
    anchor = datetime.strptime(date_string, "%Y-%m-%d")
    shifted = anchor - timedelta(weeks=n)
    return shifted.strftime("%Y-%m-%d")


def get_stock_data(stock_symbol, steps):
    """Download closing prices and summarise them as consecutive periods.

    For every step date except the last, the first trading day on or after
    that date is looked up; the last available trading day closes the final
    period. Consecutive look-ups form the (start, end) pairs of the result.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name but a different signature ``(ticker, start_date, end_date)``;
    this definition replaces it once the cell has run.

    Args:
        stock_symbol: Ticker passed to ``yf.download``. If several tickers are
            given, only the first 'Close' column is used.
        steps: Ascending list of ``YYYY-MM-DD`` strings; the first and last
            entries bound the download.

    Returns:
        DataFrame with columns 'Start Date', 'End Date', 'Start Price' and
        'End Price', one row per period.

    Raises:
        ValueError: If the download returned no rows.
    """
    stock_data = yf.download(stock_symbol, steps[0], steps[-1])

#     print(stock_data)

    if stock_data.empty:
        raise ValueError(
            f"No price data returned for {stock_symbol} between {steps[0]} and {steps[-1]}")

    close = stock_data['Close']
    if isinstance(close, pd.DataFrame):
        # yf.download can return multi-level (Price, Ticker) columns, in which
        # case 'Close' selects a one-column frame rather than a Series.
        close = close.iloc[:, 0]

    # Trading days as YYYY-MM-DD strings, directly comparable with the step strings.
    # (The original discarded this conversion, leaving `available_dates` undefined.)
    available_dates = stock_data.index.strftime("%Y-%m-%d").tolist()

    dates, prices = [], []

    for step in steps[:-1]:
        for i, trading_day in enumerate(available_dates):
            if trading_day >= step:
                # Positional access: `close` is indexed by timestamps, not by these strings.
                prices.append(close.iloc[i])
                dates.append(datetime.strptime(trading_day, "%Y-%m-%d"))
                break

    dates.append(datetime.strptime(available_dates[-1], "%Y-%m-%d"))
    prices.append(close.iloc[-1])

    return pd.DataFrame({
        "Start Date": dates[:-1], "End Date": dates[1:],
        "Start Price": prices[:-1], "End Price": prices[1:]
    })


def get_current_basics(symbol, curday):
    """Return the latest quarterly basic financials reported on or before ``curday``.

    Quarterly series from ``finnhub_client.company_basic_financials`` are
    regrouped by reporting period into one dict per period (metric -> value,
    plus a 'period' key).

    Args:
        symbol: Ticker symbol to query.
        curday: Cut-off date string; compared with the period strings as text
            (presumably both ``YYYY-MM-DD`` -- confirm against the API response).

    Returns:
        The dict of the most recent period that is ``<= curday``. If every
        period is later than ``curday`` the oldest period is returned (the
        original fallback). If there is no quarterly data at all, an empty
        dict is returned instead of raising ``UnboundLocalError``.
    """
    basic_financials = finnhub_client.company_basic_financials(symbol, 'all')

    # period -> {metric: value}
    basic_dict = defaultdict(dict)
    for metric, value_list in basic_financials['series']['quarterly'].items():
        for value in value_list:
            basic_dict[value['period']].update({metric: value['v']})

    basic_list = []
    for period, metrics in basic_dict.items():
        metrics.update({'period': period})
        basic_list.append(metrics)

    if not basic_list:
        return {}

    basic_list.sort(key=lambda x: x['period'])

    # Walk from newest to oldest and stop at the first period not after curday.
    for basic in reversed(basic_list):
        if basic['period'] <= curday:
            return basic

    # Nothing on or before curday: keep the previous behaviour of returning the oldest entry.
    return basic_list[0]
    

def get_all_prompts_online(symbol, data, curday, with_basics=True):
    """Build the test-time prompt for ``symbol`` from freshly fetched weekly data.

    Every row of ``data`` contributes its price-movement sentence and up to
    five randomly sampled news items. The instruction asks for a prediction
    for the week starting at ``curday``.

    Args:
        symbol: Ticker symbol.
        data: DataFrame whose rows are accepted by ``get_prompt_by_row``.
        curday: Current date as ``YYYY-MM-DD``.
        with_basics: When True, the latest basic financials are fetched via
            ``get_current_basics``; otherwise a "none reported" note is used.

    Returns:
        ``(info, prompt)``: the information part alone, and the information
        followed by the instruction.
    """
    company_prompt = get_company_prompt(symbol)

    # Render all rows first; the basics stored in each row are not used here.
    weekly = []
    for _, row in data.iterrows():
        head, news, _row_basics = get_prompt_by_row(symbol, row)
        weekly.append((head, news))

    history_parts = []
    for head, news in weekly:
        history_parts.append("\n" + head)
        sampled_news = sample_news(news, min(5, len(news)))
        if sampled_news:
            history_parts.append("\n".join(sampled_news))
        else:
            history_parts.append("No relative news reported.")
    history = "".join(history_parts)

    # A negative offset moves forward: the week following curday.
    period = "{} to {}".format(curday, n_weeks_before(curday, -1))

    if not with_basics:
        basics = "[Basic Financials]:\n\nNo basic financial reported."
    else:
        current = get_current_basics(symbol, curday)
        intro = "Some recent basic financials of {}, reported at {}, are presented below:\n\n[Basic Financials]:\n\n".format(
            symbol, current['period'])
        basics = intro + "\n".join(f"{k}: {v}" for k, v in current.items() if k != 'period')

    info = company_prompt + '\n' + history + '\n' + basics
    instruction = (
        f"\n\nBased on all the information before {curday}, let's first analyze the positive developments and potential concerns for {symbol}. Come up with 2-4 most important factors respectively and keep them concise. Most factors should be inferred from company related news. "
        f"Then make your prediction of the {symbol} stock price movement for next week ({period}). Provide a summary analysis to support your prediction."
    )
    prompt = info + instruction

    return info, prompt

In [None]:
ticker = "AAPL"
n_weeks = 2
curday = get_curday()
# Weekly boundaries from n_weeks ago up to today, oldest first.
steps = [n_weeks_before(curday, n) for n in range(n_weeks, -1, -1)]

# Price periods first, then the news attached to each period.
data = get_stock_data(ticker, steps)
data = get_news(ticker, data)

# No per-row basic financials here: every row gets an empty JSON object.
data['Basics'] = [json.dumps({}) for _ in range(len(data))]
# data = get_basics(ticker, data, always=True)


In [None]:
# Build the test-time prompt without fetching basic financials, then show it.
info, prompt = get_all_prompts_online(ticker, data, curday, with_basics=False)

print(prompt)

In [None]:
import os
import pandas as pd

# Paths of the placeholder folder and of the CSV file written into it
folder_path = './2022-12-31_2023-05-31/'
file_path = os.path.join(folder_path, 'sample.csv')

# Make sure the folder exists
os.makedirs(folder_path, exist_ok=True)

# Placeholder price rows, one (date, open, high, low, close, volume) tuple per day
dummy_columns = ['date', 'open', 'high', 'low', 'close', 'volume']
dummy_rows = [
    ('2022-12-31', 100, 110, 95, 108, 1000),
    ('2023-01-01', 102, 112, 98, 109, 1500),
    ('2023-01-02', 104, 114, 101, 110, 1200),
]
# Column-oriented view of the rows above: column name -> list of values
dummy_data = {name: list(values) for name, values in zip(dummy_columns, zip(*dummy_rows))}

# Write the placeholder data to CSV
df = pd.DataFrame(dummy_data)
df.to_csv(file_path, index=False)

print(f"Dummy file created at: {file_path}")


In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Load the placeholder CSV
df = pd.read_csv('./2022-12-31_2023-05-31/sample.csv')

# Convert it to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Positional split: the first 80% of the rows (rounded down) for train, the rest for test
n_train = int(len(dataset) * 0.8)
dataset_dict = DatasetDict({
    "train": dataset.select(range(n_train)),
    "test": dataset.select(range(n_train, len(dataset)))
})

# Save the dataset to disk
dataset_dict.save_to_disk('./data/fingpt-forecaster-crypto-20230131-20231231-1-4-08/')
