In [1]:
import requests
import pandas as pd
import os
import numpy as np

def fetch_cryptocompare_ohlcv_data(fsym, tsym, limit=2000, aggregate=1, interval='day'):
    """
    Fetch historical OHLCV candles from the CryptoCompare API.

    Args:
        fsym: Symbol of the asset being priced (e.g. 'BTC'); upper-cased before sending.
        tsym: Symbol of the quote currency (e.g. 'JPY'); upper-cased before sending.
        limit: Value sent as the ``limit`` query parameter.
        aggregate: Value sent as the ``aggregate`` query parameter.
        interval: One of 'minute', 'hour' or 'day'; selects the ``histo<interval>`` endpoint.

    Returns:
        A DataFrame with the columns ``timestamp``, ``open``, ``high``, ``low``,
        ``close`` and ``volume``. ``timestamp`` is the API's ``time`` field parsed
        as seconds since the epoch; ``volume`` is the API's ``volumeto`` field.

    Raises:
        ValueError: If ``interval`` is not a supported value.
        requests.HTTPError: If the server answers with an HTTP error status.
        RuntimeError: If the API payload reports ``Response == 'Error'``.
    """
    base_url = 'https://min-api.cryptocompare.com/data/v2/histo'
    valid_intervals = ('minute', 'hour', 'day')

    if interval not in valid_intervals:
        raise ValueError(f"Invalid interval: {interval}")

    url = f"{base_url}{interval}"
    params = {
        'fsym': fsym.upper(),
        'tsym': tsym.upper(),
        'limit': limit,
        'aggregate': aggregate
    }
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # Surface the API's own error message instead of failing later with a KeyError.
    if payload.get('Response') == 'Error':
        raise RuntimeError(
            f"CryptoCompare API error: {payload.get('Message', 'unknown error')}"
        )

    data = payload['Data']['Data']

    # Convert the candle records into a pandas DataFrame.
    df = pd.DataFrame(data)
    df['timestamp'] = pd.to_datetime(df['time'], unit='s')
    df = df[['timestamp', 'open', 'high', 'low', 'close', 'volumeto']]
    df = df.rename(columns={'volumeto': 'volume'})

    return df

def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def load_from_csv(filename):
    """
    Load a CSV file into a DataFrame.

    If the file has a ``timestamp`` column it is parsed into datetimes, so a
    frame read back from disk has the same ``timestamp`` dtype as one produced
    by a fresh fetch (CSV itself stores the values as plain text).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded DataFrame.
    """
    df = pd.read_csv(filename)
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df

filename = 'ohlcv_data.csv'

if not os.path.exists(filename):
    # No local copy yet: download the candles and store them for later runs.
    fsym, tsym = 'BTC', 'JPY'
    ohlcv_data = fetch_cryptocompare_ohlcv_data(fsym, tsym)
    save_to_csv(ohlcv_data, filename)
else:
    # A local copy exists: read it instead of calling the API again.
    ohlcv_data = load_from_csv(filename)

print(ohlcv_data)

      timestamp        open        high         low       close        volume
0    2017-10-31   679910.34   727288.22   674553.25   723579.09  4.728300e+10
1    2017-11-01   723751.37   766552.45   713575.88   764334.90  5.844858e+10
2    2017-11-02   764247.87   857528.51   761617.11   805165.37  9.242606e+10
3    2017-11-03   805246.62   853429.62   786480.90   823854.18  8.196366e+10
4    2017-11-04   823851.31   845389.80   800200.82   839556.37  6.597304e+10
...         ...         ...         ...         ...         ...           ...
1996 2023-04-19  4069137.27  4076795.31  3872362.29  3893088.33  2.048354e+10
1997 2023-04-20  3893088.33  3918645.83  3765607.66  3786412.95  1.844144e+10
1998 2023-04-21  3786412.95  3802188.72  3644461.28  3654750.87  1.702854e+10
1999 2023-04-22  3654750.87  3754880.86  3641006.51  3747411.86  8.716559e+09
2000 2023-04-23  3747411.86  3747981.50  3700333.12  3709590.27  3.535152e+09

[2001 rows x 6 columns]


In [2]:
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def read_api_key(file_path):
    """Return the contents of ``file_path`` with surrounding whitespace stripped."""
    with open(file_path, 'r') as key_file:
        return key_file.read().strip()


def fetch_cryptocurrency_news(api_key):
    """
    Fetch English-language articles matching 'bitcoin' from NewsAPI.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.

    Returns:
        A list of dicts, one per article, with the keys ``title``, ``date``
        (the article's ``publishedAt`` value) and ``url``.

    Raises:
        RuntimeError: If the response payload reports ``status == 'error'``.
    """
    url = "https://newsapi.org/v2/everything"

    # Search for Bitcoin-related news, ordered by publication time.
    parameters = {
        'q': 'bitcoin',
        'language': 'en',
        'sortBy': 'publishedAt',
        'apiKey': api_key
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=parameters, timeout=30)
    data = response.json()

    # Report the API's own error details instead of failing with KeyError: 'articles'.
    if data.get('status') == 'error':
        raise RuntimeError(
            f"NewsAPI request failed ({data.get('code', 'unknown')}): "
            f"{data.get('message', 'no message')}"
        )

    # Keep only the fields used downstream.
    return [
        {
            'title': article['title'],
            'date': article['publishedAt'],
            'url': article['url']
        }
        for article in data['articles']
    ]

def vectorize_news_data(news_data):
    """Fit a fresh TfidfVectorizer on the ``title`` of every item and return the transformed titles."""
    titles = []
    for item in news_data:
        titles.append(item['title'])
    return TfidfVectorizer().fit_transform(titles)

# Read the NewsAPI key from a local file.
api_key_file_path = 'api_key.txt'
API_KEY = read_api_key(file_path=api_key_file_path)

# Download the news articles.
news_data = fetch_cryptocurrency_news(api_key=API_KEY)

# Turn the article titles into TF-IDF vectors.
news_vectors = vectorize_news_data(news_data=news_data)

print(news_data, news_vectors, sep='\n')

FileNotFoundError: [Errno 2] No such file or directory: 'api_key.txt'

In [None]:
def merge_crypto_and_news_data(crypto_data, news_data, news_vectors):
    """
    Attach a per-day mean news vector to each row of the price table.

    Row ``i`` of ``news_vectors`` is taken to belong to ``news_data[i]``. Articles
    are grouped by the calendar date of their ``date`` field, the vectors of each
    group are averaged, and the result is stored in a new ``news_vector`` column.
    Rows with no news on their date get a zero vector of their own.

    Price rows are dated from the ``timestamp`` column when present; otherwise
    from the index (formatted as a date for a DatetimeIndex, ``str()`` of the
    label for anything else).

    Args:
        crypto_data: Price DataFrame; it is not modified.
        news_data: Sequence of dicts, each with a ``date`` entry parseable by
            ``pd.to_datetime``.
        news_vectors: 2-D array-like (dense, or sparse with ``toarray``) with one
            row per article.

    Returns:
        A copy of ``crypto_data`` with an added ``news_vector`` column holding one
        1-D float array per row.
    """
    n_features = news_vectors.shape[1]

    # Group article vectors by day. Pairing is by original position, so the
    # articles must not be re-ordered independently of the vectors.
    vectors_by_date = {}
    for position, news in enumerate(news_data):
        row = news_vectors[position]
        if hasattr(row, 'toarray'):
            # Sparse rows must be densified before numpy can average them.
            row = row.toarray()
        row = np.asarray(row, dtype=float).ravel()
        date = pd.to_datetime(news['date']).strftime('%Y-%m-%d')
        vectors_by_date.setdefault(date, []).append(row)

    mean_by_date = {
        date: np.mean(rows, axis=0) for date, rows in vectors_by_date.items()
    }

    merged = crypto_data.copy()

    # Derive a 'YYYY-MM-DD' key for each price row.
    if 'timestamp' in merged.columns:
        row_dates = pd.to_datetime(merged['timestamp']).dt.strftime('%Y-%m-%d').tolist()
    elif isinstance(merged.index, pd.DatetimeIndex):
        row_dates = merged.index.strftime('%Y-%m-%d').tolist()
    else:
        row_dates = [str(label) for label in merged.index]

    # Fill an object array cell by cell so every row owns a distinct vector.
    cells = np.empty(len(merged), dtype=object)
    for position, date in enumerate(row_dates):
        mean_vector = mean_by_date.get(date)
        if mean_vector is None:
            cells[position] = np.zeros(n_features)
        else:
            cells[position] = mean_vector.copy()
    merged['news_vector'] = cells

    return merged

# Combine the price table with the news vectors, show the result, and save it
# without the index column.
merged_data = merge_crypto_and_news_data(
    crypto_data=ohlcv_data,
    news_data=news_data,
    news_vectors=news_vectors,
)
print(merged_data)
merged_data.to_csv(path_or_buf="merged_crypto_news_data.csv", index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the merged table from disk.
merged_data = pd.read_csv("merged_crypto_news_data.csv")

# The target is the 'close' column; the inputs are every other column.
X, y = merged_data.drop(columns=['close']), merged_data['close']

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, AdamW
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

class CryptoNewsDataset(Dataset):
    """Dataset pairing each entry of ``news_vectors`` with the target at the same index."""

    def __init__(self, news_vectors, targets):
        """Store the inputs and targets; both are indexed by position."""
        self.targets = targets
        self.news_vectors = news_vectors

    def __len__(self):
        """Return the number of targets."""
        size = len(self.targets)
        return size

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        sample = self.news_vectors[idx]
        target = self.targets[idx]
        return sample, target

# Load the merged price/news table from disk.
# NOTE(review): the tokenization below needs a "title" column in this CSV —
# confirm that the cell writing the file actually includes one.
data = pd.read_csv("merged_crypto_news_data.csv")

# Seed torch so batch shuffling and any freshly initialised weights repeat across runs.
torch.manual_seed(42)

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize news titles
train_encodings = tokenizer(train_data["title"].tolist(), padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_data["title"].tolist(), padding=True, truncation=True, return_tensors="pt")

# Stack token ids and attention mask into one (N, 2, seq_len) tensor per split so
# the mask travels with its ids through the dataset; without the mask the model
# would attend to padding tokens.
train_inputs = torch.stack([train_encodings["input_ids"], train_encodings["attention_mask"]], dim=1)
val_inputs = torch.stack([val_encodings["input_ids"], val_encodings["attention_mask"]], dim=1)

# Create torch datasets
train_dataset = CryptoNewsDataset(train_inputs, torch.tensor(train_data["close"].values))
val_dataset = CryptoNewsDataset(val_inputs, torch.tensor(val_data["close"].values))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load model with a single output, trained below as a regressor with MSE loss.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train model. torch's AdamW is used because the transformers implementation is
# deprecated in its favour; eps and weight_decay are set to that implementation's
# defaults so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
loss_fn = torch.nn.MSELoss()
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=inputs[:, 0], attention_mask=inputs[:, 1])
        # squeeze(-1) drops only the output dimension, so a final batch of size 1
        # keeps its batch dimension and still lines up with the labels.
        loss = loss_fn(outputs.logits.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()

    # Evaluate model
    model.eval()
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(input_ids=inputs[:, 0], attention_mask=inputs[:, 1])
            # A bare squeeze() would turn a size-1 batch into a scalar, and
            # tolist() would then return a float that extend() cannot iterate.
            val_preds.extend(outputs.logits.squeeze(-1).tolist())
            val_labels.extend(labels.tolist())
    mse = mean_squared_error(val_labels, val_preds)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation MSE: {mse}")