In [155]:

import argparse
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline         
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error       
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [85]:
# read the three already-saved CSVs
hp = (pd.read_csv("hashprice.csv", parse_dates=["timestamp"])
        # convert UTC seconds → America/Los_Angeles midnight
        .assign(date=lambda df:
                pd.to_datetime(df["timestamp"], unit="s", utc=True)
                  .dt.tz_convert("America/Los_Angeles")
                  .dt.normalize())
        .drop(columns=["timestamp"])
        .rename(columns={"usd_hashprice": "hash_usd",
                         "btc_hashprice": "hash_btc"})
        .set_index("date")
        .sort_index())

# 2⎯ pick the modelling target (here USD hash-price)
hp = hp.rename(columns={"usd_hashprice": "hash_usd",      # target
                        "btc_hashprice": "hash_btc"}) 
hp.index = hp.index.tz_localize(None)


In [87]:
sent = (pd.read_csv("fng.csv", parse_dates=["date"], dayfirst=True)
          .rename(columns={
              "fng_vale": "sentiment",          # numeric 0–100
              "fng_classification": "sentiment_class"
          })
          .set_index("date")
          .sort_index())
sent.index = sent.index.tz_localize(None)


In [89]:
# 1⎯ read, ignoring the metadata row
wx_raw = pd.read_csv(
    "weather.csv",
    skiprows=2,                 # drops the lat/long row
    parse_dates=["time"]        # converts the date string to Timestamp
)

# 2⎯ rename the columns to concise identifiers
wx = (wx_raw
       .assign(date=lambda df: pd.to_datetime(df["time"])
                               .dt.tz_localize("UTC")
                               .dt.tz_convert("America/Los_Angeles")
                               .dt.normalize()
                               .dt.tz_localize(None))
       .set_index("date")
       .rename(columns={"precipitation_sum (mm)": "precip",
                        "temperature_2m_mean (°C)": "temp_mean"}))
wx = wx.iloc[:, :2]
wx.index = wx.index.tz_localize(None)


In [91]:
for df in (hp, sent, wx):
    df.index = pd.to_datetime(df.index)   # safe even if already datetime
    df.sort_index(inplace=True)           # chronological order


In [93]:
for df in (hp, sent, wx):
    df.index = pd.to_datetime(df.index)          # guarantee `DatetimeIndex`
    df.sort_index(inplace=True)                 # chronological order
combined = pd.concat([hp, sent, wx], axis=1, join='inner')   # or 'outer'

In [95]:
hp  = hp.add_prefix("hp_")     # hp_time, hp_price, …
sent = sent.add_prefix("sent_")
wx   = wx.add_prefix("wx_")
combined = pd.concat([hp, sent, wx], axis=1, join="outer")  # keeps union of dates


In [97]:
print(combined)

            hp_hash_usd  hp_hash_btc  sent_fng_value sent_sentiment_class  \
date                                                                        
2020-06-20        76.73     0.008208             NaN                  NaN   
2020-06-21        77.87     0.008214             NaN                  NaN   
2020-06-22        80.25     0.008326             NaN                  NaN   
2020-06-23        78.82     0.008348             NaN                  NaN   
2020-06-24        76.89     0.008329             NaN                  NaN   
...                 ...          ...             ...                  ...   
2025-06-17        52.97     0.000506            68.0                Greed   
2025-06-18        52.73     0.000504            52.0              Neutral   
2025-06-19        52.62     0.000503            57.0                Greed   
2025-06-20        51.91     0.000503            54.0              Neutral   
2025-06-21          NaN          NaN            49.0              Neutral   

In [99]:
df = combined.copy()                     # keep original untouched

# 1.1  drop the redundant wx_time column
df = df.drop(columns="wx_time")

# 1.2  convert the sentiment class to one-hot dummies
df = pd.get_dummies(df, columns=["sent_sentiment_class"], prefix="sent_cls", dummy_na=False)

# 1.3  handle missing numeric values – simplest: forward-fill then back-fill
df = df.ffill().bfill()

# 1.4  scale all numeric columns except the target
num_cols = df.columns.drop("hp_hash_usd")
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [101]:
look_back  = 30      # L = 30 past days
horizon    = 1       # predict next day
target_col = df.columns.get_loc("hp_hash_usd")

X, y = [], []
values = df.values            # 2-D NumPy array (rows × columns)
for i in range(look_back, len(values) - horizon + 1):
    X.append(values[i-look_back:i])               # shape (L, d)
    y.append(values[i + horizon - 1, target_col]) # scalar
X = np.stack(X)
y = np.array(y)

In [103]:
n  = len(X)
n_train = int(0.6 * n)
n_val   = int(0.2 * n)

X_train, y_train = X[:n_train],           y[:n_train]
X_val,   y_val   = X[n_train:n_train+n_val], y[n_train:n_train+n_val]
X_test,  y_test  = X[n_train+n_val:],     y[n_train+n_val:]


In [105]:

class HashPriceGRU(nn.Module):
    def __init__(self, d_in, hidden=32):
        super().__init__()
        self.gru  = nn.GRU(d_in, hidden, batch_first=True)
        self.head = nn.Sequential(
            nn.Linear(hidden, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        _, h = self.gru(x)           # h shape: (1, batch, hidden)
        return self.head(h.squeeze(0))

In [129]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_loader = DataLoader(TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
), batch_size=32, shuffle=False)   # keep order within each batch

val_loader = DataLoader(TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.float32)
), batch_size=32, shuffle=False)

model = HashPriceGRU(d_in=X.shape[2]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

best_val = float("inf")
patience, epochs_no_improve = 10, 0
for epoch in range(100):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb).squeeze(1)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # --- validation ---
    model.eval()
    with torch.no_grad():
        val_loss = sum(
            criterion(model(xv.to(device)).squeeze(1), yv.to(device)).item()
            for xv, yv in val_loader
        ) / len(val_loader)
    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), "best_gru.pt")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            break

In [157]:
def generate_random_data(days: int = 730, start: str = "2023-01-01") -> pd.DataFrame:
    rng = np.random.default_rng(seed=42)
    dates = pd.date_range(start, periods=days, freq="D")

    base_price = 100.0
    returns = rng.normal(loc=0.0002, scale=0.01, size=days)
    price_series = base_price * np.exp(np.cumsum(returns))

    hash_usd = price_series / 2.0 + rng.normal(0, 1.0, size=days)
    hash_btc = hash_usd / 40000.0

    temp_mean = 12 + 8*np.sin(2*np.pi*np.arange(days)/365) + rng.normal(0, 1.5, size=days)

    sentiment_val = rng.uniform(0, 100, size=days)

    def label(v):
        if v < 25:
            return "Extreme_Fear"
        elif v < 50:
            return "Fear"
        elif v < 75:
            return "Neutral"
        else:
            return "Greed"
    sentiment_cls = np.vectorize(label)(sentiment_val)

    df = pd.DataFrame({
        "hp_hash_usd": hash_usd,
        "hp_hash_btc": hash_btc,
        "sent_fng_value": sentiment_val,
        "sent_sentiment_class": sentiment_cls,
        "wx_temp_mean": temp_mean,
    }, index=dates)
    return df



In [167]:
def make_supervised(df: pd.DataFrame, look_back: int = 30, horizon: int = 1):
    """
    Convert a daily DataFrame into supervised learning matrices (features, target)
    using a sliding window. For sklearn we flatten each window into 1-D.
    """
    vals = df.values
    target_idx = df.columns.get_loc("hp_hash_usd")
    X, y = [], []
    for i in range(look_back, len(vals) - horizon + 1):
        window = vals[i-look_back:i].flatten()      # shape (look_back * d,)
        X.append(window)
        y.append(vals[i + horizon - 1, target_idx])
    return np.stack(X), np.array(y)

def train_run(days: int = 730):
    # 1. generate data
    df = generate_random_data(days)
    # 2. preprocess: one-hot encode categorical sentiment
    df_proc = pd.get_dummies(df, columns=["sent_sentiment_class"], prefix="sent_cls")
    df_proc = df_proc.ffill().bfill()

    # 3. make supervised dataset
    L = 30
    X, y = make_supervised(df_proc, look_back=L, horizon=1)

    # 4. chronological split
    n = len(X)
    n_train = int(0.6 * n)
    n_val   = int(0.2 * n)
    X_train, y_train = X[:n_train], y[:n_train]
    X_val,   y_val   = X[n_train:n_train+n_val], y[n_train:n_train+n_val]
    X_test,  y_test  = X[n_train+n_val:], y[n_train+n_val:]

    # 5. model: StandardScaler + MLPRegressor
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(hidden_layer_sizes=(64, 32),
                             activation="relu",
                             solver="adam",
                             learning_rate_init=1e-3,
                             max_iter=500,
                             random_state=42))
    ])
    model.fit(X_train, y_train)

    # 6. evaluation
    pred_test = model.predict(X_test)
    mse  = mean_squared_error(y_test, pred_test)  # returns MSE
    rmse = np.sqrt(mse)     

    # 7. show first 5 predictions vs actual
    res = pd.DataFrame({"pred": pred_test[:5], "actual": y_test[:5]})
    print(f"Test RMSE (MLP): {rmse:.4f}")
    return res

results_head = train_run()
results_head


Test RMSE (MLP): 10.4679


Unnamed: 0,pred,actual
0,30.625755,48.568123
1,34.062917,48.900768
2,28.820216,50.358445
3,34.206878,48.002388
4,36.573406,49.172915


In [168]:

results_head

Test RMSE (MLP): 10.4679


Unnamed: 0,pred,actual
0,30.625755,48.568123
1,34.062917,48.900768
2,28.820216,50.358445
3,34.206878,48.002388
4,36.573406,49.172915


In [185]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from typing import List, Dict

# === 1. constants and helper functions ===
LOOK_BACK = 30
NUMERIC_VARS = ["hp_hash_usd", "hp_hash_btc", "sent_fng_value", "wx_temp_mean"]
CATEG_LABELS = ["Extreme_Fear", "Fear", "Neutral", "Greed"]

def _one_hot(label: str) -> np.ndarray:
    vec = np.zeros(len(CATEG_LABELS), dtype=float)
    try:
        vec[CATEG_LABELS.index(label)] = 1.0
    except ValueError:
        pass
    return vec

def _prepare_single_row(row_dict: Dict) -> np.ndarray:
    numeric_part = np.array([row_dict[k] for k in NUMERIC_VARS], dtype=float)
    one_hot_part = _one_hot(row_dict["sent_sentiment_class"])
    return np.concatenate([numeric_part, one_hot_part])

# === 2. random synthetic data generator ===
def generate_random_data(days: int = 365, start: str = "2024-01-01") -> pd.DataFrame:
    rng = np.random.default_rng(seed=0)
    dates = pd.date_range(start, periods=days, freq="D")
    
    base_price = 100.0
    returns = rng.normal(loc=0.0001, scale=0.01, size=days)
    price_series = base_price * np.exp(np.cumsum(returns))
    
    hash_usd = price_series / 2 + rng.normal(0, 1, size=days)
    hash_btc = hash_usd / 40000.0
    sentiment_val = rng.uniform(0, 100, size=days)
    temp_mean = 15 + 5*np.sin(2*np.pi*np.arange(days)/365) + rng.normal(0, 1.5, size=days)
    
    def label(v):
        if v < 25:
            return "Extreme_Fear"
        elif v < 50:
            return "Fear"
        elif v < 75:
            return "Neutral"
        else:
            return "Greed"
    sentiment_cls = np.vectorize(label)(sentiment_val)
    
    return pd.DataFrame({
        "hp_hash_usd": hash_usd,
        "hp_hash_btc": hash_btc,
        "sent_fng_value": sentiment_val,
        "sent_sentiment_class": sentiment_cls,
        "wx_temp_mean": temp_mean,
    }, index=dates)

# === 3. make supervised matrix ===
def make_supervised(df: pd.DataFrame, look_back: int = LOOK_BACK):
    df_enc = pd.get_dummies(df, columns=["sent_sentiment_class"], prefix="sent_cls")
    df_enc = df_enc.ffill().bfill()
    values = df_enc.values
    X, y = [], []
    target_idx = df_enc.columns.get_loc("hp_hash_usd")
    for i in range(look_back, len(values)):
        X.append(values[i-look_back:i].flatten())
        y.append(values[i, target_idx])
    return np.stack(X), np.array(y)

# === 4. prediction function ===
def predict_hash_usd_future(model: Pipeline, history: List[Dict], nday: int = 1) -> float:
    if len(history) != LOOK_BACK:
        raise ValueError(f"history must be {LOOK_BACK} days long")
    window = history.copy()
    for _ in range(nday):
        X_window = np.vstack([_prepare_single_row(d) for d in window]).flatten()[None, :]
        y_pred = model.predict(X_window)[0]
        last = window[-1]
        synthetic_next = {
            "hp_hash_usd": y_pred,
            "hp_hash_btc": y_pred / 40000.0,
            "sent_fng_value": last["sent_fng_value"],
            "wx_temp_mean": last["wx_temp_mean"],
            "sent_sentiment_class": last["sent_sentiment_class"]
        }
        window.append(synthetic_next)
        window.pop(0)  # keep length constant
    return y_pred

# === 5. train a simple model on synthetic data ===
df_rand = generate_random_data(400)
X_train, y_train = make_supervised(df_rand)

model = Pipeline([
    ("scaler", StandardScaler()),
    ("linreg", LinearRegression())
])
model.fit(X_train, y_train)

# === 6. prepare a random 30-day history window ===
history_frame = generate_random_data(LOOK_BACK, start="2025-01-01")
history_dicts = history_frame.to_dict(orient="records")

# === 7. test predictions ===
pred_1d = predict_hash_usd_future(model, history_dicts, nday=1)
pred_3d = predict_hash_usd_future(model, history_dicts, nday=2)

print("Prediction 1 day ahead:", pred_1d)
print("Prediction 3 days ahead:", pred_3d)


TypeError: 'float' object cannot be interpreted as an integer