In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
from transformers import pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import joblib
from datetime import datetime, timedelta



In [3]:
# Step 1: Download Bitcoin Price Data
def download_btc_price(start_date, end_date):
    """Download daily BTC-USD prices and return only the closing price.

    Parameters
    ----------
    start_date, end_date : str
        Date bounds forwarded unchanged to ``yf.download`` as ``start`` and
        ``end``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single flat ``"Close"`` column, indexed as returned by
        ``yf.download``.
    """
    btc = yf.download("BTC-USD", start=start_date, end=end_date)
    # Some yfinance versions return MultiIndex columns (field, ticker) even for
    # a single ticker. Flatten to the field level so callers can rely on a
    # plain "Close" column.
    if isinstance(btc.columns, pd.MultiIndex):
        btc = btc.copy()
        btc.columns = btc.columns.get_level_values(0)
    return btc[["Close"]]

In [4]:
# Step 2: Load Tweets from CSV
def load_tweets_from_csv(file_path):
    """Load tweets from a CSV file and normalise the columns used downstream.

    The CSV must contain a ``date`` column and a ``text`` column.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a ``timestamp`` column (parsed from ``date``),
        with ``text`` cast to ``str``. Rows whose text is missing or whose
        date cannot be parsed are dropped.

    Raises
    ------
    KeyError
        If the ``date`` or ``text`` column is absent.
    """
    posts_df = pd.read_csv(file_path, low_memory=False)  # Handle large files

    missing = {"date", "text"} - set(posts_df.columns)
    if missing:
        raise KeyError(f"CSV is missing required column(s): {sorted(missing)}")

    return (
        posts_df
        # Drop missing text first: astype(str) would otherwise turn NaN into
        # the literal string "nan", which would then be scored as a tweet.
        .dropna(subset=["text"])
        .assign(
            # Malformed date cells become NaT instead of aborting the load.
            timestamp=lambda df: pd.to_datetime(df["date"], errors="coerce"),
            text=lambda df: df["text"].astype(str),
        )
        .dropna(subset=["timestamp"])
    )

In [5]:
# Step 3: Sentiment Analysis
def analyze_sentiment(posts_df, model_name="distilbert-base-uncased-finetuned-sst-2-english", batch_size=32):
    """Score every tweet and store a signed sentiment in ``posts_df["sentiment"]``.

    A result labelled ``"POSITIVE"`` contributes ``+score``; any other label
    contributes ``-score``.

    Parameters
    ----------
    posts_df : pandas.DataFrame
        Must contain a ``text`` column. The frame is modified in place (a
        ``sentiment`` column is added) and also returned.
    model_name : str, optional
        Checkpoint passed to the ``sentiment-analysis`` pipeline. The default
        is a DistilBERT checkpoint fine-tuned for sentiment; a base checkpoint
        such as ``distilbert-base-uncased`` has no trained classification head
        and does not emit ``POSITIVE``/``NEGATIVE`` labels.
    batch_size : int, optional
        Number of texts forwarded to the pipeline per batch.

    Returns
    -------
    pandas.DataFrame
        ``posts_df`` with the ``sentiment`` column added.
    """
    texts = posts_df["text"].astype(str).tolist()
    if not texts:
        # Nothing to score: skip the model load and add an empty float column.
        posts_df["sentiment"] = pd.Series(index=posts_df.index, dtype="float64")
        return posts_df

    sentiment_analyzer = pipeline("sentiment-analysis", model=model_name)
    # One batched call, so each text is scored once. truncation=True lets the
    # tokenizer cut at the model's token limit rather than guessing by characters.
    results = sentiment_analyzer(texts, truncation=True, batch_size=batch_size)
    posts_df["sentiment"] = [
        result["score"] if result["label"] == "POSITIVE" else -result["score"]
        for result in results
    ]
    return posts_df

In [7]:
# Step 4: Prepare Data for DNN
def prepare_data(price_df, posts_df, seq_length=7):
    """Join daily prices with daily mean sentiment and build LSTM sequences.

    Parameters
    ----------
    price_df : pandas.DataFrame
        Date-indexed frame containing a ``Close`` column.
    posts_df : pandas.DataFrame
        Frame with ``timestamp`` and ``sentiment`` columns. It is not modified.
    seq_length : int, optional
        Number of consecutive days in each input window.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_windows, seq_length, 2)``: scaled ``[Close, sentiment]``.
    y : numpy.ndarray
        Shape ``(n_windows,)``: the scaled ``Close`` for the day following
        each window.
    scaler : MinMaxScaler
        Scaler fitted on the ``Close`` and ``sentiment`` columns.
    data : pandas.DataFrame
        The merged, date-sorted frame the sequences were built from.

    Raises
    ------
    ValueError
        If prices and tweets share no dates.
    """
    prices = price_df.copy()
    if isinstance(prices.columns, pd.MultiIndex):
        # Flatten (field, ticker) columns so "Close" is addressable.
        prices.columns = prices.columns.get_level_values(0)

    # Both join keys must be tz-naive midnight timestamps. datetime.date
    # objects from `.dt.date` do not align with a DatetimeIndex in a merge.
    price_days = pd.DatetimeIndex(prices.index)
    if price_days.tz is not None:
        price_days = price_days.tz_localize(None)
    prices.index = price_days.normalize()

    tweet_days = pd.to_datetime(posts_df["timestamp"])
    if tweet_days.dt.tz is not None:
        tweet_days = tweet_days.dt.tz_localize(None)
    tweet_days = tweet_days.dt.normalize()

    # Aggregate in a scratch frame so the caller's columns (including any
    # existing "date" column) are left untouched.
    daily_sentiment = (
        pd.DataFrame({
            "day": tweet_days.to_numpy(),
            "sentiment": posts_df["sentiment"].to_numpy(),
        })
        .groupby("day")["sentiment"]
        .mean()
    )

    data = pd.merge(
        prices, daily_sentiment, left_index=True, right_index=True, how="inner"
    ).sort_index()  # windows below assume chronological order
    if data.empty:
        raise ValueError(
            "No overlapping dates between price data and tweets; "
            "check that the price date range matches the tweet timestamps."
        )

    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data[["Close", "sentiment"]])
    n_features = scaled_data.shape[1]

    windows, targets = [], []
    for start in range(len(scaled_data) - seq_length):
        stop = start + seq_length
        windows.append(scaled_data[start:stop])
        targets.append(scaled_data[stop, 0])  # column 0 is the scaled Close

    # Keep a consistent 3-D shape even when there are too few rows for a window.
    X = np.array(windows) if windows else np.empty((0, seq_length, n_features))
    y = np.array(targets, dtype=float)
    return X, y, scaler, data

In [8]:
# Step 5: Build and Train LSTM Model
def build_and_train_model(X, y):
    """Assemble a two-layer LSTM regressor and fit it on the given sequences.

    Parameters
    ----------
    X : array-like
        3-D input; its second and third dimensions define the input shape of
        the first LSTM layer.
    y : array-like
        Regression targets, one per sequence in ``X``.

    Returns
    -------
    The fitted ``Sequential`` model.
    """
    window_shape = (X.shape[1], X.shape[2])

    # Stack: LSTM(64, full sequence out) -> LSTM(32) -> single-unit Dense head.
    encoder = LSTM(64, return_sequences=True, input_shape=window_shape)
    summarizer = LSTM(32)
    regressor = Dense(1)
    model = Sequential([encoder, summarizer, regressor])

    model.compile(optimizer="adam", loss="mse")

    fit_options = {
        "epochs": 50,
        "batch_size": 32,
        "validation_split": 0.1,
        "verbose": 1,
    }
    model.fit(X, y, **fit_options)
    return model

In [1]:
# Main Execution
if __name__ == "__main__":
    # NOTE: run the import cell and every function-definition cell before this
    # one. Running it on a fresh kernel fails with
    # "NameError: name 'datetime' is not defined".

    seq_length = 7       # days of history fed to the LSTM per sample
    max_tweets = 10000   # cap on tweets scored, to keep sentiment analysis fast

    # Define time range (adjust to match your CSV data; this is an example)
    end_date = datetime(2025, 2, 25)
    start_date = end_date - timedelta(days=30)

    # Download Bitcoin price data
    price_df = download_btc_price(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"))
    # Mount Google Drive (run this first in Colab)
    from google.colab import drive
    drive.mount('/content/drive')

    # Load tweets from Kaggle CSV
    file_path = "/content/drive/My Drive/Colab Notebooks/bitcoin_pred/BitcoinTweets.csv"  # Update if filename differs after download
    posts_df = load_tweets_from_csv(file_path)

    # Restrict tweets to the price window BEFORE sampling. Sampling the whole
    # CSV first spends most of the budget on tweets that can never be joined
    # to a price row.
    stamps = posts_df["timestamp"]
    if stamps.dt.tz is not None:
        stamps = stamps.dt.tz_localize(None)
    posts_df = posts_df[(stamps >= start_date) & (stamps < end_date)].copy()
    if posts_df.empty:
        raise ValueError(
            f"No tweets between {start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}; "
            "adjust start_date/end_date to match the CSV."
        )

    # Optional: Sample if too large, for speed
    if len(posts_df) > max_tweets:
        posts_df = posts_df.sample(n=max_tweets, random_state=42)

    # Process sentiment
    print("Analyzing sentiment (this may take a while)...")
    posts_df = analyze_sentiment(posts_df)

    # Prepare data
    X, y, scaler, data = prepare_data(price_df, posts_df, seq_length=seq_length)
    if len(X) == 0:
        raise ValueError(
            f"Only {len(data)} day(s) have both a price and tweets; "
            f"more than {seq_length} are needed to build a training sequence."
        )

    # Train and save model
    model = build_and_train_model(X, y)
    model.save("btc_model.h5")
    joblib.dump(scaler, "scaler.pkl")
    print("Model trained and saved as 'btc_model.h5'; Scaler saved as 'scaler.pkl'")

    # Test prediction with old data: feed the most recent window back in
    last_sequence = scaler.transform(data.tail(seq_length)[["Close", "sentiment"]])
    n_features = last_sequence.shape[1]
    last_sequence = last_sequence.reshape(1, seq_length, n_features)
    prediction = model.predict(last_sequence)
    # The scaler was fitted on [Close, sentiment], so inverse_transform needs a
    # full-width row; only column 0 (Close) is meaningful here.
    dummy = np.zeros((1, n_features))
    dummy[0, 0] = prediction[0, 0]  # take the scalar, not a 1-element array
    predicted_price = scaler.inverse_transform(dummy)[0, 0]
    # Label the forecast with the day after the last observed row rather than
    # a hard-coded date.
    predicted_day = pd.Timestamp(data.index[-1]) + timedelta(days=1)
    print(f"Test prediction with old data for {predicted_day:%b %d, %Y}: ${predicted_price:.2f}")

NameError: name 'datetime' is not defined