# Bitcoin Price Prediction using Machine Learning

**Objective:** Develop a machine learning model to predict Bitcoin price trends based on historical data, helping traders make informed investment decisions.

**Dataset:** Bitcoin Historical Price Dataset

## Project Structure:
1. Importing Necessary Libraries and Dataset
2. Data Preprocessing
3. Exploratory Data Analysis (EDA)
4. Feature Engineering
5. Model Training and Selection
6. Model Evaluation and Comparison
7. Future Price Prediction and Insights
8. Conclusion and Recommendations

## 1. Importing Necessary Libraries and Dataset

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# Deep Learning Libraries
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    from tensorflow.keras.optimizers import Adam
    TENSORFLOW_AVAILABLE = True
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("TensorFlow not available. LSTM model will be skipped.")

# Set plotting style
# The matplotlib style is selected first and the seaborn palette second; both
# calls configure global plot styling, so their order is left untouched.
plt.style.use('default')
sns.set_palette("husl")

# NOTE: this message is printed unconditionally, i.e. also when the optional
# TensorFlow import above failed and TENSORFLOW_AVAILABLE is False.
print("All libraries imported successfully!")

In [None]:
# Upload and load the dataset
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import files

uploaded = files.upload()

# `uploaded` is indexed by file name below. If it comes back empty
# (presumably when the upload dialog is cancelled - confirm against the Colab
# docs), fail with a clear message instead of an opaque lookup error.
if not uploaded:
    raise ValueError("No file was uploaded. Please upload the Bitcoin CSV file.")

# Load the first uploaded file (assuming it's the bitcoin.csv)
filename = next(iter(uploaded))
df = pd.read_csv(filename)

print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {df.columns.tolist()}")
print("\nFirst 5 rows:")
df.head()

In [None]:
# Structural overview of the loaded frame: dtypes, null counts, statistics
print("Dataset Information:")
df.info()

# Number of missing entries in each column
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# Summary statistics are left as the cell's last expression so they render
print("\nBasic Statistics:")
df.describe()

## 2. Data Preprocessing

In [None]:
# Parse the Date column, promote it to the index, and order rows chronologically
df = (
    df.assign(Date=pd.to_datetime(df['Date']))
    .set_index('Date')
    .sort_index()
)

first_day, last_day = df.index.min(), df.index.max()
print("Date column converted to datetime and set as index")
print(f"Date range: {first_day} to {last_day}")
print(f"Total trading days: {len(df)}")

df.head()

In [None]:
# Handle missing values (if any) by forward filling: each gap takes the most
# recent earlier value in the same column.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose
# `method` argument is deprecated in current pandas releases.
df = df.ffill()

# Create additional price features
# Fractional change of Close versus the previous row (0.05 means +5 %);
# the first row has no predecessor and is NaN.
df['Price_Change'] = df['Close'].pct_change()
# Intraday range expressed as High / Low.
df['High_Low_Ratio'] = df['High'] / df['Low']
# Open / Close of the same row.
df['Open_Close_Ratio'] = df['Open'] / df['Close']

print("Additional price features created:")
print("- Price_Change: Daily percentage change in closing price")
print("- High_Low_Ratio: Ratio of high to low price")
print("- Open_Close_Ratio: Ratio of open to close price")

df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Bitcoin price trends over time
# Four-panel overview: closing price, traded volume, distribution of daily
# price changes, and volume plotted against closing price.
plt.figure(figsize=(15, 8))

# Panel 1: closing price across the whole date index
plt.subplot(2, 2, 1)
plt.plot(df.index, df['Close'], color='orange', linewidth=2)
plt.title('Bitcoin Price Over Time', fontsize=14, fontweight='bold')
plt.ylabel('Price (USD)')
plt.grid(True, alpha=0.3)

# Panel 2: traded volume across the whole date index
plt.subplot(2, 2, 2)
plt.plot(df.index, df['Volume'], color='blue', alpha=0.7)
plt.title('Bitcoin Trading Volume Over Time', fontsize=14, fontweight='bold')
plt.ylabel('Volume')
plt.grid(True, alpha=0.3)

# Panel 3: histogram of daily changes.
# `Price_Change` comes from `pct_change()` and is therefore a fraction
# (0.05 == 5 %); scale by 100 so the data matches the "%" axis label.
plt.subplot(2, 2, 3)
daily_change_pct = df['Price_Change'].dropna() * 100
plt.hist(daily_change_pct, bins=50, color='green', alpha=0.7)
plt.title('Distribution of Daily Price Changes', fontsize=14, fontweight='bold')
plt.xlabel('Price Change (%)')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# Panel 4: volume against closing price, one point per row
plt.subplot(2, 2, 4)
plt.scatter(df['Volume'], df['Close'], alpha=0.5, color='red')
plt.title('Volume vs Close Price', fontsize=14, fontweight='bold')
plt.xlabel('Volume')
plt.ylabel('Close Price (USD)')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Restrict to numeric columns first: on pandas >= 2.0 `DataFrame.corr()`
# raises if the frame still contains a non-numeric column (for example a
# text column carried over from the CSV).
correlation_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Correlation Matrix of Bitcoin Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# List every column's correlation with Close, strongest positive first
print("Key Correlations with Close Price:")
close_corr = correlation_matrix['Close'].sort_values(ascending=False)
for feature, corr in close_corr.items():
    print(f"{feature}: {corr:.3f}")

## 4. Feature Engineering

In [None]:
# Technical Indicators: trend and volatility columns derived from Close
close_series = df['Close']

# Simple moving averages over 7-, 21- and 50-row windows
for sma_window in (7, 21, 50):
    df[f'SMA_{sma_window}'] = close_series.rolling(window=sma_window).mean()

# Exponential moving averages with spans of 12 and 26
for ema_span in (12, 26):
    df[f'EMA_{ema_span}'] = close_series.ewm(span=ema_span).mean()

# Bollinger Bands: 20-row rolling mean plus/minus two rolling standard deviations
window = 20
close_roller = close_series.rolling(window)
rolling_mean = close_roller.mean()
rolling_std = close_roller.std()
band_offset = rolling_std * 2
df['BB_Upper'] = rolling_mean + band_offset
df['BB_Lower'] = rolling_mean - band_offset
df['BB_Width'] = df['BB_Upper'] - df['BB_Lower']

# RSI (Relative Strength Index)
def calculate_rsi(price, window=14):
    """Return the RSI of *price* based on simple rolling means.

    Args:
        price: pandas Series of prices.
        window: number of rows averaged for gains and for losses.

    Returns:
        Series aligned with *price*. The first ``window - 1`` entries are NaN
        because the rolling means are not yet defined there.
    """
    change = price.diff()
    # Keep only upward moves; everything else (including the leading NaN) is 0.
    ups = change.where(change > 0, 0)
    # Keep only downward moves and flip their sign so losses are positive.
    downs = -change.where(change < 0, 0)
    avg_up = ups.rolling(window=window).mean()
    avg_down = downs.rolling(window=window).mean()
    relative_strength = avg_up / avg_down
    return 100 - (100 / (1 + relative_strength))

# Attach the RSI column computed from closing prices
df['RSI'] = calculate_rsi(df['Close'])

# MACD line (EMA 12 minus EMA 26), its 9-span EMA signal line, and their gap
macd_line = df['EMA_12'] - df['EMA_26']
signal_line = macd_line.ewm(span=9).mean()
df['MACD'] = macd_line
df['MACD_Signal'] = signal_line
df['MACD_Histogram'] = macd_line - signal_line

# Summary of everything added by the feature-engineering cell
indicator_notes = (
    "- Simple Moving Averages (7, 21, 50 days)",
    "- Exponential Moving Averages (12, 26 days)",
    "- Bollinger Bands",
    "- RSI (Relative Strength Index)",
    "- MACD (Moving Average Convergence Divergence)",
)
print("Technical indicators created:")
for note in indicator_notes:
    print(note)

print(f"\nDataset shape after feature engineering: {df.shape}")
df.head()

In [None]:
# Visualize technical indicators in three stacked panels
fig, (price_ax, rsi_ax, macd_ax) = plt.subplots(3, 1, figsize=(15, 12))
dates = df.index

# Top panel: closing price overlaid with the three simple moving averages
price_ax.plot(dates, df['Close'], label='Close Price', linewidth=2)
for sma_days in (7, 21, 50):
    price_ax.plot(dates, df[f'SMA_{sma_days}'], label=f'SMA {sma_days}', alpha=0.8)
price_ax.set_title('Bitcoin Price with Moving Averages', fontsize=14, fontweight='bold')
price_ax.set_ylabel('Price (USD)')
price_ax.legend()
price_ax.grid(True, alpha=0.3)

# Middle panel: RSI with dashed reference lines at 70 (red) and 30 (green)
rsi_ax.plot(dates, df['RSI'], color='purple', linewidth=2)
for level, level_color in ((70, 'r'), (30, 'g')):
    rsi_ax.axhline(y=level, color=level_color, linestyle='--', alpha=0.7)
rsi_ax.set_title('RSI (Relative Strength Index)', fontsize=14, fontweight='bold')
rsi_ax.set_ylabel('RSI')
rsi_ax.set_ylim(0, 100)
rsi_ax.grid(True, alpha=0.3)

# Bottom panel: MACD line, signal line, and their difference as bars
macd_ax.plot(dates, df['MACD'], label='MACD', linewidth=2)
macd_ax.plot(dates, df['MACD_Signal'], label='Signal Line', linewidth=2)
macd_ax.bar(dates, df['MACD_Histogram'], label='Histogram', alpha=0.3)
macd_ax.set_title('MACD (Moving Average Convergence Divergence)', fontsize=14, fontweight='bold')
macd_ax.set_ylabel('MACD')
macd_ax.legend()
macd_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Model Training and Selection

In [None]:
# Prepare data for machine learning
# Drop rows with NaN values (created by technical indicators)
df_clean = df.dropna()

# Select features for prediction
feature_columns = ['Open', 'High', 'Low', 'Volume', 'Price_Change',
                  'High_Low_Ratio', 'Open_Close_Ratio', 'SMA_7', 'SMA_21',
                  'SMA_50', 'EMA_12', 'EMA_26', 'BB_Upper', 'BB_Lower',
                  'BB_Width', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Histogram']

# Target = the NEXT row's closing price.
# Using the same row's Close would leak the answer: `Open_Close_Ratio` is
# Open / Close, so together with `Open` it determines Close exactly, and the
# moving averages / MACD / RSI are all computed from that same Close.
# Shifting the target one row forward turns the task into a real forecast.
next_close = df_clean['Close'].shift(-1)

# The final row has no "next close" yet, so it is left out of X / y here.
# It stays in `df_clean`, where it is still available as the most recent
# feature row. `y` keeps the index of its feature row (day t), while its
# value is the close of the following row (day t+1).
X = df_clean[feature_columns].iloc[:-1]
y = next_close.iloc[:-1]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Clean dataset shape: {df_clean.shape}")

# Split data into training and testing sets (80-20 split).
# shuffle=False keeps chronological order, so the test set is strictly later
# than the training set (random_state is irrelevant without shuffling).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    shuffle=False)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Normalize features using MinMaxScaler.
# The scaler is fitted on the training rows only and merely applied to the
# test rows, so no test-period statistics reach the models.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures normalized using MinMaxScaler")

In [None]:
# Function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression error metrics for one model.

    Args:
        y_true: actual target values.
        y_pred: predicted values, same length as ``y_true``.
        model_name: label used in the printed report.

    Returns:
        dict with the keys 'MAE', 'MSE', 'RMSE' and 'R2'.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"{model_name} Performance:")
    print(f"  MAE: ${mae:.2f}")
    # MSE is in squared price units, so it must not carry a plain "$" prefix.
    print(f"  MSE: {mse:.2f} (USD²)")
    print(f"  RMSE: ${rmse:.2f}")
    print(f"  R² Score: {r2:.4f}")
    print("-" * 50)

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Dictionary to store model results (model name -> metrics dict).
# Re-running this cell resets it to empty.
model_results = {}

In [None]:
# 1. Linear Regression
# Fitted on the scaled training features and scored on the scaled test features
print("Training Linear Regression Model...")
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_metrics = evaluate_model(y_test, lr_pred, 'Linear Regression')
model_results['Linear Regression'] = lr_metrics

In [None]:
# 2. Random Forest Regressor
# 100 trees with a fixed seed; n_jobs=-1 is passed through to scikit-learn
print("Training Random Forest Model...")
rf_settings = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
rf_metrics = evaluate_model(y_test, rf_pred, 'Random Forest')
model_results['Random Forest'] = rf_metrics

In [None]:
# 3. Support Vector Regression (SVR)
# RBF-kernel SVR; results are stored under the short key 'SVR'
print("Training Support Vector Regression Model...")
svr_settings = {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
svr_model = SVR(**svr_settings)
svr_model.fit(X_train_scaled, y_train)
svr_pred = svr_model.predict(X_test_scaled)
svr_metrics = evaluate_model(y_test, svr_pred, 'Support Vector Regression')
model_results['SVR'] = svr_metrics

In [None]:
# 4. XGBoost Regressor
# Gradient-boosted trees: 100 rounds, depth 6, learning rate 0.1, fixed seed
print("Training XGBoost Model...")
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = xgb.XGBRegressor(**xgb_settings)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_test_scaled)
xgb_metrics = evaluate_model(y_test, xgb_pred, 'XGBoost')
model_results['XGBoost'] = xgb_metrics

In [None]:
# 5. LSTM Deep Learning Model (only if TensorFlow is available)
if TENSORFLOW_AVAILABLE:
    print("Preparing data for LSTM Model...")

    # Prepare sequential data for LSTM
    def create_sequences(data, target, sequence_length=60):
        """Build sliding windows and the value that follows each window.

        Args:
            data: array whose first axis is time.
            target: 1-D array aligned with ``data``.
            sequence_length: number of consecutive rows per input window.

        Returns:
            Tuple ``(X, y)`` of numpy arrays: ``X[k]`` is
            ``data[k:k + sequence_length]`` and ``y[k]`` is
            ``target[k + sequence_length]``.
        """
        window_ends = range(sequence_length, len(data))
        X = [data[end - sequence_length:end] for end in window_ends]
        y = [target[end] for end in window_ends]
        return np.array(X), np.array(y)

    # Use only Close price for LSTM to keep it simple
    lstm_data = df_clean['Close'].values.reshape(-1, 1)

    sequence_length = 60
    n_sequences = len(lstm_data) - sequence_length
    if n_sequences <= 0:
        raise ValueError(
            f"Need more than {sequence_length} rows to build LSTM sequences, "
            f"got {len(lstm_data)}."
        )

    # 80/20 chronological split, expressed in number of sequences
    split_index = int(0.8 * n_sequences)

    # Fit the scaler on the training span only. Training sequence k uses rows
    # k .. k + sequence_length, so the training set touches rows
    # [0, sequence_length + split_index). Fitting on the full series would
    # leak the test period's min/max into training.
    lstm_scaler = MinMaxScaler()
    lstm_scaler.fit(lstm_data[:sequence_length + split_index])
    lstm_data_scaled = lstm_scaler.transform(lstm_data)

    # Create sequences
    X_lstm, y_lstm = create_sequences(lstm_data_scaled, lstm_data_scaled.flatten(), sequence_length)

    # Split LSTM data
    X_train_lstm, X_test_lstm = X_lstm[:split_index], X_lstm[split_index:]
    y_train_lstm, y_test_lstm = y_lstm[:split_index], y_lstm[split_index:]

    print(f"LSTM Training sequences: {X_train_lstm.shape}")
    print(f"LSTM Testing sequences: {X_test_lstm.shape}")

    # Build LSTM model: two stacked 50-unit LSTM layers with dropout,
    # followed by a 25-unit dense layer and a single-value output.
    lstm_model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(sequence_length, 1)),
        Dropout(0.2),
        LSTM(50, return_sequences=False),
        Dropout(0.2),
        Dense(25),
        Dense(1)
    ])

    lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    print("\nLSTM Model Architecture:")
    lstm_model.summary()
else:
    print("TensorFlow not available. LSTM model skipped.")

In [None]:
# Train LSTM model (if available)
# Training only runs when TensorFlow imported and the model was actually built
lstm_ready = TENSORFLOW_AVAILABLE and 'lstm_model' in locals()
if not lstm_ready:
    print("LSTM model training skipped.")
else:
    print("Training LSTM Model...")
    fit_options = {
        'batch_size': 32,
        'epochs': 50,
        'validation_split': 0.1,
        'verbose': 1,
    }
    history = lstm_model.fit(X_train_lstm, y_train_lstm, **fit_options)

    # Predict in scaled space, then map predictions and targets back to prices
    lstm_pred_scaled = lstm_model.predict(X_test_lstm)
    lstm_pred = lstm_scaler.inverse_transform(lstm_pred_scaled)
    y_test_lstm_actual = lstm_scaler.inverse_transform(y_test_lstm.reshape(-1, 1))

    lstm_metrics = evaluate_model(
        y_test_lstm_actual.flatten(), lstm_pred.flatten(), 'LSTM'
    )
    model_results['LSTM'] = lstm_metrics

## 6. Model Evaluation and Comparison

In [None]:
# Compare all model performances
# One row per model, one column per metric, lowest RMSE first
results_df = pd.DataFrame(model_results).T.sort_values('RMSE')

banner = "=" * 60
print("\n" + banner)
print("MODEL PERFORMANCE COMPARISON (Sorted by RMSE)")
print(banner)
print(results_df.round(2))

# Visualize model comparison: one bar chart per metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['MAE', 'RMSE', 'R2']
for ax, metric in zip(axes.flat, metrics):
    results_df[metric].plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Only three metrics are drawn, so drop the unused fourth panel
fig.delaxes(axes[1, 1])

plt.tight_layout()
plt.show()

# Find best model: the first row after sorting has the smallest RMSE
best_model_name = results_df.index[0]
best_row = results_df.loc[best_model_name]
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   RMSE: ${best_row['RMSE']:.2f}")
print(f"   R² Score: {best_row['R2']:.4f}")

In [None]:
# Visualize predictions vs actual prices
# One panel per classical model, each overlaying its predictions on the
# actual test-set values.
plt.figure(figsize=(15, 10))

# Get test dates for plotting
test_dates = y_test.index
actual_prices = y_test.values

panels = [
    ('Linear Regression', lr_pred),
    ('Random Forest', rf_pred),
    ('SVR', svr_pred),
    ('XGBoost', xgb_pred),
]

for position, (model_label, predictions) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(test_dates, actual_prices, label='Actual', linewidth=2, color='black')
    plt.plot(test_dates, predictions, label=model_label, alpha=0.8)
    plt.title(f'{model_label} Predictions', fontsize=12, fontweight='bold')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature Importance Analysis (for tree-based models)
# Runs only when the best model's name contains one of the tree-model labels
tree_based_best = any(
    tag in best_model_name for tag in ('Random Forest', 'XGBoost')
)
if tree_based_best:
    chosen_model = rf_model if best_model_name == 'Random Forest' else xgb_model
    feature_importance = chosen_model.feature_importances_

    # Pair each feature name with its score, highest score first
    importance_df = pd.DataFrame(
        {'Feature': feature_columns, 'Importance': feature_importance}
    ).sort_values('Importance', ascending=False)
    top_features = importance_df.head(10)

    # Horizontal bar chart of the ten highest-scoring features
    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, x='Importance', y='Feature', palette='viridis')
    plt.title(f'Top 10 Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    for rank, entry in enumerate(top_features.itertuples(index=False), 1):
        print(f"{rank:2d}. {entry.Feature:20s}: {entry.Importance:.4f}")

## 7. Future Price Prediction and Insights

In [None]:
# Make future predictions using the best performing model
print(f"Making future predictions using {best_model_name}...")

current_price = df_clean['Close'].iloc[-1]

if best_model_name == 'LSTM':
    # The LSTM takes the most recent `sequence_length` scaled closes and
    # outputs the scaled value that follows them.
    recent_closes = df_clean['Close'].values[-sequence_length:].reshape(-1, 1)
    recent_scaled = lstm_scaler.transform(recent_closes)
    scaled_forecast = lstm_model.predict(recent_scaled.reshape(1, sequence_length, 1))
    predicted_price = lstm_scaler.inverse_transform(scaled_forecast)[0, 0]
else:
    # Select the best model by its results key
    trained_models = {
        'Linear Regression': lr_model,
        'Random Forest': rf_model,
        'SVR': svr_model,
        'XGBoost': xgb_model,
    }
    best_model = trained_models[best_model_name]

    # Scale the most recent feature rows; only the latest row is fed to the
    # model, which yields ONE estimate (not a multi-day forecast).
    # NOTE(review): this is a genuine "next price" only if the models were
    # trained on a forward-shifted target; with a same-row Close target it
    # merely re-estimates the latest close. Check how `y` is built.
    last_30_days = df_clean[feature_columns].tail(30)
    last_30_days_scaled = scaler.transform(last_30_days)
    recent_prediction = best_model.predict(last_30_days_scaled[-1:])
    predicted_price = recent_prediction[0]

# Relative difference between the estimate and the latest close, in percent
price_change = ((predicted_price - current_price) / current_price) * 100

print(f"\n📊 PRICE PREDICTION SUMMARY")
print(f"Current Bitcoin Price: ${current_price:.2f}")
print(f"Predicted Next Price: ${predicted_price:.2f}")
print(f"Expected Change: {price_change:+.2f}%")

if price_change > 0:
    print("📈 BULLISH Signal: Price expected to rise")
else:
    print("📉 BEARISH Signal: Price expected to fall")

## 8. Conclusion and Recommendations

In [None]:
# Closing report: best model metrics followed by three fixed text sections
rule = "=" * 70
print("\n" + rule)
print("🎯 PROJECT CONCLUSION AND RECOMMENDATIONS")
print(rule)

best_metrics = results_df.loc[best_model_name]
print(f"✅ BEST PERFORMING MODEL: {best_model_name}")
print(f"   - RMSE: ${best_metrics['RMSE']:.2f}")
print(f"   - R² Score: {best_metrics['R2']:.4f}")
print(f"   - Mean Absolute Error: ${best_metrics['MAE']:.2f}")

# Each entry is (heading, lines printed underneath it)
report_sections = (
    ("\n🔍 KEY INSIGHTS:", (
        "   • Technical indicators significantly improve prediction accuracy",
        "   • Moving averages and RSI are strong predictive features",
        "   • Bitcoin price shows high volatility, making perfect prediction challenging",
        "   • Machine learning models can capture trends but not extreme market events",
    )),
    ("\n💡 FUTURE IMPROVEMENTS:", (
        "   • Include external factors (news sentiment, market cap, etc.)",
        "   • Implement ensemble methods combining multiple models",
        "   • Use transformer architectures for better sequence modeling",
        "   • Add real-time data feeds for live predictions",
        "   • Implement risk management and position sizing strategies",
    )),
    ("\n⚠️  DISCLAIMER:", (
        "   This model is for educational purposes only.",
        "   Cryptocurrency trading involves significant risks.",
        "   Always conduct thorough research before making investment decisions.",
    )),
)
for heading, body_lines in report_sections:
    print(heading)
    for body_line in body_lines:
        print(body_line)
print(rule)