In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

def load_data(price_file, news_file):
    """Read the price and news CSV files into DataFrames.

    Parameters
    ----------
    price_file : path or file-like
        Source passed to ``pd.read_csv`` for the price data.
    news_file : path or file-like
        Source passed to ``pd.read_csv`` for the news/sentiment data.

    Returns
    -------
    tuple
        ``(price_df, news_df)``, in that order.
    """
    # The price file is read first, then the news file.
    sources = (price_file, news_file)
    return tuple(pd.read_csv(source) for source in sources)

def select_time_range(df, start_date, end_date):
    """Return the rows of ``df`` whose ``Date`` lies in ``[start_date, end_date]``.

    The caller's frame is left untouched: the ``Date`` column is parsed into a
    separate Series instead of being overwritten in place, and the result is an
    independent frame, so later column assignments on it neither leak back into
    ``df`` nor trigger ``SettingWithCopyWarning``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Date`` column that ``pd.to_datetime`` can parse.
    start_date, end_date :
        Inclusive bounds, compared directly against the parsed dates
        (e.g. ``'2022-01-01'``).

    Returns
    -------
    pandas.DataFrame
        New frame with the matching rows, the original index labels and column
        order, and ``Date`` converted to datetime.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    dates = pd.to_datetime(df['Date'])
    in_range = (dates >= start_date) & (dates <= end_date)
    # assign() on an existing column keeps its position and returns a new frame.
    return df.loc[in_range].assign(Date=dates[in_range])

# Locations of the pre-processed BTC price and sentiment CSV files.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
price_file = r'C:\Users\Administrator\Desktop\DSP391m_cryptocurrency-price-prediction\data\processed\BTC\BTC_price_dataset.csv'
news_file = r'C:\Users\Administrator\Desktop\DSP391m_cryptocurrency-price-prediction\data\processed\BTC_news\BTC_sentiment_dataset.csv'

# Inclusive date window applied to both sources.
start_date, end_date = '2019-12-01', '2024-07-01'

price_df, news_df = load_data(price_file, news_file)

# Filter the price frame first, then the news frame, with the same window.
price_df_filtered, news_df_filtered = (
    select_time_range(frame, start_date, end_date)
    for frame in (price_df, news_df)
)



In [2]:
print("Price data:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
price_df_filtered.info()

Price data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       945 non-null    datetime64[ns]
 1   Open       945 non-null    float64       
 2   High       945 non-null    float64       
 3   Low        945 non-null    float64       
 4   Close      945 non-null    float64       
 5   Change     945 non-null    float64       
 6   Amplitude  945 non-null    float64       
 7   MA(7)      945 non-null    float64       
 8   MA(25)     945 non-null    float64       
 9   MA(99)     945 non-null    float64       
 10  Vol(USDT)  945 non-null    float64       
 11  RSI        945 non-null    float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 88.7 KB
None


In [3]:
print("\nNews data:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
news_df_filtered.info()


News data:
<class 'pandas.core.frame.DataFrame'>
Index: 895 entries, 0 to 894
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Date                           895 non-null    datetime64[ns]
 1   Ensemble_Sentiment             895 non-null    float64       
 2   Ensemble_Sentiment_Normalized  895 non-null    float64       
 3   Sentiment_7day_MA              895 non-null    float64       
 4   Sentiment_Change               895 non-null    float64       
 5   Sentiment_Volatility           895 non-null    float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 48.9 KB
None


In [4]:
# 2. Merge datasets
# The outer join on Date keeps every date present in either source; columns
# from the source that lacks a date are NaN on that row. Rows are then ordered
# chronologically.
merged_df = (
    price_df_filtered
    .merge(news_df_filtered, on='Date', how='outer')
    .sort_values('Date')
)
print("Merged data shape:", merged_df.shape)
merged_df.head()

Merged data shape: (961, 17)


Unnamed: 0,Date,Open,High,Low,Close,Change,Amplitude,MA(7),MA(25),MA(99),Vol(USDT),RSI,Ensemble_Sentiment,Ensemble_Sentiment_Normalized,Sentiment_7day_MA,Sentiment_Change,Sentiment_Volatility
0,2021-11-12,,,,,,,,,,,,0.117124,0.620403,0.039219,0.195836,0.117536
1,2021-11-14,,,,,,,,,,,,0.321889,1.0,0.082579,0.204765,0.157689
2,2021-11-15,,,,,,,,,,,,0.222948,0.816581,0.114429,-0.098941,0.160716
3,2021-11-16,,,,,,,,,,,,0.132033,0.648042,0.148298,-0.090915,0.128508
4,2021-11-17,,,,,,,,,,,,0.119481,0.624772,0.132074,-0.012552,0.12308


In [5]:
# Per-column NaN counts of the merged frame, before any gap filling.
print("\nMissing values before handling:", merged_df.isna().sum(), sep="\n")


Missing values before handling:
Date                              0
Open                             16
High                             16
Low                              16
Close                            16
Change                           16
Amplitude                        16
MA(7)                            16
MA(25)                           16
MA(99)                           16
Vol(USDT)                        16
RSI                              16
Ensemble_Sentiment               64
Ensemble_Sentiment_Normalized    64
Sentiment_7day_MA                64
Sentiment_Change                 64
Sentiment_Volatility             64
dtype: int64


In [6]:
# Fill gaps by time-weighted interpolation. method='time' works on the index,
# so Date is temporarily moved there and restored as a column afterwards.
# The whole step is a single chained expression: merged_df is only rebound
# once every stage has succeeded, so a failure cannot leave the frame without
# its Date column (which would make a re-run of this cell fail with KeyError).
# NOTE(review): interpolation fills forward by default, so NaNs that precede a
# column's first valid value are not filled here; check the counts printed below.
merged_df = (
    merged_df
    .set_index('Date')
    .interpolate(method='time')
    .reset_index()
)

print("\nMissing values after handling:")
print(merged_df.isnull().sum())


Missing values after handling:
Date                              0
Open                             15
High                             15
Low                              15
Close                            15
Change                           15
Amplitude                        15
MA(7)                            15
MA(25)                           15
MA(99)                           15
Vol(USDT)                        15
RSI                              15
Ensemble_Sentiment                0
Ensemble_Sentiment_Normalized     0
Sentiment_7day_MA                 0
Sentiment_Change                  0
Sentiment_Volatility              0
dtype: int64


In [7]:
# Feature engineering
base_columns = ['Close', 'Ensemble_Sentiment']
periods = [7, 14]

# Lag features: the value of each base column `n` rows earlier.
# NOTE(review): shift() counts rows, not calendar days; confirm the Date
# sequence has no gaps if these are meant to be 7/14-day lags.
for name in base_columns:
    source = merged_df[name]
    for n in periods:
        merged_df[f'{name}_lag_{n}'] = source.shift(n)

# Rolling-mean features: mean over the trailing `n` rows, current row included.
for name in base_columns:
    for n in periods:
        merged_df[f'{name}_rolling_{n}'] = merged_df[name].rolling(window=n).mean()


In [14]:
# Restrict the engineered frame to the modelling window.
# (A bare `merged_df.head()` that used to open this cell was removed: it was
# not the last expression of the cell, so its result was silently discarded.)
# NOTE(review): presumably the later start date is what drops the leading rows
# where price columns and lag/rolling features are still NaN; the counts
# printed below should all be zero.
new_start_date = '2022-01-01'
new_end_date = '2024-07-11'

dataset = select_time_range(merged_df, new_start_date, new_end_date)

print("\nMissing values:")
print(dataset.isnull().sum())


Missing values:
Date                             0
Open                             0
High                             0
Low                              0
Close                            0
Change                           0
Amplitude                        0
MA(7)                            0
MA(25)                           0
MA(99)                           0
Vol(USDT)                        0
RSI                              0
Ensemble_Sentiment               0
Ensemble_Sentiment_Normalized    0
Sentiment_7day_MA                0
Sentiment_Change                 0
Sentiment_Volatility             0
Close_lag_7                      0
Close_lag_14                     0
Ensemble_Sentiment_lag_7         0
Ensemble_Sentiment_lag_14        0
Close_rolling_7                  0
Close_rolling_14                 0
Ensemble_Sentiment_rolling_7     0
Ensemble_Sentiment_rolling_14    0
dtype: int64


In [15]:
# Preview the first five rows of the windowed dataset (before clipping/scaling).
dataset.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Change,Amplitude,MA(7),MA(25),MA(99),...,Sentiment_Change,Sentiment_Volatility,Close_lag_7,Close_lag_14,Ensemble_Sentiment_lag_7,Ensemble_Sentiment_lag_14,Close_rolling_7,Close_rolling_14,Ensemble_Sentiment_rolling_7,Ensemble_Sentiment_rolling_14
46,2022-01-01,47421.77,48269.13,46166.0,46751.555,-0.01415,0.04445,48019.95,48345.145,54994.24,...,0.115827,0.153186,50399.66,46834.48,-0.217538,-0.066035,47939.240714,48479.041786,-0.0085,0.022521
47,2022-01-02,47722.66,47990.0,46654.0,47286.18,-0.0091,0.028,47579.5,48224.12,55042.36,...,0.071262,0.145427,50775.49,46681.23,-0.044782,-0.013734,47440.767857,48522.2525,0.012941,0.031024
48,2022-01-03,47286.18,47570.0,45696.0,46446.1,-0.0178,0.0396,46971.59,48180.14,55075.54,...,0.026697,0.137668,50701.44,46914.16,0.127975,0.038567,46832.862143,48488.819643,0.01161,0.036745
49,2022-01-04,46446.1,47557.54,45500.0,45832.01,-0.0132,0.0443,46727.06,48127.8,55112.76,...,-0.159174,0.129053,47543.74,48889.88,-0.217538,0.0,46588.329286,48270.400357,0.036899,0.033851
50,2022-01-05,45832.01,47070.0,42500.0,43451.13,-0.0519,0.0997,46296.55,47890.24,55137.25,...,0.277789,0.120928,46464.66,48588.16,-0.115851,0.250325,46157.825,47903.469643,0.087345,0.032918


In [16]:
# 5. Handle outliers
def remove_outliers(df, columns, n_std=3):
    """Clip extreme values of the given columns to ``mean ± n_std * std``.

    Despite the name, no rows are removed: values outside the band are replaced
    by the nearest bound. Mean and standard deviation are computed
    per column from the data passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of column labels
        Numeric columns to clip.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns clipped.
    """
    df_copy = df.copy()
    for col in columns:
        values = df_copy[col]
        center = values.mean()
        spread = values.std()
        # Replace the whole column instead of writing through .loc[:, col]:
        # .loc sets values into the existing dtype, so float bounds written
        # into an integer column are a deprecated silent upcast in recent
        # pandas. Whole-column assignment simply adopts the clipped dtype.
        df_copy[col] = values.clip(center - n_std * spread, center + n_std * spread)
    return df_copy


# Every numeric column takes part in outlier clipping and scaling.
numeric_columns = dataset.select_dtypes(include=[np.number]).columns
dataset = remove_outliers(dataset, numeric_columns)

# 6. Normalize data
# NOTE(review): the scaler is fitted on the full dataset; if a train/test split
# follows, fitting on the training part only would avoid leaking test statistics.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset[numeric_columns])
dataset.loc[:, numeric_columns] = scaled_values

In [17]:
# Persist the processed dataset. The confirmation message is built from the
# same path that is written, so it cannot drift from the real file name
# (it previously announced 'processed_data.csv' while writing
# 'processed_dataset.csv').
output_file = r'C:\Users\Administrator\Desktop\DSP391m_cryptocurrency-price-prediction\data\dataset\processed_dataset.csv'
dataset.to_csv(output_file, index=False)
print(f"\nProcessed data saved to '{output_file}'")
dataset.head()


Processed data saved to 'processed_data.csv'


Unnamed: 0,Date,Open,High,Low,Close,Change,Amplitude,MA(7),MA(25),MA(99),...,Sentiment_Change,Sentiment_Volatility,Close_lag_7,Close_lag_14,Ensemble_Sentiment_lag_7,Ensemble_Sentiment_lag_14,Close_rolling_7,Close_rolling_14,Ensemble_Sentiment_rolling_7,Ensemble_Sentiment_rolling_14
46,2022-01-01,0.552276,0.556092,0.549436,0.540577,0.413396,0.347672,0.582894,0.606334,0.759323,...,0.649354,0.731691,0.604254,0.542024,0.0,0.280859,0.581409,0.607399,0.283334,0.361037
47,2022-01-02,0.557528,0.551234,0.558172,0.549909,0.442729,0.208349,0.574787,0.60402,0.760308,...,0.590912,0.68891,0.610814,0.53935,0.320259,0.377816,0.572233,0.608219,0.3353,0.38393
48,2022-01-03,0.549909,0.543925,0.541021,0.535245,0.392194,0.306595,0.563597,0.603179,0.760987,...,0.532469,0.646129,0.609521,0.543415,0.640518,0.474773,0.561044,0.607585,0.332074,0.399332
49,2022-01-04,0.535246,0.543708,0.537512,0.524527,0.418914,0.346402,0.559096,0.602179,0.761748,...,0.288722,0.598624,0.554404,0.577901,0.0,0.403276,0.556542,0.603443,0.393364,0.39154
50,2022-01-05,0.524527,0.535223,0.483804,0.482969,0.19412,0.815611,0.551172,0.597637,0.762249,...,0.861749,0.553825,0.535569,0.572634,0.18851,0.867333,0.548618,0.596484,0.515626,0.38903
