# Pre-processing

In [None]:
import pandas as pd

# Raw tweet dump (a large CSV, about 2.1 GB).
file_path = 'bitcoin_tweets.csv'

# Only the timestamp and the tweet body are used downstream, so restrict the
# read to these two columns instead of loading the whole file.
columns = ['date', 'text']

# NOTE(review): engine="python" is slower than the default C parser; presumably
# it was chosen to cope with rows the C parser rejects -- confirm before changing.
df = pd.read_csv(file_path, usecols=columns, engine="python")
df.tail(n=10)

Convert the `date` column to the datetime data type so that the tweets can be grouped by *date*. Rows whose date cannot be parsed are dropped.

In [None]:
# Parse the timestamps once (unparseable values become NaT) and drop those rows.
# Building a new frame with assign/dropna avoids parsing the large column twice
# and avoids assigning into a filtered slice (SettingWithCopyWarning).
df = (
    df.assign(date=pd.to_datetime(df['date'], errors='coerce'))
      .dropna(subset=['date'])
)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# The stop-word corpus has to be downloaded before it can be read.
nltk.download('stopwords')

# Collect the English stop words into a set.
stop_words = set()
stop_words.update(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re

def remove_hashtags(text):
    """Return ``text`` with every hashtag deleted.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    The whole token (sign and word) is removed; surrounding whitespace is kept.
    """
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_urls(text):
    """Return ``text`` with every URL-like token deleted.

    A token is treated as a URL when it starts with ``http`` (which also
    covers ``https``) or ``www``; the match extends to the next whitespace.
    This is the single definition of the helper: a second, later definition
    used to shadow this one with an equivalent pattern.
    """
    return re.sub(r'http\S+|www\S+', '', text)


def remove_mentions(text):
    """Return ``text`` with every ``@name`` mention deleted.

    A mention is an ``@`` immediately followed by one or more word characters.
    """
    return re.sub(r'@\w+', '', text)


def remove_special_characters(text):
    """Return ``text`` keeping only ASCII letters and whitespace.

    Everything else is deleted without a replacement character: punctuation,
    emoji, and also digits. Newlines count as whitespace and are kept.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text)


Call the functions to pre-process the tweet text column and store the pre-processed tweets in a separate column.
Processing steps applied:

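* Remove hashtags
* Remove URLs
* Remove mentions
* Remove special characters (everything except ASCII letters and whitespace, so digits are removed too)
* Replace the '\n' in the tweets with a space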

Using the functions created above, pre-process each dataset and store the pre-processed data in a separate column labeled "preprocessed_tweets".

In [None]:
# Clean every tweet in a single pass. The cleaners are nested so that they run
# in the order hashtags -> URLs -> mentions -> special characters; the
# special-character step must come last because it would otherwise strip the
# '#' and '@' signs the earlier patterns rely on.
df['preprocessed_tweets'] = df['text'].apply(
    lambda tweet: remove_special_characters(
        remove_mentions(remove_urls(remove_hashtags(tweet)))
    )
).str.replace('\n', ' ', regex=False)  # newlines survive the cleaners, so turn them into spaces

print(df.head(n=10))

                 date                                               text  \
0 2021-02-10 23:59:04  Blue Ridge Bank shares halted by NYSE after #b...   
1 2021-02-10 23:58:48  😎 Today, that's this #Thursday, we will do a "...   
2 2021-02-10 23:54:48  Guys evening, I have read this article about B...   
3 2021-02-10 23:54:33  $BTC A big chance in a billion! Price: \487264...   
4 2021-02-10 23:54:06  This network is secured by 9 508 nodes as of t...   
5 2021-02-10 23:53:30  💹 Trade #Crypto on #Binance \n\n📌 Enjoy #Cashb...   
6 2021-02-10 23:53:17  &lt;'fire' &amp; 'man'&gt;\n#Bitcoin #Crypto #...   
7 2021-02-10 23:52:42  🔄 Prices update in $EUR (1 hour):\n\n$BTC   - ...   
8 2021-02-10 23:52:25  #BTC #Bitcoin #Ethereum #ETH #Crypto #cryptotr...   
9 2021-02-10 23:52:08  .@Tesla’s #bitcoin investment is revolutionary...   

                                 preprocessed_tweets  
0  Blue Ridge Bank shares halted by NYSE after  A...  
1   Today thats this  we will do a  Take  with ou... 

# Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon has to be downloaded before the analyzer is created.
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Score every cleaned tweet and keep only the 'compound' entry of the
# dictionary returned by polarity_scores.
df['sentiment_scores'] = df['preprocessed_tweets'].map(
    lambda tweet: sid.polarity_scores(tweet)['compound']
)

print(df.head(n=10))

In [None]:
# Reduce each tweet timestamp to a 'YYYY-MM-DD' string, then average the
# sentiment scores of all tweets that share the same day.
# The column is passed through pd.to_datetime first so the cell can be re-run:
# after the first run 'date' already holds the formatted strings, and using
# `.dt` on them directly would raise an AttributeError.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
daily_sentiment = df.groupby('date')['sentiment_scores'].mean().reset_index()
print(daily_sentiment.head(10))

In [None]:
# Read the Bitcoin price CSV and preview its first ten rows.
btc_data = pd.read_csv(filepath_or_buffer='bitcoin_price.csv')
print(btc_data.head(n=10))

       Open Time      Open      High       Low     Close     Volume  \
0  1609459200000  28923.63  28961.66  28913.12  28961.66  27.457032   
1  1609459260000  28961.67  29017.50  28961.01  29009.91  58.477501   
2  1609459320000  29009.54  29016.71  28973.58  28989.30  42.470329   
3  1609459380000  28989.68  28999.85  28972.33  28982.69  30.360677   
4  1609459440000  28982.67  28995.93  28971.80  28975.65  24.124339   
5  1609459500000  28975.65  28979.53  28933.16  28937.11  22.396014   
6  1609459560000  28937.11  28963.25  28937.10  28943.87  20.480294   
7  1609459620000  28943.88  28954.48  28930.00  28934.84  20.962343   
8  1609459680000  28934.84  28936.15  28889.24  28900.00  52.645478   
9  1609459740000  28900.00  28920.06  28846.28  28858.94  98.083975   

      Close Time  Quote asset volume  Number of trades  \
0  1609459259999        7.943820e+05              1292   
1  1609459319999        1.695803e+06              1651   
2  1609459379999        1.231359e+06        

In [None]:
from datetime import datetime as dt
from datetime import timezone


# Convert a millisecond epoch timestamp to a date/time value.
def calculate_time(timestamp):
    """
    Convert a Unix timestamp given in milliseconds to a naive UTC datetime.

    The conversion is pinned to UTC instead of the machine's local time zone,
    so the same input yields the same calendar date on every host. The time
    zone marker is stripped afterwards so callers keep receiving a naive
    datetime, as before.

    :param timestamp: milliseconds elapsed since 1970-01-01 00:00:00 UTC
    :return: naive ``datetime`` holding the corresponding UTC date and time
    """
    utc_moment = dt.fromtimestamp(timestamp / 1000, tz=timezone.utc)
    return utc_moment.replace(tzinfo=None)

In [None]:
# Turn the "Open Time" and "Close Time" columns from millisecond epoch values
# into datetimes. pd.to_datetime(..., unit='ms') converts a whole column in one
# vectorised call and measures from the Unix epoch, so the result does not
# depend on the time zone of the machine running the notebook.
btc_data["Open Time"] = pd.to_datetime(btc_data["Open Time"], unit='ms')
btc_data["Close Time"] = pd.to_datetime(btc_data["Close Time"], unit='ms')
print(btc_data.head(10))

            Open Time      Open      High       Low     Close     Volume  \
0 2021-01-01 00:00:00  28923.63  28961.66  28913.12  28961.66  27.457032   
1 2021-01-01 00:01:00  28961.67  29017.50  28961.01  29009.91  58.477501   
2 2021-01-01 00:02:00  29009.54  29016.71  28973.58  28989.30  42.470329   
3 2021-01-01 00:03:00  28989.68  28999.85  28972.33  28982.69  30.360677   
4 2021-01-01 00:04:00  28982.67  28995.93  28971.80  28975.65  24.124339   
5 2021-01-01 00:05:00  28975.65  28979.53  28933.16  28937.11  22.396014   
6 2021-01-01 00:06:00  28937.11  28963.25  28937.10  28943.87  20.480294   
7 2021-01-01 00:07:00  28943.88  28954.48  28930.00  28934.84  20.962343   
8 2021-01-01 00:08:00  28934.84  28936.15  28889.24  28900.00  52.645478   
9 2021-01-01 00:09:00  28900.00  28920.06  28846.28  28858.94  98.083975   

               Close Time  Quote asset volume  Number of trades  \
0 2021-01-01 00:00:59.999        7.943820e+05              1292   
1 2021-01-01 00:01:59.999    

In [None]:
# Group the rows by the calendar date of "Close Time" and keep the last row of
# each group (the last minute of every day, given the rows' current order).
df_daily_last = btc_data.groupby(by=btc_data['Close Time'].dt.date).tail(n=1)

# Save the result to a new CSV file.
df_daily_last.to_csv(path_or_buf='bitcoin_daily_last_price.csv', index=False)

In [None]:
# Process the bitcoin data: label every row with the 'YYYY-MM-DD' day of its
# close time, then keep only that day key plus the price and volume columns.
btc_data = btc_data.assign(
    date=btc_data['Close Time'].dt.strftime('%Y-%m-%d')
).loc[:, ['date', 'Open', 'High', 'Low', 'Close', 'Volume']]
print(btc_data.head(n=10))

         date      Open      High       Low     Close     Volume
0  01-01-2021  28923.63  28961.66  28913.12  28961.66  27.457032
1  01-01-2021  28961.67  29017.50  28961.01  29009.91  58.477501
2  01-01-2021  29009.54  29016.71  28973.58  28989.30  42.470329
3  01-01-2021  28989.68  28999.85  28972.33  28982.69  30.360677
4  01-01-2021  28982.67  28995.93  28971.80  28975.65  24.124339
5  01-01-2021  28975.65  28979.53  28933.16  28937.11  22.396014
6  01-01-2021  28937.11  28963.25  28937.10  28943.87  20.480294
7  01-01-2021  28943.88  28954.48  28930.00  28934.84  20.962343
8  01-01-2021  28934.84  28936.15  28889.24  28900.00  52.645478
9  01-01-2021  28900.00  28920.06  28846.28  28858.94  98.083975


In [None]:
# Join the price rows with the per-day sentiment on the shared 'date' key.
# An inner join keeps only the dates that appear in both tables.
merged_data = btc_data.merge(daily_sentiment, how='inner', on='date')
print(merged_data.head(n=10))

In [None]:
# Write the merged price/sentiment table to disk without the row index.
merged_data.to_csv(path_or_buf='bitcoin_price_sentiment.csv', index=False)