# Pre-processing

In [1]:
import pandas as pd

In [None]:
# Source file: the full Bitcoin tweets dump (noted by the author as ~2.1 GB).
file_path = 'bitcoin_tweets.csv'

# Only the posting time and the tweet body are used by the later cells, so
# load just those two columns instead of the whole file.
columns = ['date', 'text']
df = pd.read_csv(file_path, usecols=columns, engine="python")

# Show the last rows as a quick sanity check of the load.
df.tail(10)

Convert the `date` column to the datetime data type, dropping rows whose date cannot be parsed, so that the tweets can later be grouped by *date*.

In [None]:
# Parse the `date` column exactly once; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')

# Store the parsed values and keep only the rows whose date is valid.
# Re-using the already parsed values (instead of parsing the surviving strings
# a second time without `errors='coerce'`) guarantees the kept rows are exactly
# the ones that parsed. `.copy()` makes `df` an independent frame, so columns
# added in later cells are not written onto a slice (SettingWithCopyWarning).
df = df.assign(date=parsed_dates).loc[parsed_dates.notna()].copy()

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words, held in a set.
# NOTE(review): `stop_words` is not referenced by any cell visible here -
# confirm whether stop-word removal was meant to be part of the pipeline.
stop_words = {*stopwords.words('english')}

In [None]:
import re

def remove_hashtags(text):
    """Return ``text`` with every ``#`` plus its following word characters removed."""
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_urls(text):
    """
    Return ``text`` with URLs removed.

    A URL is any run of non-whitespace characters starting with ``http`` or
    ``www``. ``https`` links are already covered by the ``http`` alternative,
    so no separate alternative is needed. This is the single definition of
    the function; the cell previously defined it twice, with the second
    definition silently shadowing the first.

    :param text: string to clean
    :return: the string without URL tokens
    """
    return re.sub(r'http\S+|www\S+', '', text)


def remove_mentions(text):
    """
    Return ``text`` with every ``@`` plus its following word characters removed.

    :param text: string to clean
    :return: the string without mention tokens
    """
    return re.sub(r'@\w+', '', text)


def remove_special_characters(text):
    """
    Return ``text`` keeping only ASCII letters and whitespace.

    Digits, punctuation and non-ASCII characters are dropped. Whitespace,
    including newlines, is kept.

    :param text: string to clean
    :return: the string restricted to ``a-z``, ``A-Z`` and whitespace
    """
    return re.sub(r'[^a-zA-Z\s]', '', text)


Call the functions to pre-process the tweets column and store the pre-processed tweets in a separate column.
Processing done:

* Remove Hashtags
* Remove URLs
* Remove Mentions
* Remove Special Characters
* Remove the '\n' in the tweets

Using the functions created above, pre-process the dataset and store the pre-processed data in a separate column labeled "preprocessed_tweets".

In [None]:
# A tweet with a missing body is loaded as NaN (a float), which `re.sub`
# rejects with a TypeError. Treat such tweets as empty text so one bad row
# cannot abort the whole pipeline.
raw_tweets = df['text'].fillna('')

# Cleaning order: hashtags -> URLs -> mentions -> everything that is not an
# ASCII letter or whitespace -> newlines turned into spaces. The newline
# replacement comes last because the special-character filter keeps whitespace.
df['preprocessed_tweets'] = (raw_tweets
                             .apply(remove_hashtags)
                             .apply(remove_urls)
                             .apply(remove_mentions)
                             .apply(remove_special_characters)
                             .str.replace('\n', ' ', regex=False))

print(df.head(10))

# Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the VADER lexicon used by the analyzer is available locally.
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Score every cleaned tweet and keep only the 'compound' entry of the result.
df['sentiment_scores'] = df['preprocessed_tweets'].map(
    lambda tweet: sid.polarity_scores(tweet)['compound']
)

print(df.head(10))

In [None]:
# Reduce each tweet's timestamp to its calendar day ('YYYY-MM-DD') and average
# the sentiment scores of all tweets posted on that day.
# Passing the column through `pd.to_datetime` first keeps this cell re-runnable:
# after the first run `date` already holds strings, on which `.dt` would raise.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
daily_sentiment = df.groupby('date')['sentiment_scores'].mean().reset_index()
print(daily_sentiment.head(10))

In [None]:
# Load the BTC/USD price history (one-minute bars, judging by the file name).
btc_data = pd.read_csv(filepath_or_buffer='btcusd_1-min_data.csv')
print(btc_data.head(10))

In [None]:
from datetime import datetime as dt

# Helper that converts a Unix epoch timestamp into a datetime
def calculate_time(timestamp, unit='ms'):
    """
    Convert a Unix epoch timestamp into a naive ``datetime``.

    The value is passed to ``datetime.fromtimestamp``, so the result is
    expressed in the local time zone of the machine running the notebook.

    :param timestamp: epoch timestamp (int or float)
    :param unit: resolution of ``timestamp``: ``'ms'`` for milliseconds (the
        default, matching the original behaviour) or ``'s'`` for seconds.
        The notebook parses the BTC ``Timestamp`` column with ``unit='s'``,
        so pass ``unit='s'`` when converting values from that column.
    :return: date and time corresponding to the given timestamp
    :raises ValueError: if ``unit`` is neither ``'ms'`` nor ``'s'``
    """
    divisors = {'s': 1, 'ms': 1000}
    if unit not in divisors:
        raise ValueError(f"unit must be 'ms' or 's', got {unit!r}")
    return dt.fromtimestamp(timestamp / divisors[unit])

In [None]:
# Derive a datetime column from the raw "Timestamp" column.
# The values are interpreted as Unix epoch seconds; any timestamp that cannot
# be parsed becomes NaT instead of raising.
timestamps_as_datetime = pd.to_datetime(btc_data['Timestamp'], unit='s', errors='coerce')
btc_data['bit_coin_date_time'] = timestamps_as_datetime

# Drop the rows whose timestamp was invalid (NaT).
btc_data = btc_data.dropna(subset=['bit_coin_date_time'])
print(btc_data.head(10))

In [5]:
# For every calendar day keep the single row with the latest timestamp, i.e.
# the last minute of that day.
# `tail(1)` returns the last row of each group in row order, so sort by time
# first; otherwise the result is only right when the CSV happens to be
# chronological. The stable sort keeps file order for equal timestamps.
btc_sorted = btc_data.sort_values('bit_coin_date_time', kind='stable')

# `.copy()` yields an independent frame, so adding columns to it later does not
# write onto a slice of `btc_data` (SettingWithCopyWarning).
df_daily_last = (
    btc_sorted
    .groupby(btc_sorted['bit_coin_date_time'].dt.date)
    .tail(1)
    .copy()
)

# save results to a new CSV file
# df_daily_last.to_csv('bitcoin_daily_last_price.csv', index=False)

In [None]:
# Build the daily price table: add a 'YYYY-MM-DD' string `date` column (the
# merge key) and keep only the price/volume columns.
# `assign` returns a new frame, so `df_daily_last` is left untouched and no
# column is written onto a groupby slice (SettingWithCopyWarning).
# NOTE(review): `df_daily_last` holds one row per day (the last one), so Open,
# High, Low and Volume here are that single row's values, not daily
# aggregates - confirm this is intended.
btc_data = (
    df_daily_last
    .assign(date=df_daily_last['bit_coin_date_time'].dt.strftime('%Y-%m-%d'))
    .loc[:, ['date', 'Open', 'High', 'Low', 'Close', 'Volume']]
)
print(btc_data.head(10))

In [None]:
# Join daily prices with daily mean sentiment on the shared `date` key.
# An inner join keeps only the days present in both tables.
merged_data = btc_data.merge(daily_sentiment, how='inner', on='date')
print(merged_data.head(10))

In [None]:
# Persist the merged price/sentiment table, leaving out the row index.
merged_data.to_csv(path_or_buf='bitcoin_price_sentiment.csv', index=False)