# Initial Data

In [2]:
import pandas as pd

In [3]:
# Locations of the input CSV files, relative to the working directory.
bitcoin_path = 'datasets/bitcoin_price.csv'   # price rows
bert_path = 'datasets/bert_prediction.csv'    # BERT labels
vader_path = 'datasets/vader_prediction.csv'  # VADER labels

In [4]:
# Read the three CSVs (VADER, BERT, price - in that order) with pandas'
# default parsing options.
df_vader, df_bert, df_price = (
    pd.read_csv(csv_path) for csv_path in (vader_path, bert_path, bitcoin_path)
)

In [5]:
# Preview the first two rows of each input frame.
for frame in (df_vader, df_bert, df_price):
    display(frame.head(2))

Unnamed: 0.1,Unnamed: 0,date,content_9,Polarity Scores,Score,prediction_using_ps,prediction_using_sc
0,0,2021-06-29 23:59:42+00:00,propose use million bitcoin measure entire ag...,"{'neg': 0.0, 'neu': 0.751, 'pos': 0.249, 'comp...",0.6486,neutral,positive
1,1,2021-06-29 23:59:08+00:00,bitcoin death cross perfect die naturally beco...,"{'neg': 0.422, 'neu': 0.378, 'pos': 0.2, 'comp...",-0.6249,negative,negative


Unnamed: 0.1,Unnamed: 0,sentence,prediction
0,0,propose use million bitcoin measure entire ag...,neutral
1,1,bitcoin death cross perfect die naturally beco...,positive


Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,2021-06-01,37293.792969,37896.734375,35787.085938,36684.925781,36684.925781,34639423297
1,1,2021-06-02,36699.921875,38231.339844,35966.308594,37575.179688,37575.179688,33070867190


In [6]:
# Share of rows carrying each label in VADER's 'prediction_using_ps' column.
print('VADER using Polarity Scores')
ps_labels = df_vader['prediction_using_ps']
for label in ('positive', 'negative', 'neutral'):
    # Count matching rows via a boolean mask, then divide by the total row count.
    share = int((ps_labels == label).sum()) / len(df_vader)
    print('Percentage of '+label+':\t\t', round(share * 100, 2), '%')

VADER using Polarity Scores
Percentage of positive:		 4.28 %
Percentage of negative:		 1.82 %
Percentage of neutral:		 93.9 %


In [7]:
# Share of rows carrying each label in VADER's 'prediction_using_sc' column.
print('VADER using Score')
sc_labels = df_vader['prediction_using_sc']
for label in ('positive', 'negative', 'neutral'):
    # Count matching rows via a boolean mask, then divide by the total row count.
    share = int((sc_labels == label).sum()) / len(df_vader)
    print('Percentage of '+label+':\t\t', round(share * 100, 2), '%')

VADER using Score
Percentage of positive:		 44.15 %
Percentage of negative:		 20.51 %
Percentage of neutral:		 35.34 %


In [8]:
# Share of rows carrying each label in BERT's 'prediction' column.
print('BERT')
bert_labels = df_bert['prediction']
for label in ('positive', 'negative', 'neutral'):
    # Count matching rows via a boolean mask, then divide by the total row count.
    share = int((bert_labels == label).sum()) / len(df_bert)
    print('Percentage of '+label+':\t\t', round(share * 100, 2), '%')

BERT
Percentage of positive:		 15.73 %
Percentage of negative:		 8.1 %
Percentage of neutral:		 76.17 %


In [9]:
# Start from VADER's 'date' column and attach each model's label column.
# The added columns are aligned to the 'date' frame by row index.
df_combined = df_vader['date'].to_frame().assign(**{
    'Vader PS': df_vader['prediction_using_ps'],
    'Vader SC': df_vader['prediction_using_sc'],
    'Bert': df_bert['prediction'],
})

df_combined.head(3)

Unnamed: 0,date,Vader PS,Vader SC,Bert
0,2021-06-29 23:59:42+00:00,neutral,positive,neutral
1,2021-06-29 23:59:08+00:00,negative,negative,positive
2,2021-06-29 23:59:00+00:00,neutral,negative,neutral


In [10]:
def checkPrediction(pred):
    """Translate a sentiment label into a signed integer score.

    'neutral' maps to 0, 'positive' to 1 and 'negative' to -1.
    Any other value matches nothing and yields None.
    """
    label_scores = (('neutral', 0), ('positive', 1), ('negative', -1))
    for label, score in label_scores:
        if pred == label:
            return score
    return None

In [11]:
# Replace the text labels in every model column with their numeric scores.
for model_col in ('Vader PS', 'Vader SC', 'Bert'):
    df_combined[model_col] = df_combined[model_col].apply(checkPrediction)

In [12]:
# Show the first three rows of df_combined.
df_combined.head(3)

Unnamed: 0,date,Vader PS,Vader SC,Bert
0,2021-06-29 23:59:42+00:00,0,1,0
1,2021-06-29 23:59:08+00:00,-1,-1,1
2,2021-06-29 23:59:00+00:00,0,-1,0


In [13]:
import datetime

def convertDate(date):
    """Reduce a timestamp string to its calendar date.

    Args:
        date: Timestamp text laid out as date, 24-hour time and UTC offset,
            e.g. '2021-06-29 23:59:42+00:00'.

    Returns:
        The date part formatted as 'YYYY-MM-DD'. No timezone conversion is
        applied; the date is the one written in the input.

    Raises:
        ValueError: If ``date`` does not match the expected layout.
    """
    # Spell the time out as %H:%M:%S instead of %X: %X means "the locale's
    # time representation", so parsing would depend on the active locale.
    parsed = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S%z')
    return parsed.strftime('%Y-%m-%d')

In [14]:
# Collapse every timestamp in the 'date' column to its 'YYYY-MM-DD' day string.
df_combined['date'] = df_combined['date'].map(convertDate)

In [15]:
# Count distinct day strings (dropna=False keeps a missing value as its own entry).
n_unique_dates = df_combined['date'].nunique(dropna=False)
print('Number of unique dates:\t', n_unique_dates)

Number of unique dates:	 61


In [16]:
# Average the numeric scores of all rows that share a date; the result is
# indexed by 'date' with one row per day.
rows_by_day = df_combined.groupby('date')
df_combined = rows_by_day.mean()

In [17]:
# Show the first ten rows of the per-date means.
df_combined.head(10)

Unnamed: 0_level_0,Vader PS,Vader SC,Bert
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-06-01,0.029919,0.341608,0.096827
2021-06-02,0.030013,0.339552,0.099448
2021-06-03,0.031826,0.339058,0.088073
2021-06-04,0.009708,0.163622,0.018445
2021-06-05,0.025658,0.258201,0.043088
2021-06-06,0.023815,0.275093,0.063081
2021-06-07,0.011892,0.203413,0.045915
2021-06-08,0.012775,0.170574,0.031865
2021-06-09,0.030829,0.357589,0.090449
2021-06-10,0.026891,0.238411,0.048565


In [18]:
# For each column, print the fraction of rows whose value is strictly positive.
n_rows = len(df_combined)
for model_col in df_combined.columns:
    positive_rows = int((df_combined[model_col] > 0).sum())
    print(positive_rows / n_rows)

1.0
1.0
1.0


In [19]:
# Show the first rows of df_combined (head() defaults to five rows).
df_combined.head()

Unnamed: 0_level_0,Vader PS,Vader SC,Bert
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-06-01,0.029919,0.341608,0.096827
2021-06-02,0.030013,0.339552,0.099448
2021-06-03,0.031826,0.339058,0.088073
2021-06-04,0.009708,0.163622,0.018445
2021-06-05,0.025658,0.258201,0.043088


In [20]:
# Number of rows in df_combined.
len(df_combined)

61

In [21]:
# Select price rows by matching 'Date' against the sentiment dates instead of
# taking the first len(df_combined) rows. A price file that starts on a
# different day or has extra rows can then no longer be paired with the
# wrong dates.
sentiment_dates = df_combined.index
df_price_sub = (
    df_price.loc[df_price['Date'].isin(sentiment_dates), ['Date', 'Close']]
    .sort_values('Date')  # ascending date order
)
# Fail loudly if a sentiment date has no price row or appears more than once.
assert len(df_price_sub) == len(df_combined), (
    'price data must contain exactly one row per sentiment date'
)

In [22]:
# Replace the price frame's index with df_combined's index. Passing an Index
# object to set_index assigns labels by row position (no matching on the
# 'Date' values), so both frames must have the same length and row order.
df_price_sub = df_price_sub.set_index(df_combined.index)

In [23]:
# Add the closing price as a 'Price' column; values are aligned on the index.
df_combined = df_combined.assign(Price=df_price_sub['Close'])

In [26]:
# Display the combined frame.
df_combined

Unnamed: 0_level_0,Vader PS,Vader SC,Bert,Price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-06-01,0.029919,0.341608,0.096827,36684.925781
2021-06-02,0.030013,0.339552,0.099448,37575.179688
2021-06-03,0.031826,0.339058,0.088073,39208.765625
2021-06-04,0.009708,0.163622,0.018445,36894.406250
2021-06-05,0.025658,0.258201,0.043088,35551.957031
...,...,...,...,...
2021-07-27,0.027904,0.253635,0.082685,39406.941406
2021-07-28,0.025230,0.245964,0.099008,39995.906250
2021-07-29,0.026324,0.269426,0.112591,40008.421875
2021-07-30,0.023118,0.236569,0.102325,42235.546875


In [1]:
import matplotlib.pyplot as plt

# Requires df_combined from the cells above; run them first on a fresh kernel.
# Draw one histogram per model on its own axes so the three distributions no
# longer overlap on a single unlabeled plot.
model_cols = ['Bert', 'Vader PS', 'Vader SC']
fig, axes = plt.subplots(1, len(model_cols), figsize=(15, 4))
for ax, model_col in zip(axes, model_cols):
    df_combined[model_col].hist(ax=ax)
    ax.set(title=model_col, xlabel='Mean daily score', ylabel='Number of days')
fig.tight_layout()
plt.show()

NameError: name 'df_combined' is not defined