### Bitcoin Prediction
Using sentiment analysis and price data

In [1]:
import mwclient
import time

In [2]:
site = mwclient.Site("en.wikipedia.org")
page = site.pages["Bitcoin"] 

In [3]:
revs = list(page.revisions()) # Takes about a couple mins

In [4]:
revs[0]

OrderedDict([('revid', 1197117025),
             ('parentid', 1195811696),
             ('user', 'Vgbyp'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=1, tm_mday=19, tm_hour=9, tm_min=46, tm_sec=32, tm_wday=4, tm_yday=19, tm_isdst=-1)),
             ('comment', '/* 2015–2019 */ futures')])

In [5]:
# Sorting the revisions / edits in reverse order 
revs = sorted(revs, key=lambda rev: rev["timestamp"])

In [6]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [7]:
from transformers import pipeline
from torch import *
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    senti = sentiment_pipeline([text[:250]])[0]
    score = senti["score"] # 0 - 1
    if senti["label"] == "NEGATIVE":
        score *= -1
    return score

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
  if rate and total else datetime.utcfromtimestamp(0))
model.safetensors: 100%|██████████| 268M/268M [00:13<00:00, 20.0MB/s] 
  if rate and total else datetime.utcfromtimestamp(0))
tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 558kB/s]
  if rate and total else datetime.utcfromtimestamp(0))
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.52MB/s]


In [14]:
find_sentiment("i love you")

0.9998656511306763

In [35]:
edits = {}

# Processing revisions for sentiments
for rev in revs:
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)

    edits[date]["edit_count"] += 1

    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))
    

Converting the list of sentiments to single values below, so its easier to be used in a dataframe

In [36]:
from statistics import mean 

for key in edits:
    if len(edits[key]["sentiments"]) > 0: # if there were sentiments extracted from comments
        edits[key]["sentiment"] = mean(edits[key]["sentiments"]) # mean sentiment for that edit
        # percent of sentiment that was negative for that date / edit
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"]) 
    else: 
        # if we couldnt find sentiment for that date
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0 

    del edits[key]["sentiments"]

Converting Sentiment data into a Dataframe

In [40]:
import pandas as pd 

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [41]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227501,0.500000
...,...,...,...
2024-01-10,1,0.876925,0.000000
2024-01-11,1,0.924098,0.000000
2024-01-12,3,0.315349,0.333333
2024-01-15,2,0.080186,0.500000


In [42]:
edits_df.index = pd.to_datetime(edits_df.index)

Since BTC trades everyday, our price data will have data for every day. We need to make the above sentiment data also 
fill in for everyday, so we can merge the two sets of data together for our analysis

In [43]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [44]:
edits_df = edits_df.reindex(dates, fill_value=0) # filling in the missing values with 0 

In [45]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-01-27,0,0.000000,0.00
2024-01-28,0,0.000000,0.00
2024-01-29,0,0.000000,0.00
2024-01-30,0,0.000000,0.00


In [46]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [47]:
rolling_edits = rolling_edits.dropna()

In [48]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2024-01-27,0.433333,0.015578,0.127778
2024-01-28,0.433333,0.015578,0.127778
2024-01-29,0.433333,0.015578,0.127778
2024-01-30,0.433333,0.015578,0.127778


In [50]:
rolling_edits.to_csv("wikipedia_edits.csv") # saving to csv to use later