In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mwclient
import time
from datetime import datetime
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)


In [3]:
data = pd.read_csv("dataset/bitcoin.csv")
data.head()

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.0,134.444,144.539993,0.0,1603769000.0
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.0,139.0,0.0,1542813000.0
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.25,97.75,0.0,1085995000.0


In [4]:
site = mwclient.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [5]:
revs = list(page.revisions())
revs[0]

OrderedDict([('revid', 1214137600),
             ('parentid', 1214133449),
             ('user', 'Anastrophe'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=3, tm_mday=17, tm_hour=4, tm_min=43, tm_sec=4, tm_wday=6, tm_yday=77, tm_isdst=-1)),
             ('comment',
              "/* Mining */ really poorly crafted sentence. clarifying. maybe i'm overstating it - making it less ambiguous.")])

In [6]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [8]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [11]:
edits = {}

for rev in revs:
    print(rev)
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)

    edits[date]["edit_count"] += 1

    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

OrderedDict([('revid', 275832581), ('parentid', 0), ('user', 'Pratyeka'), ('timestamp', time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)), ('comment', 'creation (stub)')])
OrderedDict([('revid', 275832690), ('parentid', 275832581), ('user', 'Pratyeka'), ('timestamp', time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=44, tm_wday=6, tm_yday=67, tm_isdst=-1)), ('comment', '')])
OrderedDict([('revid', 275849499), ('parentid', 275832690), ('user', 'PamD'), ('timestamp', time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=18, tm_min=12, tm_sec=46, tm_wday=6, tm_yday=67, tm_isdst=-1)), ('comment', 'Stub-sorting. [[Wikipedia:WikiProject Stub sorting|You can help!]]')])
OrderedDict([('revid', 275850009), ('parentid', 275849499), ('user', 'PamD'), ('timestamp', time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=18, tm_min=15, tm_sec=35, tm_wday=6, tm_yday=67, tm_isdst=-1)), ('

In [12]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0

    del edits[key]["sentiments"]

In [13]:
edits_df = pd.DataFrame.from_dict(edits, orient="index")
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227499,0.500000
...,...,...,...
2024-02-25,2,-0.005358,0.500000
2024-02-26,1,-0.996016,1.000000
2024-02-27,3,-0.321697,0.666667
2024-03-06,1,-0.961399,1.000000


In [14]:
edits_df.index = pd.to_datetime(edits_df.index)

In [15]:
dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [16]:
edits_df = edits_df.reindex(dates, fill_value=0)
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-03-20,0,0.000000,0.00
2024-03-21,0,0.000000,0.00
2024-03-22,0,0.000000,0.00
2024-03-23,0,0.000000,0.00


In [17]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()
rolling_edits = rolling_edits.dropna()
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2024-03-20,1.100000,-0.115911,0.201587
2024-03-21,1.033333,-0.112603,0.184921
2024-03-22,0.933333,-0.098838,0.162698
2024-03-23,0.600000,-0.096964,0.146032


In [18]:
rolling_edits.to_csv("dataset/wikipedia_processed.csv")