In [1]:
import mwclient
import time

site = mwclient.Site("en.wikipedia.org")
page = site.pages["Bitcoin"]

In [3]:
revs = list(page.revisions())

In [4]:
revs[0]

OrderedDict([('revid', 1186727898),
             ('parentid', 1186727521),
             ('user', '3df'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=11, tm_mday=25, tm_hour=2, tm_min=47, tm_sec=35, tm_wday=5, tm_yday=329, tm_isdst=-1)),
             ('comment', '/* Units and divisibility */ ce')])

In [5]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])

In [6]:
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [9]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [12]:
find_sentiment("I feel neutral about you")

-0.9937905669212341

In [15]:
edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

In [19]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]

In [20]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [21]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227501,0.500000
...,...,...,...
2023-11-21,97,-0.025145,0.484536
2023-11-22,90,0.014850,0.466667
2023-11-23,61,-0.368450,0.672131
2023-11-24,89,-0.122848,0.539326


In [22]:
edits_df.index = pd.to_datetime(edits_df.index)

In [33]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [34]:
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2023-11-16', '2023-11-17', '2023-11-18', '2023-11-19',
               '2023-11-20', '2023-11-21', '2023-11-22', '2023-11-23',
               '2023-11-24', '2023-11-25'],
              dtype='datetime64[ns]', length=5376, freq='D')

In [35]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [36]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-03-09,0,0.000000,0.000000
2009-03-10,0,0.000000,0.000000
2009-03-11,0,0.000000,0.000000
2009-03-12,0,0.000000,0.000000
...,...,...,...
2023-11-21,97,-0.025145,0.484536
2023-11-22,90,0.014850,0.466667
2023-11-23,61,-0.368450,0.672131
2023-11-24,89,-0.122848,0.539326


In [37]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [38]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2023-11-21,8.433333,-0.111240,0.270318
2023-11-22,11.400000,-0.138429,0.285873
2023-11-23,13.366667,-0.122469,0.274944
2023-11-24,16.266667,-0.127673,0.276255


In [41]:
rolling_edits = rolling_edits.dropna()

In [42]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2023-11-21,8.433333,-0.111240,0.270318
2023-11-22,11.400000,-0.138429,0.285873
2023-11-23,13.366667,-0.122469,0.274944
2023-11-24,16.266667,-0.127673,0.276255


In [43]:
rolling_edits.to_csv("wikipedia_edits.csv")