# **Wikipedia: blocking of users and policies**


## Library Imports

In [56]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import csv 
import matplotlib.gridspec as gridspec
# import json
import datetime as dt
import scipy.stats as stat
import seaborn as sns
from sklearn import preprocessing


# setting plotting parameters
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.color'] = 'gray'

# **Part A:** Reading, cleaning and processing original data to our needs

In [57]:
# setting up the working directory in local machine 
folder = "../data"
folder_result = "../results"

## Part A.1 Reading data

### **1. Reading the freq_policies_monthly.csv file and seeing the data**

In [58]:
file = '/block_policies/freq_policies_monthly.csv'
freq_policies_monthly = pd.read_csv(folder + file)
freq_policies_monthly['month'] = pd.to_datetime(freq_policies_monthly['month']) 
print(freq_policies_monthly.shape)
freq_policies_monthly.head()

(215, 2)


Unnamed: 0,month,nb_policies
0,2005-01-01,28
1,2005-02-01,28
2,2005-03-01,28
3,2005-04-01,29
4,2005-05-01,29


### **2. Reading the MonthlyEditAndEditorsFrom2004-2017.csv' file and cleaning**

In [59]:
file = '/wikimedia_data/MonthlyEditAndEditorsFrom2004-2017.csv'
df_edit_editors_04_07 = pd.read_csv(folder + file)
# df_edit_editors_04_07['month'] == pd.to_datetime(df_edit_editors_04_07['month'])
print(df_edit_editors_04_07.shape)
df_edit_editors_04_07.head()

(168, 3)


Unnamed: 0,month,edits,editors
0,2004-01-01T00:00:00.000Z,218853,14858
1,2004-02-01T00:00:00.000Z,310206,19883
2,2004-03-01T00:00:00.000Z,440685,26881
3,2004-04-01T00:00:00.000Z,405181,26519
4,2004-05-01T00:00:00.000Z,420408,28361


In [60]:
# checkingg if there are NaN values
nan_count = df_edit_editors_04_07['editors'].isnull().sum()
nan_count

0

In [61]:
# Now we are designing the MonthlyEditAndEditorsFrom2004-2017.csv data to our needs

# cleaning month column to have proper format
df_edit_editors_04_07['month'] = pd.to_datetime(df_edit_editors_04_07['month'])

print(df_edit_editors_04_07.shape)

# keeping only values from 2005 and onwards
df_edit_editors_04_07 = df_edit_editors_04_07[df_edit_editors_04_07['month'] >= '2005-01-01']

# converting back to date format 
df_edit_editors_04_07['month'] = df_edit_editors_04_07['month'].dt.date

# resetting the windows so starts at 0
df_edit_editors_04_07.reset_index(drop=True, inplace=True)

print(df_edit_editors_04_07.shape)

df_edit_editors_04_07.tail()
# plt.plot(df_edit_editors_04_07['month'],df_edit_editors_04_07['edits'])

(168, 3)
(156, 3)


Unnamed: 0,month,edits,editors
151,2017-08-01,4686591,399409
152,2017-09-01,4666223,398494
153,2017-10-01,4696462,436203
154,2017-11-01,4622820,434451
155,2017-12-01,4797589,409054


### **3. Reading the wikipedia_en_editors_filtered.csv file and cleaning**
1. We are only keeping values from Jan-2005 to Dec-2017 (Because the scraped data is until 2017)
2. Keeping the time in proper format
3. Other basic cleaning  

In [62]:
file = '/wikimedia_data/wikipedia_en_editors_filtered.csv'
df_editors_filtered= pd.read_csv(folder + file)

print(df_editors_filtered.shape)
df_editors_filtered.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_editors_filtered.head()

(276, 7)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2001-01-01T00:00:00.000Z,44,66,,
1,2001-02-01T00:00:00.000Z,24,45,,
2,2001-03-01T00:00:00.000Z,88,79,,
3,2001-04-01T00:00:00.000Z,64,44,,
4,2001-05-01T00:00:00.000Z,85,58,,


In [63]:
# identifying which columns have NaN values
nan_count = df_editors_filtered.isnull().sum()
print(nan_count)

# Replacing NaN values with 0
df_editors_filtered['total.name-bot'].fillna(0, inplace=True)
df_editors_filtered['total.group-bot'].fillna(0, inplace=True)

# checking if there are yet any NaN values
nan_count = df_editors_filtered.isnull().sum()
print("After replacement: \n" ,nan_count)

month               0
total.anonymous     0
total.user          0
total.name-bot     23
total.group-bot    38
dtype: int64
After replacement: 
 month              0
total.anonymous    0
total.user         0
total.name-bot     0
total.group-bot    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_editors_filtered['total.name-bot'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_editors_filtered['total.group-bot'].fillna(0, inplace=True)


In [64]:
# cleaning month column to have proper format
df_editors_filtered['month'] = pd.to_datetime(df_editors_filtered['month'])

print(df_editors_filtered.shape)

# keeping only values from 2005 and 2023
df_editors_filtered = df_editors_filtered[df_editors_filtered['month'] >= '2005-01-01']
df_editors_filtered = df_editors_filtered[df_editors_filtered['month'] < '2024-01-01']

# converting back to date format 
df_editors_filtered['month'] = df_editors_filtered['month'].dt.date

# resetting the windows so starts at 0
df_editors_filtered.reset_index(drop=True, inplace=True)

print(df_editors_filtered.shape)

df_editors_filtered.tail()

# plt.figure(figsize=(10,6))
# plt.plot(df_editors_filtered['month'],df_editors_filtered['total.anonymous'])

(276, 5)
(228, 5)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
223,2023-08-01,218541,121026,78.0,114.0
224,2023-09-01,219391,120653,72.0,116.0
225,2023-10-01,226690,124411,63.0,118.0
226,2023-11-01,225791,122564,71.0,118.0
227,2023-12-01,213825,118306,67.0,119.0


### **4. Reading the wikipedia_en_active_editors.csv file and cleaning**

In [65]:
file = '/wikimedia_data/wikipedia_en_active_editors.csv'
df_active_editors= pd.read_csv(folder + file)
print(df_active_editors.shape)
df_active_editors.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_active_editors.tail()

(276, 4)


Unnamed: 0,month,total.total
271,2023-08-01T00:00:00.000Z,37763
272,2023-09-01T00:00:00.000Z,38760
273,2023-10-01T00:00:00.000Z,40051
274,2023-11-01T00:00:00.000Z,39570
275,2023-12-01T00:00:00.000Z,38264


In [66]:
# identifying which columns have NaN values
nan_count = df_active_editors.isnull().sum()
print(nan_count)

# Replacing NaN values with 0
df_active_editors['total.total'].fillna(0, inplace=True)

# checking if there are yet any NaN values
nan_count = df_active_editors.isnull().sum()
print("After replacement: \n" ,nan_count)

month          0
total.total    0
dtype: int64
After replacement: 
 month          0
total.total    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_active_editors['total.total'].fillna(0, inplace=True)


In [67]:
# cleaning month column to have proper format
df_active_editors['month'] = pd.to_datetime(df_active_editors['month'])

print(df_active_editors.shape)

# keeping only values from 2005 and 2023
df_active_editors = df_active_editors[df_active_editors['month'] >= '2005-01-01']
df_active_editors = df_active_editors[df_active_editors['month'] < '2024-01-01']

# converting back to date format 
df_active_editors['month'] = df_active_editors['month'].dt.date

# resetting the windows so starts at 0
df_active_editors.reset_index(drop=True, inplace=True)

print(df_active_editors.shape)

df_active_editors.tail()

# plt.figure(figsize=(10,6))
# plt.plot(df_active_editors['month'],df_active_editors['total.total'])

(276, 2)
(228, 2)


Unnamed: 0,month,total.total
223,2023-08-01,37763
224,2023-09-01,38760
225,2023-10-01,40051
226,2023-11-01,39570
227,2023-12-01,38264


### **5. Reading the wikipedia_en_edits_filtered.csv file and cleaning**

In [68]:
file = '/wikimedia_data/wikipedia_en_edits_filtered.csv'
df_edits_filtered = pd.read_csv(folder + file)
print(df_edits_filtered.shape)
df_edits_filtered.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_edits_filtered.head()

(276, 7)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2001-01-01T00:00:00.000Z,95,440,,
1,2001-02-01T00:00:00.000Z,144,803,,
2,2001-03-01T00:00:00.000Z,429,1367,,
3,2001-04-01T00:00:00.000Z,231,958,,
4,2001-05-01T00:00:00.000Z,342,2028,,


In [69]:
# identifying which columns have NaN values
nan_count = df_edits_filtered.isnull().sum()
print(nan_count)

# Replacing NaN values with 0
df_edits_filtered['total.name-bot'].fillna(0, inplace=True)
df_edits_filtered['total.group-bot'].fillna(0, inplace=True)

# checking if there are yet any NaN values
nan_count = df_edits_filtered.isnull().sum()
print("After replacement: \n" ,nan_count)

month               0
total.anonymous     0
total.user          0
total.name-bot     23
total.group-bot    38
dtype: int64
After replacement: 
 month              0
total.anonymous    0
total.user         0
total.name-bot     0
total.group-bot    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_edits_filtered['total.name-bot'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_edits_filtered['total.group-bot'].fillna(0, inplace=True)


In [70]:
# cleaning month column to have proper format
df_edits_filtered['month'] = pd.to_datetime(df_edits_filtered['month'])

print(df_edits_filtered.shape)

# keeping only values from 2005 and 2023
df_edits_filtered = df_edits_filtered[df_edits_filtered['month'] >= '2005-01-01']
df_edits_filtered = df_edits_filtered[df_edits_filtered['month'] < '2024-01-01']

# converting back to date format 
df_edits_filtered['month'] = df_edits_filtered['month'].dt.date

# resetting the windows so starts at 0
df_edits_filtered.reset_index(drop=True, inplace=True)

print(df_edits_filtered.shape)

df_edits_filtered.tail()

# plt.figure(figsize=(10,6))
# plt.plot(df_edits_filtered['month'],df_edits_filtered['total.user'])

(276, 5)
(228, 5)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
223,2023-08-01,618094,3550686,6715.0,803995.0
224,2023-09-01,610776,3220975,10965.0,845599.0
225,2023-10-01,626440,3510703,7556.0,646053.0
226,2023-11-01,615859,3455103,7141.0,669320.0
227,2023-12-01,605376,3569871,7460.0,926927.0


### **6. Reading the wikipedia_en_edited_pages.csv file and cleaning**

In [71]:
file = '/wikimedia_data/wikipedia_en_edited_pages.csv'
df_edited_pages = pd.read_csv(folder + file)
print(df_edited_pages.shape)
df_edited_pages.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_edited_pages.head()

(276, 7)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2001-01-01T00:00:00.000Z,18,53,,
1,2001-02-01T00:00:00.000Z,26,161,,
2,2001-03-01T00:00:00.000Z,191,492,,
3,2001-04-01T00:00:00.000Z,111,584,,
4,2001-05-01T00:00:00.000Z,205,1274,,


In [72]:
# identifying which columns have NaN values
nan_count = df_edited_pages.isnull().sum()
print(nan_count)

# Replacing NaN values with 0
df_edited_pages['total.name-bot'].fillna(0, inplace=True)
df_edited_pages['total.group-bot'].fillna(0, inplace=True)

# checking if there are yet any NaN values
nan_count = df_edited_pages.isnull().sum()
print("After replacement: \n" ,nan_count)

month               0
total.anonymous     0
total.user          0
total.name-bot     23
total.group-bot    38
dtype: int64
After replacement: 
 month              0
total.anonymous    0
total.user         0
total.name-bot     0
total.group-bot    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_edited_pages['total.name-bot'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_edited_pages['total.group-bot'].fillna(0, inplace=True)


In [73]:
# cleaning month column to have proper format
df_edited_pages['month'] = pd.to_datetime(df_edited_pages['month'])

print(df_edited_pages.shape)

# keeping only values from 2005 and 2023
df_edited_pages = df_edited_pages[df_edited_pages['month'] >= '2005-01-01']
df_edited_pages = df_edited_pages[df_edited_pages['month'] < '2024-01-01']

# converting back to date format 
df_edited_pages['month'] = df_edited_pages['month'].dt.date

# resetting the windows so starts at 0
df_edited_pages.reset_index(drop=True, inplace=True)

print(df_edited_pages.shape)

df_edited_pages.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_edited_pages['month'],df_edited_pages['total.user'])

(276, 5)
(228, 5)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2005-01-01,66918,175598,1268.0,2970.0
1,2005-02-01,66906,176741,1538.0,1205.0
2,2005-03-01,81284,219188,4975.0,1618.0
3,2005-04-01,99532,250281,22458.0,4557.0
4,2005-05-01,109594,261928,18046.0,3975.0


### **7. Reading the wikipedia_en_edited_pages_conoc.csv file and cleaning**

In [74]:
file = '/wikimedia_data/wikipedia_en_edited_pages_conoc.csv'
df_edited_pages_conoc = pd.read_csv(folder + file)
print(df_edited_pages_conoc.shape)
df_edited_pages_conoc.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_edited_pages_conoc.head()

(276, 5)


Unnamed: 0,month,total.non-content,total.content
0,2001-01-01T00:00:00.000Z,28,38
1,2001-02-01T00:00:00.000Z,29,148
2,2001-03-01T00:00:00.000Z,89,510
3,2001-04-01T00:00:00.000Z,51,604
4,2001-05-01T00:00:00.000Z,71,1351


In [75]:
# cleaning month column to have proper format
df_edited_pages_conoc['month'] = pd.to_datetime(df_edited_pages_conoc['month'])

print(df_edited_pages_conoc.shape)

# keeping only values from 2005 and 2023
df_edited_pages_conoc = df_edited_pages_conoc[df_edited_pages_conoc['month'] >= '2005-01-01']
df_edited_pages_conoc = df_edited_pages_conoc[df_edited_pages_conoc['month'] < '2024-01-01']

# converting back to date format 
df_edited_pages_conoc['month'] = df_edited_pages_conoc['month'].dt.date

# resetting the windows so starts at 0
df_edited_pages_conoc.reset_index(drop=True, inplace=True)

print(df_edited_pages_conoc.shape)

df_edited_pages_conoc.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_edited_pages_conoc['month'],df_edited_pages_conoc['total.non-content'])
# plt.plot(df_edited_pages_conoc['month'],df_edited_pages_conoc['total.content'])

(276, 3)
(228, 3)


Unnamed: 0,month,total.non-content,total.content
0,2005-01-01,38977,162240
1,2005-02-01,39771,161811
2,2005-03-01,49885,198997
3,2005-04-01,59181,233517
4,2005-05-01,62047,245289


### **8. Reading the wikipedia_en_page_views.csv file and cleaning**
(FOR NOW SKIPPING THIS DATAFRAME, NOT ENOUGH DATA)

In [76]:
file = '/wikimedia_data/wikipedia_en_page_views.csv'
df_page_views = pd.read_csv(folder + file)
print(df_page_views.shape)
df_page_views.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_page_views.head()

(96, 6)


Unnamed: 0,month,total.mobile-web,total.mobile-app,total.desktop
0,2016-01-01T00:00:00.000Z,3722494664,106432770,5501942159
1,2016-02-01T00:00:00.000Z,3336997265,92414138,5254955971
2,2016-03-01T00:00:00.000Z,3421694873,97899085,5283017698
3,2016-04-01T00:00:00.000Z,3306862507,81719005,5544327061
4,2016-05-01T00:00:00.000Z,3404277806,98738514,5313329373


In [77]:
# cleaning month column to have proper format
df_page_views['month'] = pd.to_datetime(df_page_views['month'])

print(df_page_views.shape)

# keeping only values from 2005 and 2023
df_page_views = df_page_views[df_page_views['month'] >= '2005-01-01']
df_page_views = df_page_views[df_page_views['month'] < '2024-01-01']

# converting back to date format 
df_page_views['month'] = df_page_views['month'].dt.date

# resetting the windows so starts at 0
df_page_views.reset_index(drop=True, inplace=True)

print(df_page_views.shape)

df_page_views.tail()

# plt.figure(figsize=(10,6))
# plt.plot(df_page_views['month'],df_page_views['total.mobile-web'])

(96, 4)
(96, 4)


Unnamed: 0,month,total.mobile-web,total.mobile-app,total.desktop
91,2023-08-01,6293463148,186338612,4234261325
92,2023-09-01,6158446536,174632844,4697087473
93,2023-10-01,6513933109,182769791,5969584871
94,2023-11-01,6135058646,178276539,5071930038
95,2023-12-01,6371315489,193104070,4645161316


### **9. Reading the wikipedia_en_page_views_legacy.csv file and cleaning**

(FOR NOW SKIPPING THIS DATAFRAME, NOT ENOUGH DATA)

In [78]:
file = '/wikimedia_data/wikipedia_en_page_views_legacy.csv'
df_page_views_legacy = pd.read_csv(folder + file)
print(df_page_views_legacy.shape)
df_page_views_legacy.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_page_views_legacy.head()

(105, 4)


Unnamed: 0,month,total.total
0,2007-12-01T00:00:00.000Z,2998331524
1,2008-01-01T00:00:00.000Z,4930902570
2,2008-02-01T00:00:00.000Z,4818393763
3,2008-03-01T00:00:00.000Z,4955405809
4,2008-04-01T00:00:00.000Z,5159162183


In [79]:
# cleaning month column to have proper format
df_page_views_legacy['month'] = pd.to_datetime(df_page_views_legacy['month'])

print(df_page_views_legacy.shape)

# keeping only values from 2005 and 2023
df_page_views_legacy = df_page_views_legacy[df_page_views_legacy['month'] >= '2005-01-01']
df_page_views_legacy = df_page_views_legacy[df_page_views_legacy['month'] < '2024-01-01']

# converting back to date format 
df_page_views_legacy['month'] = df_page_views_legacy['month'].dt.date

# resetting the windows so starts at 0
df_page_views_legacy.reset_index(drop=True, inplace=True)

print(df_page_views_legacy.shape)

df_page_views_legacy.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_page_views_legacy['month'],df_page_views_legacy['total.total'])

(105, 2)
(105, 2)


Unnamed: 0,month,total.total
0,2007-12-01,2998331524
1,2008-01-01,4930902570
2,2008-02-01,4818393763
3,2008-03-01,4955405809
4,2008-04-01,5159162183


### **10. Reading the wikipedia_en_pages2date.csv file and cleaning**

In [80]:
file = '/wikimedia_data/wikipedia_en_pages2date.csv'
df_pages2date= pd.read_csv(folder + file)
print(df_pages2date.shape)
df_pages2date.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_pages2date.head()

(276, 5)


Unnamed: 0,month,total.non-content,total.content
0,2001-01-01T00:00:00.000Z,28,38
1,2001-02-01T00:00:00.000Z,51,178
2,2001-03-01T00:00:00.000Z,127,630
3,2001-04-01T00:00:00.000Z,172,1197
4,2001-05-01T00:00:00.000Z,230,2437


In [81]:
# cleaning month column to have proper format
df_pages2date['month'] = pd.to_datetime(df_pages2date['month'])

print(df_pages2date.shape)

# keeping only values from 2005 and 2023
df_pages2date = df_pages2date[df_pages2date['month'] >= '2005-01-01']
df_pages2date = df_pages2date[df_pages2date['month'] < '2024-01-01']

# converting back to date format 
df_pages2date['month'] = df_pages2date['month'].dt.date

# resetting the windows so starts at 0
df_pages2date.reset_index(drop=True, inplace=True)

print(df_pages2date.shape)

df_pages2date.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_pages2date['month'],df_pages2date['total.content'])

(276, 3)
(228, 3)


Unnamed: 0,month,total.non-content,total.content
0,2005-01-01,230161,436782
1,2005-02-01,249240,456399
2,2005-03-01,274441,481853
3,2005-04-01,304580,511374
4,2005-05-01,333642,541295


### **11.Reading the wikipedia_en_new_pages.csv file and cleaing**

In [82]:
file = '/wikimedia_data/wikipedia_en_new_pages.csv'
df_new_pages= pd.read_csv(folder + file)
print(df_new_pages.shape)
df_new_pages.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_new_pages.head()

(276, 7)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2001-01-01T00:00:00.000Z,14,52,,
1,2001-02-01T00:00:00.000Z,17,146,,
2,2001-03-01T00:00:00.000Z,141,387,,
3,2001-04-01T00:00:00.000Z,83,529,,
4,2001-05-01T00:00:00.000Z,163,1135,,


In [83]:
# cleaning month column to have proper format
df_new_pages['month'] = pd.to_datetime(df_new_pages['month'])

print(df_new_pages.shape)

# keeping only values from 2005 and 2023
df_new_pages = df_new_pages[df_new_pages['month'] >= '2005-01-01']
df_new_pages = df_new_pages[df_new_pages['month'] < '2024-01-01']

# converting back to date format 
df_new_pages['month'] = df_new_pages['month'].dt.date

# resetting the windows so starts at 0
df_new_pages.reset_index(drop=True, inplace=True)

print(df_new_pages.shape)

# getting rid of nan values
df_new_pages['total.group-bot'].fillna(0, inplace=True)
df_new_pages.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_new_pages['month'],df_new_pages['total.content'])

(276, 5)
(228, 5)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_new_pages['total.group-bot'].fillna(0, inplace=True)


Unnamed: 0,month,total.anonymous,total.user,total.name-bot,total.group-bot
0,2005-01-01,7756,32798,24.0,0.0
1,2005-02-01,7144,31538,14.0,0.0
2,2005-03-01,9471,40994,190.0,0.0
3,2005-04-01,11297,47379,982.0,2.0
4,2005-05-01,12989,45811,180.0,3.0


### **12.Reading the wikipedia_en_new_reg.csv file and cleaing**

In [84]:
file = '/wikimedia_data/wikipedia_en_new_reg.csv'
df_new_reg= pd.read_csv(folder + file)
print(df_new_reg.shape)
df_new_reg.drop(columns=['timeRange.start', 'timeRange.end'], inplace=True)
df_new_reg.head()

(259, 4)


Unnamed: 0,month,total.total
0,2001-01-01T00:00:00.000Z,5
1,2001-02-01T00:00:00.000Z,2
2,2001-03-01T00:00:00.000Z,7
3,2001-05-01T00:00:00.000Z,8
4,2001-06-01T00:00:00.000Z,3


In [85]:
# cleaning month column to have proper format
df_new_reg['month'] = pd.to_datetime(df_new_reg['month'])

print(df_new_reg.shape)

# keeping only values from 2005 and 2023
df_new_reg = df_new_reg[df_new_reg['month'] >= '2005-01-01']
df_new_reg = df_new_reg[df_new_reg['month'] < '2024-01-01']

# converting back to date format 
df_new_reg['month'] = df_new_reg['month'].dt.date

# resetting the windows so starts at 0
df_new_reg.reset_index(drop=True, inplace=True)

print(df_new_reg.shape)

# getting rid of nan values
# df_new_reg['total.group-bot'].fillna(0, inplace=True)
df_new_reg.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_new_reg['month'],df_new_reg['total.total'])

(259, 2)
(228, 2)


Unnamed: 0,month,total.total
0,2005-01-01,2
1,2005-02-01,5
2,2005-03-01,11
3,2005-04-01,13
4,2005-05-01,7


### **13.Reading the block metrics csv files**

In [86]:
# Import block event log metrics on editors
file = '/scraped_data_metrics/block_monthly_editor_metrics.csv'
block_editors = pd.read_csv(folder + file).iloc[1:, 1:].reset_index(drop=True)

# Data cleaning 
block_editors['month'] = block_editors['month_year'] + '-01'
block_editors = block_editors.drop('month_year', axis=1)

block_editors.head()

Unnamed: 0,n_editor_all,n_editor_ip,n_editor_bot,n_editor_user,n_editor_all_block,n_editor_ip_block,n_editor_bot_block,n_editor_user_block,n_editor_all_unblock,n_editor_ip_unblock,n_editor_bot_unblock,n_editor_user_unblock,month
0,1256,826,0,430,962,759,0,203,534,247,0,287,2005-01-01
1,1306,919,0,387,1216,915,0,301,336,189,0,147,2005-02-01
2,1118,663,0,455,1032,656,0,376,217,58,0,159,2005-03-01
3,1393,855,0,538,1332,851,0,481,174,62,0,112,2005-04-01
4,2127,273,0,1854,384,273,0,111,38,7,0,31,2005-05-01


In [87]:
# Import block event log metrics on editors
file = '/scraped_data_metrics/block_monthly_log_metrics.csv'
block_logs = pd.read_csv(folder + file).iloc[1:, 1:].reset_index(drop=True)

# Data cleaning 
block_logs['month'] = block_logs['month_year'] + '-01'
block_logs = block_logs.drop('month_year', axis=1)

block_logs.head()

Unnamed: 0,n_log_all,n_log_editor_ip,n_log_editor_bot,n_log_editor_user,n_log_admin_ip,n_log_admin_bot,n_log_admin_user,n_log_all_block,n_log_editor_ip_block,n_log_editor_bot_block,...,n_log_admin_bot_block,n_log_admin_user_block,n_log_all_unblock,n_log_editor_ip_unblock,n_log_editor_bot_unblock,n_log_editor_user_unblock,n_log_admin_ip_unblock,n_log_admin_bot_unblock,n_log_admin_user_unblock,month
0,1896,1267,0,629,0,0,1896,1299,993,0,...,0,1299,597,274,0,323,0,0,597,2005-01-01
1,2052,1442,0,610,0,0,2052,1547,1093,0,...,0,1547,505,349,0,156,0,0,505,2005-02-01
2,1629,882,0,747,0,0,1629,1379,819,0,...,0,1379,250,63,0,187,0,0,250,2005-03-01
3,1958,1174,0,784,0,0,1958,1758,1097,0,...,0,1758,200,77,0,123,0,0,200,2005-04-01
4,2221,316,0,1905,0,0,2221,451,308,0,...,0,451,44,8,0,36,0,0,44,2005-05-01


### **14. Reading the block policies**

In [88]:
file = '/block_policies/policy_evolution_3.csv'
df_wikipedia_en_policy = pd.read_csv(folder + file)
print(df_wikipedia_en_policy.shape)
df_wikipedia_en_policy.head()

(228, 2)


Unnamed: 0,Month,tag_number
0,2005-01-01,26
1,2005-02-01,26
2,2005-03-01,26
3,2005-04-01,26
4,2005-05-01,26


In [89]:
df_wikipedia_en_policy.rename(columns={'Month':'month'}, inplace=True)
df_wikipedia_en_policy.head()

Unnamed: 0,month,tag_number
0,2005-01-01,26
1,2005-02-01,26
2,2005-03-01,26
3,2005-04-01,26
4,2005-05-01,26


In [90]:
# cleaning month column to have proper format
df_wikipedia_en_policy['month'] = pd.to_datetime(df_wikipedia_en_policy['month'])

print(df_wikipedia_en_policy.shape)

# keeping only values from 2005 and 2023
df_wikipedia_en_policy = df_wikipedia_en_policy[df_wikipedia_en_policy['month'] >= '2005-01-01']
df_wikipedia_en_policy = df_wikipedia_en_policy[df_wikipedia_en_policy['month'] <= '2024-01-01']

# converting back to date format 
df_wikipedia_en_policy['month'] = df_wikipedia_en_policy['month'].dt.date

# resetting the windows so starts at 0
df_wikipedia_en_policy.reset_index(drop=True, inplace=True)

print(df_wikipedia_en_policy.shape)

df_wikipedia_en_policy.head()

# plt.figure(figsize=(10,6))
# plt.plot(df_wikipedia_en_policy['month'],df_wikipedia_en_policy['total.anonymous'])

(228, 2)
(228, 2)


Unnamed: 0,month,tag_number
0,2005-01-01,26
1,2005-02-01,26
2,2005-03-01,26
3,2005-04-01,26
4,2005-05-01,26


## Part A.2 Merge data

### 1. Copying and summing df_editors_filtered dataframe

In [91]:
df_editors_filtered.columns

Index(['month', 'total.anonymous', 'total.user', 'total.name-bot',
       'total.group-bot'],
      dtype='object')

In [92]:
# giving new unique columns names for better identification
new_columns_names={'total.anonymous': 'editors.anonymous',
                   'total.user': 'editors.user',
                   'total.name-bot': 'editors.name-bot',
                   'total.group-bot': 'editors.group-bot'
                  }
editors = df_editors_filtered.copy()

# renaming the copied dataframe
editors.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
editors['editors.total'] = editors.iloc[:, 1:].sum(axis=1)
editors.head()

Unnamed: 0,month,editors.anonymous,editors.user,editors.name-bot,editors.group-bot,editors.total
0,2005-01-01,65476,14694,18.0,2.0,80190.0
1,2005-02-01,60723,15056,28.0,2.0,75809.0
2,2005-03-01,74055,18243,29.0,2.0,92329.0
3,2005-04-01,92101,21373,31.0,2.0,113507.0
4,2005-05-01,102733,23181,35.0,4.0,125953.0


### 2. Copying and summing df_active_editors dataframe

In [93]:
df_active_editors.columns

Index(['month', 'total.total'], dtype='object')

In [94]:
# giving new unique columns names for better identification
new_columns_names={'total.total': 'active-editors.total'
                  }
active_editors = df_active_editors.copy()

# renaming the copied dataframe
active_editors.rename(columns=new_columns_names, inplace=True)
# active_editors.head()

### 3. Copying and summing df_edits_filtered dataframe

In [95]:
df_edits_filtered.columns

Index(['month', 'total.anonymous', 'total.user', 'total.name-bot',
       'total.group-bot'],
      dtype='object')

In [96]:
# giving new unique columns names for better identification
new_columns_names={'total.anonymous': 'edits.anonymous',
                   'total.user': 'edits.user',
                   'total.name-bot': 'edits.name-bot',
                   'total.group-bot': 'edits.group-bot'
                  }
edits = df_edits_filtered.copy()

# renaming the copied dataframe
edits.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
edits['edits.total'] = edits.iloc[:, 1:].sum(axis=1)
edits.head()

Unnamed: 0,month,edits.anonymous,edits.user,edits.name-bot,edits.group-bot,edits.total
0,2005-01-01,199826,594798,1923.0,3160.0,799707.0
1,2005-02-01,193228,592628,2128.0,1364.0,789348.0
2,2005-03-01,242362,770234,9740.0,1728.0,1024064.0
3,2005-04-01,329974,929123,29911.0,4866.0,1293874.0
4,2005-05-01,365311,969462,24931.0,4664.0,1364368.0


### 4. Copying and summing df_new_reg dataframe

In [97]:
df_new_reg.columns

Index(['month', 'total.total'], dtype='object')

In [98]:
# giving new unique columns names for better identification
new_columns_names={'total.total': 'new-reg.total'
}

new_reg = df_new_reg.copy()

# renaming the copied dataframe
new_reg.rename(columns=new_columns_names, inplace=True)
# new_reg.head()

### 5. Copying and summing df_new_pages dataframe

In [99]:
df_new_pages.columns

Index(['month', 'total.anonymous', 'total.user', 'total.name-bot',
       'total.group-bot'],
      dtype='object')

In [100]:
# giving new unique columns names for better identification
new_columns_names={'total.anonymous': 'new-pages.anonymous',
                   'total.user': 'new-pages.user',
                   'total.name-bot': 'new-pages.name-bot',
                   'total.group-bot': 'new-pages.group-bot'
                  }
new_pages = df_new_pages.copy()

# renaming the copied dataframe
new_pages.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
new_pages['new-pages.total'] = new_pages.iloc[:, 1:].sum(axis=1)
new_pages.head()

Unnamed: 0,month,new-pages.anonymous,new-pages.user,new-pages.name-bot,new-pages.group-bot,new-pages.total
0,2005-01-01,7756,32798,24.0,0.0,40578.0
1,2005-02-01,7144,31538,14.0,0.0,38696.0
2,2005-03-01,9471,40994,190.0,0.0,50655.0
3,2005-04-01,11297,47379,982.0,2.0,59660.0
4,2005-05-01,12989,45811,180.0,3.0,58983.0


### 6. Copying and summing df_edited_pages dataframe

In [101]:
df_edited_pages.columns

Index(['month', 'total.anonymous', 'total.user', 'total.name-bot',
       'total.group-bot'],
      dtype='object')

In [102]:
# giving new unique columns names for better identification
new_columns_names={'total.anonymous': 'edited-pages.anonymous',
                   'total.user': 'edited-pages.user',
                   'total.name-bot': 'edited-pages.name-bot',
                   'total.group-bot': 'edited-pages.group-bot'
                  }
edited_pages = df_edited_pages.copy()

# renaming the copied dataframe
edited_pages.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
edited_pages['edited-pages.total'] = edited_pages.iloc[:, 1:].sum(axis=1)
edited_pages.head()

Unnamed: 0,month,edited-pages.anonymous,edited-pages.user,edited-pages.name-bot,edited-pages.group-bot,edited-pages.total
0,2005-01-01,66918,175598,1268.0,2970.0,246754.0
1,2005-02-01,66906,176741,1538.0,1205.0,246390.0
2,2005-03-01,81284,219188,4975.0,1618.0,307065.0
3,2005-04-01,99532,250281,22458.0,4557.0,376828.0
4,2005-05-01,109594,261928,18046.0,3975.0,393543.0


### 7. Copying and summing df_edited_pages dataframe

In [103]:
df_edited_pages_conoc.columns

Index(['month', 'total.non-content', 'total.content'], dtype='object')

In [104]:
# giving new unique columns names for better identification
new_columns_names={'total.non-content': 'edited-pages-conoc.non-content',
                   'total.content': 'edited-pages-conoc.content',
                  }
edited_pages_conoc = df_edited_pages_conoc.copy()

# renaming the copied dataframe
edited_pages_conoc.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
edited_pages_conoc['edited-pages-conoc.total'] = edited_pages_conoc.iloc[:, 1:].sum(axis=1)
edited_pages_conoc.head()

Unnamed: 0,month,edited-pages-conoc.non-content,edited-pages-conoc.content,edited-pages-conoc.total
0,2005-01-01,38977,162240,201217
1,2005-02-01,39771,161811,201582
2,2005-03-01,49885,198997,248882
3,2005-04-01,59181,233517,292698
4,2005-05-01,62047,245289,307336


### 8. Copying and summing df_edited_pages dataframe

In [105]:
df_pages2date.columns

Index(['month', 'total.non-content', 'total.content'], dtype='object')

In [106]:
# giving new unique columns names for better identification
new_columns_names={'total.non-content': 'pages2date.non-content',
                   'total.content': 'pages2date.content',
                  }
pages2date= df_pages2date.copy()

# renaming the copied dataframe
pages2date.rename(columns=new_columns_names, inplace=True)

# summing the columns and making a new column
pages2date['pages2date.total'] = pages2date.iloc[:, 1:].sum(axis=1)
pages2date.head()

Unnamed: 0,month,pages2date.non-content,pages2date.content,pages2date.total
0,2005-01-01,230161,436782,666943
1,2005-02-01,249240,456399,705639
2,2005-03-01,274441,481853,756294
3,2005-04-01,304580,511374,815954
4,2005-05-01,333642,541295,874937


### 9. Checking the shape of the copied dataframes to ensures row numbers are same

In [107]:
print(editors.shape)
print(active_editors.shape)
print(edits.shape)
print(new_reg.shape)
print(new_pages.shape)
print(edited_pages.shape)
print(edited_pages_conoc.shape)
print(pages2date.shape)
print(block_logs.shape)
print(block_editors.shape)
print(df_wikipedia_en_policy.shape)

(228, 6)
(228, 2)
(228, 6)
(228, 2)
(228, 6)
(228, 6)
(228, 4)
(228, 4)
(228, 22)
(228, 13)
(228, 2)


### 10. Merging all the dataframes together

In [108]:
# merging the dataframes
list_wikimedia_data = [editors,active_editors,edits,new_reg,new_pages,edited_pages,edited_pages_conoc,pages2date]
list_block_metrics = [block_logs,block_editors]
list_policy_tags = [df_wikipedia_en_policy]

df_merged = pd.concat(list_wikimedia_data+list_block_metrics+list_policy_tags, axis=1)

# Drop duplicate 'month' columns
df_merged = df_merged.loc[:,~df_merged.columns.duplicated()]

df_merged

Unnamed: 0,month,editors.anonymous,editors.user,editors.name-bot,editors.group-bot,editors.total,active-editors.total,edits.anonymous,edits.user,edits.name-bot,...,n_editor_user,n_editor_all_block,n_editor_ip_block,n_editor_bot_block,n_editor_user_block,n_editor_all_unblock,n_editor_ip_unblock,n_editor_bot_unblock,n_editor_user_unblock,tag_number
0,2005-01-01,65476,14694,18.0,2.0,80190.0,6871,199826,594798,1923.0,...,430,962,759,0,203,534,247,0,287,26
1,2005-02-01,60723,15056,28.0,2.0,75809.0,6872,193228,592628,2128.0,...,387,1216,915,0,301,336,189,0,147,26
2,2005-03-01,74055,18243,29.0,2.0,92329.0,8543,242362,770234,9740.0,...,455,1032,656,0,376,217,58,0,159,26
3,2005-04-01,92101,21373,31.0,2.0,113507.0,10039,329974,929123,29911.0,...,538,1332,851,0,481,174,62,0,112,26
4,2005-05-01,102733,23181,35.0,4.0,125953.0,10911,365311,969462,24931.0,...,1854,384,273,0,111,38,7,0,31,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,2023-08-01,218541,121026,78.0,114.0,339759.0,37763,618094,3550686,6715.0,...,204,497,465,0,32,0,0,0,0,58
224,2023-09-01,219391,120653,72.0,116.0,340232.0,38760,610776,3220975,10965.0,...,7164,445213,438081,0,7132,156,52,1,103,58
225,2023-10-01,226690,124411,63.0,118.0,351282.0,40051,626440,3510703,7556.0,...,6966,468161,461221,1,6939,138,43,0,95,58
226,2023-11-01,225791,122564,71.0,118.0,348544.0,39570,615859,3455103,7141.0,...,7055,446977,439949,0,7028,137,50,0,87,58


### 11. Saving merged dataframe

In [109]:
file = '/merged_data/merged_data_2005-2023.csv'

# uncomment the code below to save
# df_merged.to_csv(folder+file, index=False)