In [1]:
from wmfdata import mariadb
from wmfdata.utils import sql_tuple

from datetime import datetime
import pandas as pd
pd.options.display.max_columns = None

import re
import warnings

import requests
from bs4 import BeautifulSoup

[canonical-data](https://github.com/wikimedia-research/canonical-data/blob/master/wiki/wikis.tsv) is the best available and accessible source for the list of wikis, along with their visibility and status.

In [2]:
# canonical list of wikis, read as a tab-separated file straight from the canonical-data repository
cd_wikis = pd.read_csv(
    'https://raw.githubusercontent.com/wikimedia-research/canonical-data/master/wiki/wikis.tsv',
    sep='\t',
)

# scope of this analysis: projects that are both publicly visible and publicly editable
is_public = (cd_wikis['visibility'] == 'public') & (cd_wikis['editability'] == 'public')
public_wikis = cd_wikis[is_public].reset_index(drop=True)

# restrict further to content projects; test wikis, organizational wikis, wikimania wikis etc. are dropped
content_db_groups = [
    'commons', 'wikibooks', 'wikidata', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary',
]
public_content_wikis = public_wikis[public_wikis['database_group'].isin(content_db_groups)]

## Gather git blame data


In [3]:
# Make the helper script executable, then run it to get the blame data
# into text files in the git_blame_data directory
# (for all, closed, private, and fishbowl).
# NOTE(review): get_blame.sh is not shown in this notebook; the recorded output
# shows it cloning 'mediawiki-config' — confirm what else it does before re-running.
!chmod +x get_blame.sh
!./get_blame.sh

Cloning into 'mediawiki-config'...
remote: Counting objects: 9, done[K
remote: Finding sources: 100% (9/9)[K[K
remote: Getting sizes: 100% (6/6)[K[K
remote: Compressing objects: 100% (121267/121267)[K[K
remote: Total 141772 (delta 1), reused 141766 (delta 1)[K3 MiB/s   
Receiving objects: 100% (141772/141772), 171.26 MiB | 39.88 MiB/s, done.
Resolving deltas: 100% (99601/99601), done.
yes: standard output: Broken pipe


In [4]:
# Parse the git blame text files in git_blame_data/ into a nested dict:
#   blame_data[category][wiki_db] = 'YYYY-MM-DD'  (first date found on the line)
# Only the 'all' and 'closed' files are read here.
blame_date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
blame_data = {}

for category in ['all', 'closed']:

    blame_data[category] = {}

    with open(f'git_blame_data/{category}.txt') as blame_file:
        for line in blame_file:
            # lines carrying the 'Do not edit it' notice are skipped
            if 'Do not edit it' in line:
                continue

            date_match = blame_date_pattern.search(line)
            if date_match is None:
                # blank or otherwise date-less line: nothing to record
                continue

            # everything after the last ')' on the line is taken as the database code
            wiki_db = re.sub(r'.*\)', '', line).strip()
            blame_data[category][wiki_db] = date_match.group(0)

In [5]:
# nested blame dict -> frame with one row per database code and one date column per category
blame_data_df = pd.DataFrame(blame_data).reset_index()
blame_data_df = blame_data_df.rename(
    columns={'index': 'database_code', 'all': 'git_created_dt', 'closed': 'git_closed_dt'}
)

# attach the git blame dates to the list of content wikis (left join on the database code)
public_content_wikis = public_content_wikis.merge(blame_data_df, on='database_code', how='left')

# convert the date columns to datetime format; values that cannot be parsed become NaT
for dt_column in ['git_created_dt', 'git_closed_dt']:
    public_content_wikis[dt_column] = pd.to_datetime(public_content_wikis[dt_column], yearfirst=True, errors='coerce')

public_content_wikis.head()

Unnamed: 0,database_code,domain_name,database_group,language_code,language_name,status,visibility,editability,english_name,git_created_dt,git_closed_dt
0,aawiki,aa.wikipedia.org,wikipedia,aa,Afar,closed,public,public,Afar Wikipedia,2012-02-24,2012-02-24
1,aawikibooks,aa.wikibooks.org,wikibooks,aa,Afar,closed,public,public,Afar Wikibooks,2012-02-24,2012-02-24
2,aawiktionary,aa.wiktionary.org,wiktionary,aa,Afar,closed,public,public,Afar Wiktionary,2012-02-24,2012-02-24
3,abwiki,ab.wikipedia.org,wikipedia,ab,Abkhazian,open,public,public,Abkhazian Wikipedia,2012-02-24,NaT
4,abwiktionary,ab.wiktionary.org,wiktionary,ab,Abkhazian,closed,public,public,Abkhazian Wiktionary,2012-02-24,2012-02-24


In [6]:
# three most frequent dates on which projects were added to the mediawiki_config files
top_created_dates = public_content_wikis['git_created_dt'].value_counts().head(3)
top_closed_dates = public_content_wikis['git_closed_dt'].value_counts().head(3)

print('creation:\n', top_created_dates, '\n')
print('closed:\n', top_closed_dates)

creation:
 2012-02-24    695
2012-05-08     72
2012-11-06      8
Name: git_created_dt, dtype: int64 

closed:
 2012-02-24    89
2012-05-16     5
2013-07-23     2
Name: git_closed_dt, dtype: int64


## wiki creation date

### 2001 to mid-2006

In [9]:
# wikis whose git creation date was recorded on (or before) 24 February 2012
git_cutoff = datetime(2012, 2, 24)
recorded_by_cutoff = public_content_wikis['git_created_dt'] <= git_cutoff
pre2012 = public_content_wikis.loc[recorded_by_cutoff].reset_index(drop=True)

# wikis graduated from the incubator until 2012; a manually created flat file based on
# https://incubator.wikimedia.org/wiki/Incubator:Site_creation_log
incubator_pre2012_logs = pd.read_csv('incubator_logs/incubator_site_creation_log_pre2012.csv')

In [None]:
# incubator site creation logs from 2006 to 2010
# NOTE(review): the pre-2012 log is read from the incubator_logs/ directory while this
# file is read from the working directory — confirm the path is intended.
incubator_200610_logs = pd.read_csv('incubator_site_creation_log_2006-2010.csv')
incubator_200610_logs.head()

In [None]:
# entries of the pre-2012 incubator log whose domain is NOT listed in the 2006-2010 log
# NOTE(review): the variable name suggests the 2006-2010 wikis, but `!=` against a list
# excludes exactly those domains — confirm whether `==` was intended here.
wikis_200610 = incubator_pre2012_logs.query("""domain_name != @incubator_200610_logs.domain_name.values.tolist()""")
wikis_200610.head()

In [None]:
# wikis recorded in git by the 2012 cutoff (`pre2012`) whose domain is absent from the pre-2012 incubator log
non_incubator_creations = pre2012.query("""domain_name != @incubator_pre2012_logs.domain_name.values.tolist()""")

In [None]:
# preview the first rows of the wikis that do not appear in the pre-2012 incubator log
non_incubator_creations.head()

In [None]:
# earliest revision timestamp of every wiki that does not appear in the incubator log
non_incubator_dbs = non_incubator_creations['database_code'].values.tolist()

# `mariadb` is the module imported from wmfdata in the first cell; the query runs once per database
min_rev_date = mariadb.run("""SELECT MIN(rev_timestamp) AS min_rev_timestamp FROM revision""", non_incubator_dbs)

# NOTE(review): assumes the result holds exactly one row per database, in the order
# the databases were passed — confirm against mariadb.run before relying on this labelling
min_rev_date['database_code'] = non_incubator_dbs
min_rev_date['min_rev_timestamp'] = pd.to_datetime(min_rev_date['min_rev_timestamp'], yearfirst=True, errors='coerce')
# keep only the calendar date; .dt.date leaves coerced NaT values as missing instead of calling .date() on them
min_rev_date['min_rev_timestamp'] = min_rev_date['min_rev_timestamp'].dt.date
min_rev_date.head()

In [None]:
def extract_messages(year, month):
    """Collect thread dates from one monthly archive page of the newprojects mailing list.

    Parameters
    ----------
    year, month
        Archive year and month; both are interpolated as-is into the archive URL.

    Returns
    -------
    dict
        Maps each thread title (with 'New wiki: ' removed and surrounding whitespace
        stripped) to the `datetime.date` parsed from the `title` attribute of the
        thread's date element. Threads without a title element or without a dated
        'thread-date' element are skipped.
    """
    url = f'https://lists.wikimedia.org/hyperkitty/list/newprojects@lists.wikimedia.org/{year}/{month}/'
    result = requests.get(url)
    page_content = BeautifulSoup(result.content, 'html.parser')

    creation_dts = {}
    for thread in page_content.find_all('div', {'class': 'thread-email row'}):
        title_tag = thread.find('span', class_='thread-title')
        date_tag = thread.find('div', class_='thread-date')
        date_text = date_tag.get('title') if date_tag is not None else None
        if title_tag is None or not date_text:
            # nothing usable in this thread entry
            continue

        wiki_db = title_tag.text.replace('New wiki: ', '').strip()
        # the tooltip reads like 'Tuesday, 3 May 2011 12:34:56'; only the date part is kept
        creation_dts[wiki_db] = datetime.strptime(date_text, '%A, %d %B %Y %H:%M:%S').date()
    return creation_dts

In [None]:
# gather thread dates from every monthly archive page of 2010, 2011 and 2012
creation_dts = {}
for year in range(2010, 2012+1):
    for month in range(1, 12+1):
        # in-place update (works on Python < 3.9 too); a title seen again in a later
        # month overwrites the earlier entry
        creation_dts.update(extract_messages(year, month))

creation_dts

In [None]:
# one row per scraped thread: the title goes into 'database_code', the thread date into 'date'
creation_dt_201012 = pd.DataFrame(list(creation_dts.items()), columns=['database_code', 'date'])
creation_dt_201012

In [None]:
# Clean the scraped thread titles in a single pass, building the result from records
# instead of dropping from / concatenating to the frame while iterating over it:
#   * titles containing an HH:MM:SS time stamp are discarded;
#   * titles listing several databases separated by commas are expanded into one row
#     per database, each keeping the date of the original thread;
#   * every other title is kept unchanged.
cleaned_records = []
for db_code, thread_date in zip(creation_dt_201012['database_code'], creation_dt_201012['date']):

    if re.search(r'\d{2}:\d{2}:\d{2}', db_code):
        continue

    if ',' in db_code:
        # the database list is the segment after the first ':' when there is one,
        # otherwise the whole title
        listed = db_code.split(':')[1] if ':' in db_code else db_code
        for single_db in listed.split(','):
            cleaned_records.append({'database_code': single_db.strip(), 'date': thread_date})
    else:
        cleaned_records.append({'database_code': db_code, 'date': thread_date})

creation_dt_201012 = pd.DataFrame(cleaned_records, columns=['database_code', 'date'])

creation_dt_201012.head()

In [None]:
# add the domain name of each database code from the canonical wiki list (left join keeps unmatched codes)
domain_lookup = cd_wikis[['domain_name', 'database_code']]
creation_dt_201012 = creation_dt_201012.merge(domain_lookup, on='database_code', how='left')

In [None]:
# entries of the pre-2012 incubator log whose domain is not listed in the 2006-2010 log
incubator201012 = incubator_pre2012_logs.query("""domain_name != @incubator_200610_logs.domain_name.values.tolist()""")
# attach the mailing-list dates by domain name; the database code column is not needed afterwards
incubator201012 = pd.merge(incubator201012, creation_dt_201012, on='domain_name', how='left').drop('database_code', axis=1)
incubator201012

In [None]:
# manually supplied dates, keyed by domain name, used to fill rows that have no date
missing_dates = {
    domain: datetime(year, month, day).date()
    for domain, (year, month, day) in [
        ('nso.wikipedia.org', (2011, 10, 29)),
        ('or.wiktionary.org', (2011, 9, 28)),
    ]
}

In [None]:
# fill every row lacking a date from the manual lookup (a domain absent from it raises KeyError)
lacks_date = incubator201012['date'].isna()
for row_label, domain in incubator201012.loc[lacks_date, 'domain_name'].items():
    incubator201012.loc[row_label, 'date'] = missing_dates[domain]

## wiki closure date
(under construction)

In [None]:
# closed wikis among the public content wikis; copied so that columns can be added
# to this frame later without pandas warning about writing to a slice
closed_content_wikis = public_content_wikis.query("""status == 'closed'""").copy()

In [None]:
# number of closed content wikis per git closure date
closed_content_wikis.git_closed_dt.value_counts()

In [None]:
# prefix code of each database group; combined with a language code into '<prefix>/<language>' strings
db_group_prefix_map = dict(
    wikipedia='Wp',
    wikibooks='Wb',
    wiktionary='Wt',
    wikiquote='Wq',
    wikisource='Ws',
    wikinews='Wn',
    wikivoyage='Wy',
    wikiversity='Wv',
)

In [None]:
def generate_prefix(db_group, language_code, prefix_map=db_group_prefix_map):
    """Build the '<prefix>/<language_code>' string for a wiki.

    Parameters
    ----------
    db_group
        Database group of the wiki; must be a key of `prefix_map`.
    language_code
        Language code placed after the slash.
    prefix_map
        Mapping from database group to prefix code. Defaults to
        `db_group_prefix_map`; a custom mapping passed here is honoured.

    Returns
    -------
    str
        The prefix code and the language code joined by '/'.

    Raises
    ------
    KeyError
        If `db_group` is not a key of `prefix_map`.
    """
    return f'{prefix_map[db_group]}/{language_code}'

In [None]:
def _row_prefix(row):
    """Return the '<prefix>/<language>' string for one row of closed_content_wikis."""
    return generate_prefix(row.database_group, row.language_code)

# row-wise: derive the prefix of every closed wiki from its database group and language code
prefix_inputs = closed_content_wikis[['database_group', 'language_code']]
closed_content_wikis['prefix'] = prefix_inputs.apply(_row_prefix, axis=1)

In [None]:
# Earliest 'import' log entry on incubatorwiki for each prefix of the closed wikis.
# The {CLOSED_DBS} placeholder is filled below with the SQL tuple of those prefixes;
# the final SELECT renders the earliest timestamp as 'year-month-day' text.
import_log_query = """
WITH 
    logs AS (
        SELECT 
            log_id,
            log_timestamp,
            log_title,
            REGEXP_SUBSTR(log_title, 'W[a-z]/[a-z]+') AS prefix
        FROM 
            logging
        WHERE 
            log_type = 'import'
        HAVING
            REGEXP_SUBSTR(log_title, 'W[a-z]/[a-z]+') IN {CLOSED_DBS}),
    
    first_log AS (
        SELECT
            MIN(log_timestamp) AS log_timestamp,
            prefix
        FROM
            logs
        GROUP BY
            prefix)
    
SELECT
    prefix,
    CONCAT(YEAR(log_timestamp), '-', MONTH(log_timestamp), '-', DAY(log_timestamp)) AS first_log_timestamp
FROM 
    first_log
"""

# `mariadb` is the module imported from wmfdata in the first cell
incubator_import_log = mariadb.run(import_log_query.format(CLOSED_DBS=sql_tuple(closed_content_wikis.prefix.values)), dbs='incubatorwiki')
incubator_import_log.head()

In [None]:
# (rows, columns) of the import-log query result
incubator_import_log.shape

In [None]:
# attach the first import-log date to each closed wiki by prefix (left join keeps wikis without a log entry)
closed_content_wikis = closed_content_wikis.merge(incubator_import_log, on='prefix', how='left')
closed_content_wikis.head()

In [None]:
# closed wikis for which no import-log date was found
has_no_import_log = closed_content_wikis['first_log_timestamp'].isna()
close_missing = closed_content_wikis[has_no_import_log]
close_missing

In [None]:
# fetch the Meta-Wiki page of former stewards; the timeout keeps a stalled request from hanging the kernel
response = requests.get('https://meta.wikimedia.org/wiki/Stewards/Former_stewards', timeout=30)
# fail here on an HTTP error instead of parsing an error page in the next cells
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# first table on the page carrying the 'sortable' class, parsed into a DataFrame
former_stewards_table = soup.find('table', {'class': 'sortable'})
former_stewards = pd.read_html(str(former_stewards_table))[0]

In [None]:
# (rows, columns) of the closed content wikis frame
closed_content_wikis.shape

In [None]:
# accounts currently in the global 'steward' group, read from the centralauth database
# (`mariadb` is the module imported from wmfdata in the first cell)
current_stewards = mariadb.run("""
SELECT *
FROM global_user_groups ug
JOIN globaluser u
ON u.gu_id = ug.gug_user
WHERE gug_group = 'steward'


""", ['centralauth'], use_x1=True)

In [None]:
# all steward usernames: former ones (scraped table) followed by current ones (centralauth query)
former_steward_names = former_stewards.Username.values.tolist()
current_steward_names = current_stewards.gu_name.values.tolist()
stewards = former_steward_names + current_steward_names

In [None]:
# steward names passed through sql_tuple so they can be interpolated after `IN` in the queries below
stewards_sql = sql_tuple(stewards)

In [None]:
# Revision(s) on abwiktionary carrying the latest timestamp among edits made by accounts
# that are neither stewards nor one of the listed system accounts.
# (`mariadb` is the module imported from wmfdata in the first cell)
df = mariadb.run(f"""
WITH max_time AS (SELECT MAX(rev_timestamp) AS timestamp
FROM revision r
JOIN actor a
ON r.rev_actor = a.actor_id
WHERE NOT (actor_name IN {stewards_sql} OR actor_name IN ('Flow talk page manager', 'Global rename script', 'MediaWiki message delivery', 'Maintenance script')))

SELECT * FROM revision r
JOIN actor a ON r.rev_actor = a.actor_id
JOIN max_time m WHERE r.rev_timestamp = m.timestamp


""", ['abwiktionary'])


df




In [None]:
# NOTE: this silences every warning for the rest of the kernel session, not only for this query
warnings.filterwarnings('ignore')

# Latest non-steward revision timestamp of each closed wiki that has no import-log date.
# NOTE(review): `SELECT *` next to MAX() without GROUP BY leaves the non-aggregated columns
# indeterminate — confirm the returned row really is the one holding the latest timestamp.
# (`mariadb` is the module imported from wmfdata in the first cell)
df = mariadb.run(f"""
SELECT *, MAX(rev_timestamp) AS timestamp
FROM revision r
JOIN actor a
ON r.rev_actor = a.actor_id
WHERE actor_name NOT IN {stewards_sql}


""", close_missing.database_code.values.tolist())

# NOTE(review): assumes exactly one result row per database, in the order the databases were passed — confirm
df['wiki_db'] = close_missing.database_code.values.tolist()
df




In [None]:
# distinct actor names among the rows returned by the query above
df.actor_name.unique()