In [3]:
from wmfdata import mariadb
from wmfdata.utils import sql_tuple

from datetime import datetime
import pandas as pd
pd.options.display.max_columns = None

import re
import warnings

import requests
from bs4 import BeautifulSoup

[canonical-data](https://github.com/wikimedia-research/canonical-data/blob/master/wiki/wikis.tsv) is the best available and accessible source for the list of wikis, along with their visibility and status.

In [4]:
# load the canonical list of wikis (a tab-separated file) straight from the canonical-data repository
cd_wikis = pd.read_csv(
    'https://raw.githubusercontent.com/wikimedia-research/canonical-data/master/wiki/wikis.tsv',
    sep='\t',
)

# scope of this analysis: projects that are both publicly visible and publicly editable
public_wikis = cd_wikis.loc[
    lambda wikis: (wikis['visibility'] == 'public') & (wikis['editability'] == 'public')
].reset_index(drop=True)

# keep content projects only; this drops test wikis, organizational wikis, wikimania wikis etc.
content_db_groups = [
    'commons', 'wikibooks', 'wikidata', 'wikinews', 'wikipedia',
    'wikiquote', 'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary',
]
public_content_wikis = public_wikis[public_wikis['database_group'].isin(content_db_groups)]

## Gather git blame data


In [108]:
# Get the blame data into text files in the git_blame_data directory
# (for all, closed, private, and fishbowl).
# NOTE(review): get_blame.sh is not visible in this notebook; judging by the cell
# output it clones the mediawiki-config repository first — confirm against the script.
# The script is made executable before it is run.
!chmod +x get_blame.sh
!./get_blame.sh

Cloning into 'mediawiki-config'...
remote: Counting objects: 45, done[K
remote: Finding sources: 100% (45/45)[K[K
remote: Getting sizes: 100% (37/37)[K[K
remote: Compressing objects: 100% (2273075/2273075)[K[K
remote: Total 142362 (delta 17), reused 142329 (delta 0)[K MiB/s   
Receiving objects: 100% (142362/142362), 173.85 MiB | 41.88 MiB/s, done.
Resolving deltas: 100% (99901/99901), done.
yes: standard output: Broken pipe


In [5]:
# Parse the git blame text dumps into a nested dict:
#   blame_data[category][database_code] -> 'YYYY-MM-DD' string
# The dict is converted to a pandas DataFrame in a later cell.
blame_data = {}

for category in ['all', 'closed']:

    blame_data[category] = {}

    with open(f'git_blame_data/{category}.txt') as blame_file:
        # stream the file line by line instead of loading it whole
        for line in blame_file:
            # skip the "Do not edit it" header line and blank lines
            # (a blank line has no date and would otherwise raise IndexError below)
            if 'Do not edit it' in line or not line.strip():
                continue

            # the database code is whatever follows the last ')' on the line
            wiki_db = re.sub(r'.*\)', '', line).strip()
            # the first ISO-formatted date on the line is the blame date
            dt = re.findall(r'\d{4}-\d{2}-\d{2}', line)[0]
            blame_data[category][wiki_db] = dt

In [6]:
# turn the nested dict into a frame: one row per database, one column per blame category
blame_data_df = pd.DataFrame(blame_data).reset_index()
blame_data_df = blame_data_df.rename(
    columns={'index': 'database_code', 'all': 'git_created_dt', 'closed': 'git_closed_dt'}
)

# attach the git blame dates to the list of content dbs (left join keeps every content db)
public_content_wikis = public_content_wikis.merge(blame_data_df, on='database_code', how='left')

# convert the date columns to datetime format; values that cannot be parsed become NaT
for date_col in ('git_created_dt', 'git_closed_dt'):
    public_content_wikis[date_col] = pd.to_datetime(
        public_content_wikis[date_col], yearfirst=True, errors='coerce'
    )

public_content_wikis.head()

Unnamed: 0,database_code,domain_name,database_group,language_code,language_name,status,visibility,editability,english_name,git_created_dt,git_closed_dt
0,aawiki,aa.wikipedia.org,wikipedia,aa,Afar,closed,public,public,Afar Wikipedia,2012-02-24,2012-02-24
1,aawikibooks,aa.wikibooks.org,wikibooks,aa,Afar,closed,public,public,Afar Wikibooks,2012-02-24,2012-02-24
2,aawiktionary,aa.wiktionary.org,wiktionary,aa,Afar,closed,public,public,Afar Wiktionary,2012-02-24,2012-02-24
3,abwiki,ab.wikipedia.org,wikipedia,ab,Abkhazian,open,public,public,Abkhazian Wikipedia,2012-02-24,NaT
4,abwiktionary,ab.wiktionary.org,wiktionary,ab,Abkhazian,closed,public,public,Abkhazian Wiktionary,2012-02-24,2012-02-24


In [7]:
# persist the content wikis together with their git blame dates
public_content_wikis.to_csv(path_or_buf='git_blame_data.csv')

In [8]:
# the three most frequent dates on which projects were added to the mediawiki_config files
created_top3 = public_content_wikis['git_created_dt'].value_counts().head(3)
closed_top3 = public_content_wikis['git_closed_dt'].value_counts().head(3)

print('creation:\n', created_top3, '\n')
print('closed:\n', closed_top3)

creation:
 2012-02-24    695
2012-05-08     72
2012-11-06      8
Name: git_created_dt, dtype: int64 

closed:
 2012-02-24    89
2012-05-16     5
2013-07-23     2
Name: git_closed_dt, dtype: int64


## wiki creation date

### 2001 to mid-2006
approach: earliest revision timestamp available from the [revision table](https://www.mediawiki.org/wiki/Manual:Revision_table) and the [archive table](https://www.mediawiki.org/wiki/Manual:Archive_table)

In [9]:
# wikis whose git date was recorded on (or before) 24 February 2012
pre2012 = public_content_wikis.loc[
    lambda wikis: wikis['git_created_dt'] <= datetime(2012, 2, 24)
].reset_index(drop=True)

# wikis that graduated from incubator until 2012
# (manually created flat file based on: https://incubator.wikimedia.org/wiki/Incubator:Site_creation_log)
incubator_pre2012_logs = pd.read_csv('incubator_logs/incubator_site_creation_log_pre2012.csv')

# projects created before 2012 whose domain does not appear in the incubator log
non_incubator_creations = pre2012[
    ~pre2012['domain_name'].isin(incubator_pre2012_logs['domain_name'].values.tolist())
]

In [50]:
# Query to fetch the least available rev timestamp between the revision and the archive table.
#
# Four candidate timestamps are computed, each tagged with an `approach` label:
#   - least_rev_dt:         rev_timestamp of the row holding the smallest rev_id in `revision`
#   - least_ar_rev_dt:      ar_timestamp of the row holding the smallest ar_rev_id in `archive`
#   - min_rev_timestamp:    MIN(rev_timestamp) over `revision`
#   - min_ar_rev_timestamp: MIN(ar_timestamp) over `archive`
# The candidates are unioned, those dated before 2001-01-15 are discarded, and the
# earliest remaining one (ROW_NUMBER ordered by dt) is returned as a date with its label.
#
# NOTE(review): the label 'leart_ar_rev' looks like a typo for 'least_ar_rev'; it is
# left untouched here because it is part of the query text and appears in the results.
# NOTE(review): 2001-01-15 is presumably the earliest plausible date for any wiki — confirm.
least_rev_timestamp_query = """
WITH least_rev_dt
     AS (SELECT rev_timestamp AS dt,
                'least_rev'   AS approach
           FROM revision
          WHERE rev_id = (SELECT Min(rev_id)
                            FROM revision)),
                            
     least_ar_rev_dt
     AS (SELECT ar_timestamp   AS dt,
                'leart_ar_rev' AS approach
           FROM archive
          WHERE ar_rev_id = (SELECT Min(ar_rev_id)
                               FROM archive)),
                               
     min_rev_timestamp
     AS (SELECT Min(rev_timestamp) AS dt,
                'min_rev_dt'       AS approach
           FROM revision),
           
     min_ar_rev_timestamp
     AS (SELECT Min(ar_timestamp) AS dt,
                'min_ar_rev_dt'   AS approach
           FROM archive),
           
     all_dts
     AS (SELECT *
           FROM least_rev_dt
         UNION ALL
         SELECT *
           FROM least_ar_rev_dt
         UNION ALL
         SELECT *
           FROM min_rev_timestamp
         UNION ALL
         SELECT *
           FROM min_ar_rev_timestamp),
           
     ranking
     AS (SELECT *,
                ROW_NUMBER()
                  OVER (
                    ORDER BY dt ) AS row_num
           FROM all_dts
          WHERE NOT DATE (dt) < DATE ('2001-01-15'))
          
SELECT DATE (dt) AS dt,
       approach
  FROM ranking
 WHERE row_num = 1
"""

In [51]:
# silence warnings for the rest of the session
warnings.filterwarnings('ignore')

# run the query against every non-incubator wiki database
least_rev_dt = mariadb.run(
    least_rev_timestamp_query,
    non_incubator_creations['database_code'].values.tolist(),
)

# label each result row with its database (positional: assumes one row per database, in order)
# and reduce the timestamp to a plain date; unparseable values become NaT
least_rev_dt = least_rev_dt.assign(
    database_code=non_incubator_creations['database_code'].values.tolist(),
    dt=lambda results: pd.to_datetime(results['dt'], yearfirst=True, errors='coerce')
                         .apply(lambda ts: ts.date()),
)
least_rev_dt.head()

Unnamed: 0,dt,approach,database_code
0,2004-02-24,min_ar_rev_dt,aawiki
1,2004-08-13,min_rev_dt,aawikibooks
2,2004-01-29,min_ar_rev_dt,aawiktionary
3,2003-10-12,min_rev_dt,abwiki
4,2004-01-29,leart_ar_rev,abwiktionary


### mid-2006 to August 2010
approach: manually curated list based on [Wikimedia Incubator Site Creation Log](https://incubator.wikimedia.org/wiki/Incubator:Site_creation_log)

In [25]:
# load the manually extracted and cleaned incubator site creation logs (2006 to 2010),
# parse the dates, and swap each domain name for its database code
incubator_200610_logs = (
    pd.read_csv('incubator_logs/incubator_site_creation_log_2006-2010.csv')
    .rename(columns={'date': 'dt'})
    .assign(dt=lambda logs: pd.to_datetime(logs['dt'], errors='coerce'))
    .merge(cd_wikis[['domain_name', 'database_code']], on='domain_name', how='left')
    .drop(columns='domain_name')
)
incubator_200610_logs.head()

Unnamed: 0,dt,database_code
0,2006-09-30,barwiki
1,2006-09-30,bpywiki
2,2006-09-30,bxrwiki
3,2006-09-30,cdowiki
4,2006-09-30,cbk_zamwiki


### August 2010 to March 2012
approach: scraped from automated messages to [new projects mailing list](https://lists.wikimedia.org/hyperkitty/list/newprojects@lists.wikimedia.org/)

In [18]:
# function to extract the wiki from each thread and its timestamp
def extract_messages(year, month):
    """Scrape one monthly archive page of the newprojects mailing list.

    Parameters
    ----------
    year, month : int
        Archive month to fetch; both are interpolated into the archive URL.

    Returns
    -------
    dict
        Maps each thread title (with the 'New wiki: ' prefix removed and
        surrounding whitespace stripped) to the ``datetime.date`` parsed from
        the thread's date element. Empty when the page lists no threads.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched (including a timeout).
    AttributeError
        If a thread lacks the expected title or date element.
    ValueError
        If a thread's date title does not match the expected format.
    """
    url = f'https://lists.wikimedia.org/hyperkitty/list/newprojects@lists.wikimedia.org/{year}/{month}/'

    # a timeout keeps the notebook from hanging forever on an unresponsive server
    result = requests.get(url, timeout=30)
    page_content = BeautifulSoup(result.content, 'html.parser')

    creation_dts = {}
    for thread in page_content.find_all('div', {'class': 'thread-email row'}):
        wiki_db = thread.find('span', class_='thread-title').text.replace('New wiki: ', '').strip()
        # the full timestamp is stored in the `title` attribute of the thread-date element
        date = thread.find('div', class_='thread-date').get('title')
        date = datetime.strptime(date, '%A, %d %B %Y %H:%M:%S').date()
        creation_dts[wiki_db] = date

    return creation_dts

In [19]:
# scrape every monthly archive page from January 2010 through December 2012;
# a title seen again in a later month overrides the earlier entry
creation_dts = {}
for year in range(2010, 2013):
    for month in range(1, 13):
        creation_dts.update(extract_messages(year, month))

# one row per thread title, with the title exposed as `database_code`
creation_dt_201012 = (
    pd.DataFrame(creation_dts.values(), index=creation_dts.keys(), columns=['dt'])
    .reset_index()
    .rename(columns={'index': 'database_code'})
)
creation_dt_201012

Unnamed: 0,database_code,dt
0,movementroleswiki,2010-07-31
1,noboard_chapterswikimedia,2010-07-20
2,"Tue, 20 Jul 2010 13:30:57 GMT",2010-07-20
3,frrwiki,2010-08-24
4,mkwikimedia,2010-08-20
5,kowikinews,2010-08-20
6,tenwiki,2010-11-09
7,tenwikipedia,2010-10-27
8,10wikipedia,2010-10-27
9,etwikimedia,2010-10-23


In [22]:
# clean erroneous data
# The cleaned frame is rebuilt from plain records in a single pass. Dropping and
# re-concatenating rows (with a reset index) while iterating over the original
# index makes later labels point at the wrong rows.
kept_rows = []
expanded_rows = []

for db_code, dt in zip(creation_dt_201012['database_code'], creation_dt_201012['dt']):

    # titles that contain a clock time are not database codes: discard them
    if re.search(r'\d{2}:\d{2}:\d{2}', db_code):
        continue

    # a comma preceded by at least one character marks a title announcing several wikis
    if re.search(r'.,', db_code):
        # the wiki list sits after the first colon; fall back to the whole title
        # when there is no colon instead of raising IndexError
        parts = db_code.split(':')
        listing = parts[1] if len(parts) > 1 else parts[0]
        expanded_rows.extend(
            {'database_code': name.strip(), 'dt': dt} for name in listing.split(',')
        )
    else:
        kept_rows.append({'database_code': db_code, 'dt': dt})

# untouched rows keep their order; rows split out of multi-wiki titles are appended at the end
creation_dt_201012 = pd.DataFrame(kept_rows + expanded_rows, columns=['database_code', 'dt'])

### post 2013

In [23]:
# wikis whose git creation date falls on or after 1 January 2013; the git date is used as `dt`
post2013 = public_content_wikis.loc[
    public_content_wikis['git_created_dt'] >= datetime(2013, 1, 1),
    ['git_created_dt', 'database_code'],
].rename(columns={'git_created_dt': 'dt'})
post2013.head()

Unnamed: 0,dt,database_code
6,2016-02-02,adywiki
15,2021-02-22,altwiki
16,2021-10-23,amiwiki
25,2023-03-22,anpwiki
36,2020-07-12,arywiki


### consolidating all the sources

In [52]:
# approach label -> frame of creation dates obtained through that approach
sources = dict(
    least_rev=least_rev_dt,
    site_creation_log=incubator_200610_logs,
    mail_notification=creation_dt_201012,
    git_db_creation=post2013,
)

In [53]:
# Stack every source into one frame, tagging each row with the approach that produced it.
# - `assign` works on a copy, so the source frames are not mutated and the cell can be re-run.
# - a single concat replaces growing the frame inside a loop.
# - the column order is pinned explicitly so it does not depend on which source comes first.
creation_dts = pd.concat(
    [df.assign(approach=approach) for approach, df in sources.items()]
)[['dt', 'database_code', 'approach']]

# keep only the wikis that are in scope (public content wikis)
creation_dts = creation_dts[
    creation_dts['database_code'].isin(public_content_wikis['database_code'].values.tolist())
]
creation_dts['approach'].value_counts()

least_rev            675
git_db_creation      117
site_creation_log     60
mail_notification     38
Name: approach, dtype: int64

In [40]:
# public content wikis for which none of the sources provided a creation date
public_content_wikis.loc[
    ~public_content_wikis['database_code'].isin(creation_dts['database_code'].values.tolist()),
    'database_code',
].values

array(['nsowiki', 'orwiktionary'], dtype=object)

In [41]:
# from https://incubator.wikimedia.org/wiki/Incubator:Site_creation_log#2010
# Columns are named explicitly. Borrowing them positionally from another frame
# would silently swap `database_code` and `approach` whenever that frame's
# column order differs from the order of the values below.
missing_dbs = pd.DataFrame(
    [[datetime(2011, 10, 29), 'nsowiki', 'site_creation_log'],
     [datetime(2011, 9, 28), 'orwiktionary', 'site_creation_log']],
    columns=['dt', 'database_code', 'approach'],
)
missing_dbs

Unnamed: 0,dt,database_code,approach
0,2011-10-29,nsowiki,site_creation_log
1,2011-09-28,orwiktionary,site_creation_log


In [42]:
# append the manually sourced wikis
creation_dts = pd.concat([creation_dts, missing_dbs], ignore_index=True)

# `dt` mixes datetime.date objects and Timestamps depending on the source; convert it to a
# single datetime dtype before sorting, because ordering a date against a Timestamp
# raises TypeError on recent pandas versions
creation_dts['dt'] = pd.to_datetime(creation_dts['dt'], yearfirst=True)
creation_dts = creation_dts.sort_values('dt', ignore_index=True)

In [43]:
# every public content wiki must now have an entry in creation_dts
assert public_content_wikis['database_code'].isin(creation_dts['database_code'].values.tolist()).all()

In [44]:
# wikis whose creation date is missing
creation_dts.loc[creation_dts['dt'].isna()]

Unnamed: 0,dt,database_code,approach
886,NaT,iowiki,least_rev
887,NaT,roa_rupwiki,least_rev
888,NaT,roa_rupwiktionary,least_rev
889,NaT,scwiki,least_rev
890,NaT,tpiwiki,least_rev
891,NaT,zh_min_nanwiki,least_rev


In [45]:
# make sure `dt` is a proper datetime column
creation_dts = creation_dts.assign(dt=pd.to_datetime(creation_dts['dt'], yearfirst=True))

In [46]:
# sanity check: rows dated on or before 15 January 2001
creation_dts[creation_dts['dt'] <= datetime(2001, 1, 15)]

Unnamed: 0,dt,database_code,approach


In [49]:
# spot check a single wiki
creation_dts[creation_dts['database_code'] == 'zhwikisource']

Unnamed: 0,dt,database_code,approach
106,2003-11-26,zhwikisource,least_rev
