In [34]:
#!/usr/bin/env python3
import wmfdata as wmf
import pandas as pd
import matplotlib

matplotlib.style.use('ggplot')

spark = wmf.spark.get_custom_session(
    master="yarn",
    spark_config={
        "spark.driver.memory": "16g",
        "spark.dynamicAllocation.maxExecutors": 128,
        "spark.executor.memory": "16g",
        "spark.executor.cores": 4,
        "spark.sql.shuffle.partitions": 512
    }
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [11]:
query = """
WITH active_editors AS (
SELECT
    cast(substr(event_timestamp,1,4) as int) as year, 
    cast(substr(event_timestamp,6,2) as int) as month,
    COUNT(DISTINCT event_user_id) AS num_active_editors
FROM wmf.mediawiki_history  
WHERE snapshot = '2022-07'
AND wiki_db = 'enwiki'
AND array_contains(event_user_groups,'bot') = FALSE
AND revision_is_identity_reverted = FALSE
GROUP BY
    cast(substr(event_timestamp,1,4) as int),
    cast(substr(event_timestamp,6,2) as int)
),

bytes_added AS (
SELECT
    cast(substr(event_timestamp ,1,4) as int) as year, 
    cast(substr(event_timestamp,6,2) as int) as month,
    SUM(revision_text_bytes_diff) AS num_bytes_added
FROM wmf.mediawiki_history   
WHERE snapshot = '2022-07'
AND wiki_db = 'enwiki'
AND array_contains(event_user_groups,'bot') = FALSE
AND revision_is_identity_reverted = FALSE
AND revision_text_bytes_diff > 0
GROUP BY
    cast(substr(event_timestamp,1,4) as int),
    cast(substr(event_timestamp,6,2) as int)
),

articles_created AS (
SELECT
    cast(substr(start_timestamp ,1,4) as int) as year, 
    cast(substr(start_timestamp,6,2) as int) as month,
    COUNT(*) AS num_articles_created
FROM wmf.mediawiki_page_history 
WHERE snapshot = '2022-07'
AND page_namespace = 1
GROUP BY
    cast(substr(start_timestamp,1,4) as int),
    cast(substr(start_timestamp,6,2) as int)
),

articles_deleted AS (
SELECT
    cast(substr(end_timestamp ,1,4) as int) as year, 
    cast(substr(end_timestamp,6,2) as int) as month,
    COUNT(*) AS num_articles_deleted
FROM wmf.mediawiki_page_history 
WHERE snapshot = '2022-07'
AND page_namespace = 1
GROUP BY
    cast(substr(end_timestamp,1,4) as int),
    cast(substr(end_timestamp,6,2) as int)
),

num_articles AS (
SELECT
    articles_created.month,
    articles_created.year,
    COALESCE(articles_deleted.num_articles_deleted,0) AS articles_deleted,
    articles_created.num_articles_created,
    articles_created.num_articles_created-COALESCE(articles_deleted.num_articles_deleted,0) AS article_diff,
    SUM(articles_created.num_articles_created-COALESCE(articles_deleted.num_articles_deleted,0)) OVER (ORDER BY articles_created.year, articles_created.month) AS num_articles 
FROM articles_created
LEFT JOIN articles_deleted
ON (articles_created.month = articles_deleted.month AND articles_created.year = articles_deleted.year)
WHERE articles_created.month IS NOT NULL
AND articles_created.year IS NOT NULL
)

SELECT active_editors.month, active_editors.year, active_editors.num_active_editors, bytes_added.num_bytes_added, num_articles.num_articles
FROM active_editors
INNER JOIN bytes_added
ON active_editors.year = bytes_added.year AND active_editors.month = bytes_added.month
INNER JOIN num_articles
ON active_editors.year = num_articles.year AND num_articles.month = bytes_added.month
"""

df = wmf.spark.run(query)
df

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
22/08/07 20:27:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                ]]

Unnamed: 0,month,year,num_active_editors,num_bytes_added,num_articles
0,1,2001,59,299346,1
1,9,2014,114682,809505763,17869786
2,7,2018,110440,790143950,25803728
3,8,2018,112766,807195982,26099767
4,9,2004,9864,320410731,150428
...,...,...,...,...,...
255,4,2006,102760,3039710025,818200
256,7,2014,118782,792280857,17459777
257,8,2021,115129,1081811555,32424097
258,8,2008,140757,2747485664,4807467


In [12]:
result_df = df.copy()
min_year = result_df['year'].min()
result_df['wiki_age'] = result_df['year'].subtract(min_year).multiply(12).add(result_df['month'])
result_df = result_df.sort_values(['year','month'])

In [14]:
FILEPATH = '/home/jmads/datasets/momentum/active_editors_content_added_8-7-22.csv'

result_df.to_csv(FILEPATH,index=False)