# Getting reliability templates from mediawiki_wikitext_history

In [10]:
# Reliability-related template names to look for, one per line in the text file.
# - The file is opened in a `with` block so the handle is closed deterministically.
# - Blank lines are skipped: an empty name would make the dump loop further down
#   write (with mode='overwrite') to outputHDFS + '/' itself.
# - Names are lower-cased because the extraction UDF compares `template.lower()`
#   against this list, so an entry with capitals could never match.
with open('templates_selfPromotion.txt') as templates_file:
    templates = [line.strip().lower() for line in templates_file if line.strip()]
# Config folders: HDFS directory that receives every output of this notebook.
outputHDFS = 'YOUR-FOLDER-HERE'

In [11]:
!hadoop fs -mkdir $outputHDFS

mkdir: `collaborationPatterns': File exists


In [12]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import ArrayType, StringType
import re 
def _extract_template_names(wikitext):
    """Return the distinct template names found in ``wikitext``.

    A template name is the text between ``{{`` and the first ``|`` (or the
    closing ``}}``), stripped of surrounding whitespace. Names are returned in
    order of first appearance, without duplicates.

    Returns None when ``wikitext`` is not a string (e.g. a NULL revision text).

    Known Issues:
    * Doesn't handle nested templates (only the outermost one is reported)
    -- e.g., '{{cite web|url=http://www.allmusic.com/|ref={{harvid|AllMusic}}}}'
       yields only 'cite web'; the nested 'harvid' is missed.
    """
    if not isinstance(wikitext, str):
        return None
    # Raw string: '\{' and '\}' are regex escapes, not Python string escapes.
    template_bodies = re.findall(r'(?<=\{\{)(.*?)(?=\}\})', wikitext, flags=re.DOTALL)
    # dict.fromkeys de-duplicates while keeping a deterministic (first-seen) order.
    return list(dict.fromkeys(body.split('|')[0].strip() for body in template_bodies))


@udf(returnType=ArrayType(StringType()))
def getTemplatesRegex(wikitext):
    """Extract list of templates from wikitext for an article via simple regex.

    Spark UDF wrapper around ``_extract_template_names``; see that function
    for the matching rules and known limitations.

    Returns:
        list of distinct template names, or None if ``wikitext`` is not a string.
    """
    return _extract_template_names(wikitext)


@udf(returnType=ArrayType(StringType()))
def getTemplatesRegexRelaibility(wikitext):
    """
    Same extraction as getTemplatesRegex, but filtered by the list of templates.

    A template is kept when its lower-cased name is in the notebook-level
    ``templates`` list; the name is returned with its original spelling.

    Returns:
        list of matching template names, or None when there is no match or
        ``wikitext`` is not a string. ``templates`` must be defined before the
        UDF is evaluated; a missing list now raises instead of silently
        producing None for every row.
    """
    template_names = _extract_template_names(wikitext)
    if not template_names:
        return None
    matching_templates = [name for name in template_names if name.lower() in templates]
    # An empty result is reported as None, as before.
    return matching_templates or None



In [13]:
# Snapshot and wiki whose revision wikitext is queried.
snapshot = "2020-09"
wikidb = "enwiki"

# Query template; the placeholders are filled from the two settings above.
wikitext_query = '''SELECT page_id,revision_id,revision_text,page_namespace FROM wmf.mediawiki_wikitext_history 
    WHERE snapshot ="{snapshot}" and wiki_db ="{wikidb}"'''
wikitext_history = spark.sql(wikitext_query.format(snapshot=snapshot, wikidb=wikidb))


In [14]:
# Add a "templates" column holding the reliability templates found in each revision's text.
matched_templates = getTemplatesRegexRelaibility(wikitext_history['revision_text'])
wikitext_history = wikitext_history.withColumn("templates", matched_templates)


In [15]:
from pyspark.sql.functions import explode
# One row per (page_id, revision_id, template); the exploded column keeps
# Spark's default name `col`, which the cells below rely on.
revisions_with_template = wikitext_history.select('page_id', 'revision_id', explode('templates'))

In [16]:
# Persist the extracted rows, replacing any previous run's output.
revisions_with_template.write.mode('overwrite').parquet(outputHDFS+'/templates.parquet')

# Generating Dumps

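* For every page that has at least one revision containing one of the templates, generate the full list of that page's revisions
* Enrich each revision with additional meta information (revert status, size, editor details, and whether the template is present in that revision)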

In [17]:
# Reload the (page_id, revision_id, col) rows written by the extraction step.
revisions_with_template = spark.read.format('parquet').load(outputHDFS+'/templates.parquet')
# (caching is currently disabled)
#revisions_with_template.cache()


In [18]:
# Rows per template spelling, most frequent first (up to 100 spellings).
(revisions_with_template
    .groupBy('col')
    .count()
    .sort('count', ascending=False)
    .show(100))

+-------------+------+
|          col| count|
+-------------+------+
|       advert|968433|
|       Advert|755355|
|      peacock|310418|
|       weasel|302263|
|       Weasel|221102|
|      Peacock|185196|
|       fanpov|120206|
|autobiography| 92810|
|       Fanpov| 74542|
|Autobiography| 25031|
|       FanPOV|   212|
|       WEASEL|    98|
|       ADVERT|    25|
|      PEACOCK|    18|
|       fanPOV|    11|
|AUTOBIOGRAPHY|     2|
|       AdVert|     1|
|autoBiography|     1|
+-------------+------+



In [22]:
# Same per-spelling counts, shown with truncate=False.
spelling_counts = revisions_with_template.groupBy('col').count()
spelling_counts.orderBy(spelling_counts['count'].desc()).show(100, truncate=False)

+-------------+------+
|col          |count |
+-------------+------+
|advert       |968433|
|Advert       |755355|
|peacock      |310418|
|weasel       |302263|
|Weasel       |221102|
|Peacock      |185196|
|fanpov       |120206|
|autobiography|92810 |
|Fanpov       |74542 |
|Autobiography|25031 |
|FanPOV       |212   |
|WEASEL       |98    |
|ADVERT       |25    |
|PEACOCK      |18    |
|fanPOV       |11    |
|AUTOBIOGRAPHY|2     |
|AdVert       |1     |
|autoBiography|1     |
+-------------+------+



In [39]:
from time  import time


pages_templates_subset = revisions_with_template.select('page_id').distinct()
pages_templates_subset.createOrReplaceTempView('pages_templates_subset')

mediawiki_history_subset =  spark.sql('''
        SELECT w.event_timestamp, w.page_title,w.page_id,w.page_namespace, 
        w.revision_id, w.revision_is_identity_reverted, 
        w.revision_minor_edit, w.revision_text_bytes, 
        w.revision_first_identity_reverting_revision_id, w.revision_seconds_to_identity_revert,
        w.event_user_id,w.event_user_registration_timestamp, 
        w.event_user_is_anonymous,w.event_user_revision_count,

        w.event_comment
        FROM wmf.mediawiki_history w
        WHERE w.snapshot ="2020-09" and w.wiki_db ="enwiki" AND  
      w.event_entity = 'revision' AND w.page_id IN (
                    SELECT  page_id FROM pages_templates_subset)                   
        ''')
mediawiki_history_subset.cache()
mediawiki_history_subset.createOrReplaceTempView('mediawiki_history_subset')

for template in templates:
    try:
        t1 = time()
        print(template)
        df = revisions_with_template.where(revisions_with_template['col']==template)
        df.cache()
        t2 = time()
        print('read table, done',t2-t1)
        t1 = time()        
        page_ids = df.select('page_id').distinct()
        page_ids.createOrReplaceTempView('tmp_page_ids')
        revision_ids = df.select('revision_id').distinct()
        revision_ids.createOrReplaceTempView('tmp_revision_ids')
        reverts= spark.sql('''
        SELECT w.event_timestamp, w.page_title,w.page_id, w.page_namespace,
        w.revision_id, w.revision_is_identity_reverted, 
        w.revision_minor_edit, w.revision_text_bytes, 
        w.revision_first_identity_reverting_revision_id, w.revision_seconds_to_identity_revert,
        w.event_user_id,w.event_user_registration_timestamp, 
        w.event_user_is_anonymous,w.event_user_revision_count,
       CASE WHEN r.revision_id IS NOT NULL  THEN 1 ELSE 0 END has_template,
        w.event_comment
       
FROM mediawiki_history_subset w LEFT OUTER JOIN tmp_revision_ids r 
                            ON (w.revision_id = r.revision_id)

WHERE  w.page_id IN (
                    SELECT  page_id FROM tmp_page_ids) 
ORDER BY page_id, w.revision_id
''') 
        reverts.repartition(1).write.csv(outputHDFS+'/'+template,header=True,mode='overwrite',sep='\t')
        t2 = time()
        print('save table, done',t2-t1)
        t1 = time()   
        templateout = template.replace(' ','_')
        !hadoop fs -text "$outputHDFS/$template/*"  | gzip > $outputHDFS-$template-meta-info.csv.gz
        t2 = time()
        print('-----',t2-t1)
    except Exception as e:
        print('error',e)


autobiography
read table, done 0.0330965518951416
save table, done 23.176331043243408
21/02/13 18:33:18 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 20.40220880508423
advert
read table, done 0.04973602294921875
save table, done 80.69189429283142
21/02/13 18:35:00 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 116.54059410095215
fanpov
read table, done 0.05591607093811035
save table, done 23.369151830673218
21/02/13 18:37:20 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 22.441009998321533
peacock
read table, done 0.050488948822021484
save table, done 70.40525674819946
21/02/13 18:38:53 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 57.17970681190491
weasel
read table, done 0.05831432342529297
save table, done 82.42752432823181
21/02/13 18:41:12 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 90.90377163887024


In [34]:
import pandas as pd

# Local dump produced by the per-template loop, which names its files
# "<outputHDFS>-<template>-meta-info.csv.gz"; building the path from outputHDFS
# keeps this cell working for any configured output folder.
meta_info_path = outputHDFS + '-autobiography-meta-info.csv.gz'
read_options = dict(compression='gzip', sep='\t')
try:
    # pandas >= 1.3: malformed lines are skipped with a warning.
    df = pd.read_csv(meta_info_path, on_bad_lines='warn', **read_options)
except TypeError:
    # Older pandas does not know `on_bad_lines`; same behaviour via the legacy flag.
    df = pd.read_csv(meta_info_path, error_bad_lines=False, **read_options)

b'Skipping line 307394: expected 16 fields, saw 17\n'
b'Skipping line 345644: expected 16 fields, saw 17\n'
b'Skipping line 923094: expected 16 fields, saw 17\n'
b'Skipping line 1085902: expected 16 fields, saw 17\nSkipping line 1085903: expected 16 fields, saw 17\n'
b'Skipping line 1169025: expected 16 fields, saw 17\n'
b'Skipping line 1620566: expected 16 fields, saw 17\n'
b'Skipping line 1746451: expected 16 fields, saw 17\n'
b'Skipping line 1776388: expected 16 fields, saw 17\n'
b'Skipping line 2037272: expected 16 fields, saw 18\n'
b'Skipping line 3116407: expected 16 fields, saw 17\n'
b'Skipping line 3187195: expected 16 fields, saw 18\n'
b'Skipping line 3413792: expected 16 fields, saw 17\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [38]:
# Interactive IPython help lookup for the CSV writer of `reverts`; it requires
# `reverts` to exist from the dump loop and produces no data.
?reverts.write.csv