## 0a - Get the Outlinks 

Output: 

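    dictionary revid: outlinks in that revision (redirects resolved)
    pandas Series month (timestamp index): lowest revid in that month
    DataFrame of all page revisions
    
    01_en_as_rev_outlinks_m.pkl
    01_en_as_rev_time_m.pkl
    01_en_as_rev_everything.pkl
    01_ar_as_rev_outlinks_m.pkl
    01_ar_as_rev_time_m.pkl
    01_ar_as_rev_everything.pkl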
     
The Plan:
- get all of the article revisions
- group by month (keep the lowest revid of each month)
- resolve the page redirects (for articles that redirect to another article)
- save data

1. English (Arab Spring)
2. Arabic (الربيع العربي)

In [2]:
import wikifunctions as wf
import pickle
import os
import pandas as pd

## 1. English Outlinks and Revisions History

In [3]:
# Fetch every revision of the English 'Arab Spring' article via the
# wikifunctions helper.
en_rev_df = wf.get_all_page_revisions('Arab Spring')

# Preview the first five rows of the revision table.
en_rev_df.head(5)

Unnamed: 0,revid,parentid,user,userid,timestamp,size,sha1,comment,anon,userhidden,sha1hidden,commenthidden,page,date,diff,lag,age
0,410393753,0,Jmj713,4589092,2011-01-27 16:55:08+00:00,822,cb89ac8f03c78acd30186e37f8c5280000201347,[[WP:AES|←]]Created page with 'The '''2010–201...,,,,,Arab Spring,2011-01-27,,,0.0
1,410405823,410393753,Jmj713,4589092,2011-01-27 18:01:13+00:00,936,60514406d46c7f7ff92c0863af12a0204c167d8b,,,,,,Arab Spring,2011-01-27,114.0,3965.0,0.045891
2,410406936,410405823,Jmj713,4589092,2011-01-27 18:07:38+00:00,999,a2ab6330b31fc8b41539bee02f4ed2222159d83d,,,,,,Arab Spring,2011-01-27,63.0,385.0,0.050347
3,410407396,410406936,Jmj713,4589092,2011-01-27 18:10:05+00:00,1094,23dd3ad50ee8431094e311d323b3a08e19c6ca5d,,,,,,Arab Spring,2011-01-27,95.0,147.0,0.052049
4,410413036,410407396,Jmj713,4589092,2011-01-27 18:43:56+00:00,1168,a9302194b9f68fefdabd3f379185c2e07392b399,,,,,,Arab Spring,2011-01-27,74.0,2031.0,0.075556


In [4]:
#group by month: keep one revision per calendar month (the lowest revid).
#Months in which the article had no revision come out of the monthly grouping
#as NaN (which also turns the column into floats), so drop them -- as the
#Arabic section does -- and restore an integer dtype for the revids.
en_freq_revids = (
    en_rev_df
    .groupby(pd.Grouper(key='timestamp', freq='M'))
    .agg({'revid': 'min'})['revid']
    .dropna()
    .astype('int64')
)

In [5]:
#make outlinks dict: revid -> outlinks of that revision
outlinks = {}

for _revid in en_freq_revids.values:
    # A month without revisions shows up as NaN; there is nothing to fetch for it.
    if pd.isna(_revid):
        continue
    # Hand the helper a plain int (the monthly aggregation can yield floats),
    # matching what the Arabic section does. The dict key is left unchanged.
    outlinks[_revid] = wf.get_revision_outlinks(int(_revid))

In [6]:
# Swap each revision's outlinks for their redirect-resolved counterpart,
# updating the dictionary in place (keys are untouched).
for revid, links in outlinks.items():
    outlinks[revid] = wf.resolve_redirects(links)

In [9]:
#save data

#move to the data folder; guarded so that re-running this cell does not try
#to descend into Data/Data (the working directory persists in the kernel).
#Still raises if the Data folder does not exist.
if os.path.basename(os.getcwd()) != "Data":
    os.chdir("Data")

#save outlinks as revid:outlinks
with open('01_en_as_rev_outlinks_m.pkl', 'wb') as f:
    pickle.dump(outlinks, f)

#save the per-month Series (month timestamp -> revid)
with open('01_en_as_rev_time_m.pkl', 'wb') as f:
    pickle.dump(en_freq_revids, f)

#save the full revision table
with open('01_en_as_rev_everything.pkl', 'wb') as f:
    pickle.dump(en_rev_df, f)
    

## 2. Arabic Outlink and Revisions History

In [10]:
# Fetch every revision of the Arabic article, pointing the helper at the
# Arabic Wikipedia API endpoint.
ar_rev_df = wf.get_all_page_revisions(
    'الربيع العربي',
    endpoint='ar.wikipedia.org/w/api.php',
)

In [11]:
# One revision per calendar month (the lowest revid), then discard the
# months that have no value (NaN) after the aggregation.
ar_freq_revids = (
    ar_rev_df
    .groupby(pd.Grouper(key='timestamp', freq='M'))
    .agg({'revid': 'min'})['revid']
    .dropna()
)

In [12]:
# Build revid -> outlinks for each monthly Arabic revision.
ar_outlinks = dict()
ar_endpoint = 'ar.wikipedia.org/w/api.php'

for revid in ar_freq_revids.values:
    # The helper is given an int; the stored key stays the raw value
    # (which may be a float after the monthly aggregation).
    ar_outlinks[revid] = wf.get_revision_outlinks(int(revid), endpoint=ar_endpoint)

In [13]:
# Swap each revision's outlinks for their redirect-resolved counterpart,
# updating the dictionary in place (keys are untouched).
for revid, links in ar_outlinks.items():
    ar_outlinks[revid] = wf.resolve_redirects(links)

In [14]:
#save data
#(paths are relative to the current working directory, which the English
# save cell switches to the Data folder via os.chdir)

ar_dumps = (
    ('01_ar_as_rev_outlinks_m.pkl', ar_outlinks),   # revid -> outlinks
    ('01_ar_as_rev_time_m.pkl', ar_freq_revids),    # per-month Series of revids
    ('01_ar_as_rev_everything.pkl', ar_rev_df),     # full revision table
)

for pkl_name, payload in ar_dumps:
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)