# Pulling info from Wikimedia

In [None]:
import datetime
import json
import time

import mwclient
import pandas
import requests
from mwclient import Site
from pandas import read_csv

# HTTP header that identifies this client to Wikimedia; the address is a
# placeholder to be replaced with a real contact.
useragent = {'User-Agent': 'Gene Wiki Review Impact (youremail@domain)'}

# mwclient session for English Wikipedia, announced with the same agent string.
site = Site(
    'en.wikipedia.org',
    clients_useragent=useragent['User-Agent'],
)

# Input tables are read from datapath; generated tables are written to exppath.
datapath, exppath = 'data/', 'results/'

# Functions

In [None]:
###############################################################################
## This module uses mwclient to pull page size and edit stats on wikipedia pages
## for each gene given a list of gene wikipedia titles
###############################################################################
def get_wiki_volume_info(site, titlelist, delay=0.5):
    """Collect size and last-revision metadata for each Wikipedia title.

    Args:
        site: object exposing ``api(action, **params)``; in this file it is
            an mwclient ``Site``.
        titlelist: iterable of page titles; each title is queried separately.
        delay: seconds to sleep after every request (default 0.5, the value
            that used to be hard-coded). Pass 0 to disable the pause.

    Returns:
        A ``(pageinfo, pagefails)`` tuple. ``pageinfo`` is a list of dicts
        with the keys ``title``, ``page_length`` (int), ``last_touched`` and
        ``lastrevid`` (both str). ``pagefails`` lists the titles whose request
        raised or whose response lacked one of the expected fields.
    """
    print('obtaining wikipedia volume information')
    pageinfo = []
    pagefails = []
    for eachpage in titlelist:
        try:
            checkitem = site.api('query', prop='info', titles=eachpage)
            # Build every row for this title before touching pageinfo, so a
            # malformed response cannot leave a half-filled record behind.
            # Each page gets its own dict (no sharing between pages).
            rows = [
                {
                    'title': str(page['title']),
                    'page_length': int(page['length']),
                    'last_touched': str(page['touched']),
                    'lastrevid': str(page['lastrevid']),
                }
                for page in checkitem['query']['pages'].values()
            ]
        except Exception:
            # Best-effort harvest: remember the title and carry on with the
            # rest of the list instead of aborting the whole run.
            pagefails.append(eachpage)
        else:
            pageinfo.extend(rows)
        if delay > 0:
            time.sleep(delay)
    return (pageinfo, pagefails)

In [None]:
###############################################################################
## This module pulls pageview data from the Media Wiki PageViews API
## More on the API here: https://wikimedia.org/api/rest_v1/#/Pageviews%20data/
## The module pulls in a parameter dictionary, and the list of wiki titles
## Parameters include:
## project: en.wikipedia.org, other wikimedia projects
## access: all-access, desktop, mobile-app, mobile-web
## agent: all-agents, user, spider, bot
## granularity: daily, monthly
###############################################################################
def get_monthly_pvs(page_view_parameters, useragent, titlelist, delay=0.5):
    """Fetch per-article pageview counts and return them as a DataFrame.

    Args:
        page_view_parameters: dict holding the URL path pieces ``access``,
            ``agent``, ``granularity``, ``start`` and ``end``. The first three
            are concatenated as-is, so they must already end in "/". An
            optional ``project`` entry selects the wiki and defaults to
            ``en.wikipedia``.
        useragent: dict of HTTP headers handed to ``requests.get``.
        titlelist: iterable of article titles, inserted into the URL unchanged.
        delay: seconds to sleep after every request (default 0.5, the value
            that used to be hard-coded). Pass 0 to disable the pause.

    Returns:
        A ``(pginfodf, pgfails)`` tuple. ``pginfodf`` has one row per returned
        item with the columns ``title``, ``views``, ``granularity``,
        ``timestamp``, ``access`` and ``agent``; a title whose response has no
        usable ``items`` gets a single placeholder row with ``views`` of -1.
        ``pgfails`` lists titles whose request failed or did not return JSON.

    Raises:
        KeyError: if a required key is missing from ``page_view_parameters``.
    """
    pginfo = []
    pgfails = []
    timestart = datetime.datetime.now().time()
    print(timestart,'obtaining wikipedia pageview information')

    # Read the function's own argument (not a notebook-level global) and build
    # the title-independent URL pieces once, outside the loop.
    project = str(page_view_parameters.get('project', 'en.wikipedia')).strip('/')
    url_head = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
                + project + "/"
                + page_view_parameters['access'] + page_view_parameters['agent'])
    url_tail = ("/" + page_view_parameters['granularity']
                + page_view_parameters['start'] + "/" + page_view_parameters['end'])

    for eachtitle in titlelist:
        # NOTE(review): titles go into the path verbatim; titles containing
        # "/" or "?" presumably need percent-encoding -- confirm against the
        # form of the titles the caller supplies before adding it.
        try:
            r = requests.get(url_head + eachtitle + url_tail, headers=useragent)
            items = r.json()
        except (requests.RequestException, ValueError):
            # Network failure or non-JSON body: record the title and move on.
            pgfails.append(eachtitle)
        else:
            try:
                rows = [
                    {'title': item["article"], 'views': int(item["views"]),
                     'granularity': item['granularity'],
                     'timestamp': item["timestamp"], 'access': item['access'],
                     'agent': item['agent']}
                    for item in items["items"]
                ]
            except (KeyError, TypeError, ValueError):
                # JSON arrived but without usable "items": keep the title
                # visible in the output with sentinel values.
                rows = [{'title': eachtitle, 'views': -1, 'granularity': "no data",
                         'timestamp': "00000000", 'access': "not data",
                         'agent': "no data"}]
            pginfo.extend(rows)
        if delay > 0:
            time.sleep(delay)

    pginfodf = pandas.DataFrame(pginfo)

    return (pginfodf, pgfails)

# Pulling gene-specific info by Wikipedia titles

In [None]:
## Load the Gene Wiki Review table and derive a Wikipedia title from each page URL
gene_wiki_info = read_csv(datapath + 'GeneWikiReviewlist.tsv', sep='\t', header=0)
#print(gene_wiki_info.head(n=2))
# Rows without a Gene Wiki page URL are dropped before building the list.
pagelist = gene_wiki_info['Gene Wiki Page'].dropna().tolist()
# Spaces become underscores; the scheme is normalised to http:// first so a
# single prefix removal covers both the http and https forms of the URL.
titlelist = [
    x.replace(" ", "_")
     .replace("https://", "http://")
     .replace("http://en.wikipedia.org/wiki/", "")
    for x in pagelist
]
print(titlelist[0])

In [None]:
## Get Wikipedia info for gene wiki articles
# One pass over the titles only: get_wiki_volume_info takes the mwclient site
# as its first argument and pauses after every request, so a second identical
# call would just double the run time and the API traffic.
pageinfo, pagefails = get_wiki_volume_info(site, titlelist)
wikiinfo = pandas.DataFrame(pageinfo)
# Same table under the name used by the earlier version of this cell.
geneinfo = wikiinfo.copy()
print(wikiinfo.head(n=2))

wikiinfo.to_csv(exppath+'gene_wiki_vol_info.tsv',sep='\t',header=True)

In [None]:
#### Get Page views for each Gene Wiki Review wikipedia entry

#pages = ["Cyclin-dependent kinase 1", "Reelin"] ## for unit test

# 'access', 'agent' and 'granularity' carry their own trailing "/" because
# get_monthly_pvs concatenates these values directly into the request URL.
pv_params = dict(
    project='en.wikipedia',
    access='all-access/',
    agent='user/',
    granularity='monthly/',
    start='20130101',
    end='20211115',
)

gene_monthly_pvs, pgfails = get_monthly_pvs(pv_params, useragent, titlelist)
print(gene_monthly_pvs.head(n=2))

# Long format: one row per title and timestamp.
gene_monthly_pvs.to_csv(exppath + 'gene_wiki_views.tsv', sep='\t', header=True)

# Wide format: one row per title, one column per timestamp.
gene_monthly_views = gene_monthly_pvs[['timestamp', 'title', 'views']].pivot_table(
    values='views',
    index='title',
    columns='timestamp',
)
gene_pvs = gene_monthly_views.reset_index().rename(columns={'title': 'wikipedia'})
#print(gene_pvs)
gene_pvs.to_csv(exppath + 'gw_pvs.tsv', sep='\t', header=True)

# Pull all statements added for series via SPARQL queries

Query Wikidata for all items whose P179 (part of the series) value is Q108807010 (Gene Wiki Review Series). Then identify the statements that cite any of the returned items as a reference.

In [None]:
## Run the sparql query to retrieve all Articles in this series

url = 'https://query.wikidata.org/sparql'
query = """
SELECT ?item ?itemLabel 
WHERE 
{
  ?item wdt:P179 wd:Q108807010. 
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
}
"""
# Send the descriptive User-Agent defined at the top of this notebook, as is
# already done for the pageview requests, rather than the library default.
r = requests.get(url, params = {'format': 'json', 'query': query}, headers=useragent)
# Stop here with the HTTP status on an error response instead of failing
# later with a JSON decode error on a non-JSON error page.
r.raise_for_status()
data = r.json()
#print(data)
print("query completed")

In [None]:
## Run query to retrieve all statements that reference the above articles