## Checksum Function

In [1]:
import getpass
import hashlib
import io
import json
import os
import random

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [15]:
## This function calculates the checksum based on the contents of the file
def get_checksum(file_content, chunk_size=8192):
    """Return the hexadecimal SHA-1 digest of ``file_content``.

    Parameters
    ----------
    file_content : bytes, bytearray, memoryview, str, or file-like object
        The data to hash. Strings are encoded as UTF-8 first. An object
        with a ``read(size)`` method is consumed from its current position
        until it is exhausted.
    chunk_size : int, optional
        Number of bytes (or characters, for a text-mode stream) requested
        per ``read`` call when ``file_content`` is file-like.

    Returns
    -------
    str
        The SHA-1 digest as a lowercase hexadecimal string.
    """
    sha = hashlib.sha1()

    if isinstance(file_content, str):
        file_content = file_content.encode()
    if isinstance(file_content, (bytes, bytearray, memoryview)):
        # In-memory payload (e.g. ``requests`` ``response.content``): hash it directly.
        sha.update(file_content)
        return sha.hexdigest()

    # File-like object: hash the *whole* stream, not only the first chunk,
    # and feed the raw bytes to the hash instead of their decimal repr.
    while True:
        chunk = file_content.read(chunk_size)
        if not chunk:
            break
        if isinstance(chunk, str):
            # Text-mode stream: encode so the hash always sees bytes.
            chunk = chunk.encode()
        sha.update(chunk)
    return sha.hexdigest()

## Obtaining Data

We query the wiki to retrieve the names of all the DataDownload pages along with their Content URLs.

In [17]:
# SPARQL query selecting every subject that has an enigma:hasContentUrl
# property (?w) together with that URL (?a), ordered by subject.
query = """PREFIX wiki: <http://localhost:8080/enigma_dev/index.php/Special:URIResolver/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX enigma: <https://w3id.org/enigma#>
SELECT ?w ?a
WHERE 
{
    ?w enigma:hasContentUrl ?a.
}
ORDER BY ?w"""

# NOTE(review): `url` (the endpoint the query is posted to) and `replace`
# (the substring stripped from every result value) are not defined in any
# cell visible here -- presumably they come from a configuration cell;
# confirm they exist before this cell on a fresh kernel.
response = requests.post(url, data={'query': query})
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
res = response.json()

print("Data Downloads:")
query_results = [
    [
        item['w']['value'].replace(replace, ""),
        item['a']['value'].replace(replace, ""),
    ]
    for item in res['results']['bindings']
]

# Passing `columns=` up front keeps this working when the query returns no
# rows; assigning `df.columns` afterwards raises on an empty result.
df = pd.DataFrame(query_results, columns=['Data Download', 'Content URL'])
df.head(20)

Data Downloads:


Unnamed: 0,Data Download,Content URL
0,1000BRAINS_ENIGMA3_Cortical_GWAS_Results,http://organicdatacuration.org/enigma_dev/imag...
1,ADMRI_HV_Dataset_Download,http://organicdatacuration.org/enigma_new/imag...
2,ADNI1_ENIGMA3_Cortical_GWAS_Results,http://organicdatacuration.org/enigma_dev/imag...
3,ADNI1_Meta-2Danalysis_Dataset_Download,http://organicdatacuration.org/enigma_new/imag...
4,ADNI1_test_APOE.csv,https://drive.google.com/open?id=1wjm3UdMpzDyz...
5,ADNI1_test_Subcort_vol.csv,https://drive.google.com/open?id=1AWsVjRV5ESKW...
6,ADNI1_test_covariates.csv,https://drive.google.com/open?id=1cEHsD8e0hSaG...
7,ADNI2GO_ENIGMA3_Cortical_GWAS_Results,http://organicdatacuration.org/enigma_dev/imag...
8,ADNI2_Meta-2Danalysis_Dataset_Download,http://organicdatacuration.org/enigma_new/imag...
9,ADNI2_test_APOE.csv,https://drive.google.com/open?id=14m6kPBXSyM00...


## Logging into Wiki 

In [None]:
#First we log in to the wiki
# `S` (the session) and `URL` (the API endpoint) are reused by later cells.
S = requests.Session()

URL = "http://organicdatacuration.org/enigma_dev/api.php"

# Retrieve login token
PARAMS_0 = {
    'action':"query",
    'meta':"tokens",
    'type':"login",
    'format':"json"
}

DATA = S.get(url=URL, params=PARAMS_0).json()
LOGIN_TOKEN = DATA['query']['tokens']['logintoken']

# The token itself is deliberately not printed so it cannot end up in the
# notebook's saved output.
print("Login token retrieved.")

# Go to http://organicdatacuration.org/enigma_new/index.php/Special:BotPasswords for lgname & lgpassword.
# NOTE(review): that page is on enigma_new while URL above targets enigma_dev -- confirm which wiki is intended.
# Credentials are taken from the ENIGMA_BOT_NAME / ENIGMA_BOT_PASSWORD
# environment variables, falling back to an interactive prompt, so they are
# never written into the notebook source.

PARAMS_1 = {
    'action':"login",
    'lgname':os.environ.get("ENIGMA_BOT_NAME") or input("Bot username: "),
    'lgpassword':os.environ.get("ENIGMA_BOT_PASSWORD") or getpass.getpass("Bot password: "),
    'lgtoken':LOGIN_TOKEN,
    'format':"json"
}

DATA = S.post(URL, data=PARAMS_1).json()

print(DATA)

# Stop here if the wiki rejected the credentials; later cells would
# otherwise attempt edits with an unauthenticated session.
if DATA.get('login', {}).get('result') != "Success":
    raise RuntimeError("Wiki login failed: " + str(DATA))

In [35]:
##Given a page and checksum value, the function writes the checksum value to the wiki

def update_checksum_wiki(page_to_write,checksum_value):
    """Post a ``Checksum (E)`` property for ``page_to_write`` to the wiki.

    Uses the module-level session ``S`` and endpoint ``URL``. A CSRF token
    is requested first, then an ``action=edit`` request with
    ``section=new`` is sent whose text is the ``{{#set:...}}`` markup
    carrying ``checksum_value``. The decoded JSON reply of the edit request
    is printed; nothing is returned.

    Parameters
    ----------
    page_to_write : str
        Title of the wiki page to edit.
    checksum_value : str
        Checksum to record; it is concatenated into the markup, so it must
        be a string.
    """
    wiki_markup = "{{#set:|Checksum (E)=" + checksum_value + "}}"

    # Every edit needs a CSRF token issued for the current session.
    token_reply = S.get(
        url=URL,
        params={"action": "query", "meta": "tokens", "format": "json"},
    ).json()
    csrf = token_reply['query']['tokens']['csrftoken']

    # POST request to edit a page
    edit_payload = {
        "action": "edit",
        "title": page_to_write,
        "section": "new",
        "format": "json",
        "text": wiki_markup,
        "token": csrf,
    }
    edit_reply = S.post(URL, data=edit_payload).json()

    print(edit_reply)

## Update Checksums in Wiki

In [None]:
##Cell to update all checksum values for the Data Downloads in the DataFrame

# `df` has exactly two columns (page name, content URL), so each row unpacks
# into a pair. The URL is bound to `content_url` rather than `url` so the
# global `url` used by the query cell is not overwritten.
for page_name, content_url in df.itertuples(index=False, name=None):
    #checksum is not calculated for files in google drive
    # (identified here by a page name ending in ".csv")
    if page_name.endswith('.csv'):
        continue

    try:
        response = S.get(content_url)
        response.raise_for_status()
    except requests.RequestException as err:
        # Skip this page rather than record the checksum of an error page
        # or abort the remaining updates.
        print("Skipping", page_name, "-", err)
        continue

    # get_checksum reads from a file-like object, so wrap the downloaded bytes.
    checksum = get_checksum(io.BytesIO(response.content))
    update_checksum_wiki(page_name, checksum)