In [1]:
import os, itertools, httplib2, PyPDF2, shlex, signal, subprocess, requests, urllib.request, shutil
from bs4 import BeautifulSoup
from shutil import copyfile
import pandas as pd
import numpy as np
from numpy import linalg
from time import sleep
from datetime import timedelta

# BrainMap

### Metadata

In [126]:
date = 180803
bm = pd.read_csv("brainmap/experiments_%d.csv" %date, header=0, index_col=None, encoding="cp858")
citations = open("brainmap/citations_%d.txt" %date).readlines()
coordinates = open("brainmap/coordinates_%d.txt" %date).readlines()

In [4]:
# Extract relevant data from experiments
bm = bm[["BRAINMAP_ID", "YEAR", "1st_AUTHOR", "JOURNAL", "NUM_COORDINATES", "EXPERIMENT", "BEHAVIORAL_DOMAIN"]]

In [5]:
# Consolidate experiments within studies, summing the number of coordinates
dic = {id: {"NUM_COORDINATES": 0, "EXPERIMENT": [], "BEHAVIORAL_DOMAIN": []} for id in sorted(list(set(bm["BRAINMAP_ID"])))}
for i, row in bm.iterrows():
    dic[row["BRAINMAP_ID"]]["YEAR"] = row["YEAR"]
    dic[row["BRAINMAP_ID"]]["1st_AUTHOR"] = row["1st_AUTHOR"]
    dic[row["BRAINMAP_ID"]]["JOURNAL"] = row["JOURNAL"]
    dic[row["BRAINMAP_ID"]]["NUM_COORDINATES"] += row["NUM_COORDINATES"]
    dic[row["BRAINMAP_ID"]]["EXPERIMENT"].append(row["EXPERIMENT"])
    dic[row["BRAINMAP_ID"]]["BEHAVIORAL_DOMAIN"].append(row["BEHAVIORAL_DOMAIN"])
bm = pd.DataFrame(dic).transpose()

In [6]:
# Add data from citations
for splitter, study in itertools.groupby(citations, lambda line: line == "\n"):
    dic = {item.split()[0]: ' '.join(item.split()[1:]).strip() for item in list(study) if len(item.split()) > 1}
    if "%2" in dic.keys():
        id = int(dic["%2"].split("= ")[1])
        if "%1" in dic.keys():
            bm.at[id,"PMID"] = dic["%1"].split("= ")[1]
        if "%T" in dic.keys():
            bm.at[id,"TITLE"] = dic["%T"]
        if "%A" in dic.keys():
            bm.at[id,"AUTHORS"] = dic["%A"]
        if "%V" in dic.keys():
            bm.at[id,"VOLUME"] = dic["%V"]
        if "%8" in dic.keys():
            bm.at[id,"MONTH"] = dic["%8"].split()[0]
        if "%P" in dic.keys():
            bm.at[id,"PAGES"] = dic["%P"]
        if "%Z" in dic.keys():
            bm.at[id,"DESCRIPTION"] = dic["%Z"].split("= ")[1]
        if "%U" in dic.keys():
            bm.at[id,"ABSTRACT_URL"] = dic["%U"]

In [7]:
# Compute subject-author key for matching with coordinate data
for i, row in bm.iterrows():
    bm.at[i,"KEY"] = row["1st_AUTHOR"] + ", " + str(row["YEAR"])

In [8]:
# Sort columns
bm.columns = list(bm.columns)
bm = bm[["PMID", "KEY", "1st_AUTHOR", "AUTHORS", "YEAR", "TITLE", "JOURNAL",
        "VOLUME", "MONTH", "PAGES", "BEHAVIORAL_DOMAIN", "EXPERIMENT",
        "DESCRIPTION", "ABSTRACT_URL", "NUM_COORDINATES"]]

In [9]:
# Save metadata to file
bm.to_csv(path_or_buf="brainmap/brainmap_metadata_180803.csv")

### Coordinates

In [8]:
# Reload PMIDs from manually updated file
coordinates = open("brainmap/coordinates_180803.txt").readlines()
bm = pd.read_csv("brainmap/brainmap_metadata_180809.csv", header=0, index_col=None, encoding="cp858")

In [5]:
# Match coordinates to PMIDs and sample sizes to BrainMap IDs
coord, samp = {}, {}
for splitter, study in itertools.groupby(coordinates, lambda line: line == "\n"):
    if not splitter:
        study = list(study)
        key = study[0].replace("// ", "").split(": ")[0]
        if key in list(bm["KEY"]):
            try:
                pmid = int(bm.loc[bm["KEY"] == key, "PMID"])
                bmid = int(bm.loc[bm["KEY"] == key, "BRAINMAP_ID"])
            except:
                print(key)
            if pmid not in coord.keys():
                coord[pmid] = []
            for line in study:
                if not line.startswith("//"):
                    coord[pmid].append(line.replace("\t", ",").strip())

In [103]:
# Add sample sizes to metadata
for bmid, val in samp.items():
    bm.at[bmid,"NUM_SUBJECTS"] = val
bm.to_csv(path_or_buf="brainmap/brainmap_metadata_180809b.csv")

In [104]:
# Save number of coordinates to BrainMap metadata 
# and subset by studies with coordinates
for i, row in bm.iterrows():
    if row["PMID"] in coord.keys():
        bm.at[i,"NUM_COORDINATES"] = len(coord[row["PMID"]])
bm_sub = bm.loc[bm["NUM_COORDINATES"] > 0]
bm_sub.to_csv(path_or_buf="brainmap/brainmap_metadata_180810.csv", index=None)

In [12]:
long = []
for sigma in [5]:
    
    # Create directory for current smoothing sigma
    if not os.path.exists("brainmap/brainmap_preproc_coords/{}mm".format(sigma)):
        os.makedirs("brainmap/brainmap_preproc_coords/{}mm".format(sigma))
    if not os.path.exists("brainmap/brainmap_preproc_coords/{}mm/logs".format(sigma)):
        os.makedirs("brainmap/brainmap_preproc_coords/{}mm/logs".format(sigma))
    
    for pmid in list(bm["PMID"]):
        if pmid in coord.keys():

            # Format preprocessing command
            comm = "pc.run_preproc(path, {}, {}, smoothing_sigma={}, mask_path=mask_path)".format(coord[pmid], int(pmid), sigma)
            
            # Write python script with command for executing preprocessing
            pyfile = open("brainmap/brainmap_preproc_coords/{}mm/preproc_{}.py".format(sigma, int(pmid)), "w+")
            pyfile.write("#!/bin/python\n\nimport preproc_coords as pc\n\npath = '/scratch/PI/aetkin/ebeam/cogneuro/brainmap'\nmask_path = '/scratch/PI/aetkin/ebeam/cogneuro/masks'\n\n{}".format(comm))
            pyfile.close()
            
            # Scale script duration to number of coordinates
            mins = 15*len(set(coord[pmid]))
            qos = "#"
            partition = "normal"
            if mins > 600:
                qos = "#"
                partition = "normal"
                long.append(pmid)

            # Write bash script for slurm submission
            bashfile = open("brainmap/brainmap_preproc_coords/{}mm/preproc_{}.sbatch".format(sigma, int(pmid)), "w+")
            lines = ["#!/bin/bash\n",
                     "#SBATCH --job-name={}_{}".format(sigma, int(pmid)),
                     "#SBATCH --output=logs/{}.%j.out".format(int(pmid)),
                     "#SBATCH --error=logs/{}.%j.err".format(int(pmid)),
                     "#SBATCH --time={}".format(str(timedelta(minutes=mins)).replace("2 days, ", "01-").replace("1 day, ", "01-")),
                     "#SBATCH -p {}".format(partition),
                     "{}#SBATCH --qos=long".format(qos),
                     "#SBATCH --nodes=1",
                     "#SBATCH --mem=350",
                     "#SBATCH -c 1",
                     "#SBATCH --mail-type=FAIL # notifications for job failure only",
                     "#SBATCH --mail-user=ebeam@stanford.edu\n",
                     "module load python/2.7.13 biology fsl/5.0.10",
                     "srun python preproc_{}.py".format(int(pmid))]
            for line in lines:
                bashfile.write(line + "\n")
            bashfile.close()
        
        # Copy over preprocessing and wrap scripts
        copyfile("brainmap/brainmap_preproc_coords/preproc_coords.py", "brainmap/brainmap_preproc_coords/{}mm/preproc_coords.py".format(sigma))
        copyfile("brainmap/brainmap_preproc_coords/wrap_{}mm.sh".format(sigma), "brainmap/brainmap_preproc_coords/{}mm/wrap.sh".format(sigma))

In [6]:
# Function to split list into list of n-sized chunks
def chunkify(l, n):
    chunks = []
    for i in range(0, len(l), n):
        chunks.append(l[i:i + n])
    leftover = len(l)-(len(l)*n)
    chunks.append(l[-leftover:])
    return chunks

In [7]:
chunks = chunkify(list(bm["PMID"]), 150)
for sigma in [5]:
    for i, chunk in enumerate(chunks):
        lines = ["#!/bin/sh", 
                 'IDS="{}"'.format(" ".join([str(id) for id in chunk])),
                "for ID in $IDS; do",
                "if [ ! -f '/scratch/PI/aetkin/ebeam/cogneuro/brainmap/coordinates/5mm/${ID}.txt' ]", 
                "then", 
                "echo `sbatch preproc_${ID}.sbatch`", 
                "sleep 1", 
                "fi", 
                "done"]
        file = open("brainmap/brainmap_preproc_coords/{}mm/wrap_{}.sh".format(sigma, i), "w+")
        for line in lines:
            file.write(line + "\n")

In [173]:
len(bm)

3351

# PMID to DOI

In [19]:
bm = pd.read_csv("brainmap/brainmap_metadata_180803.csv", header=0, index_col=None, encoding="cp858")

In [20]:
def get_url(url, delay=0.0, verbose=False):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=5.0)
    return r.text

In [21]:
for i, row in bm.iterrows():
    try:
        text = get_url("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&retmode=text&rettype=medline".format(row["PMID"]))
        for line in text.split("\n"):
            if line.startswith("AID"):
                try:
                    doi = line.split("- ")[1].replace(" [doi]","").strip()
                    bm.at[i,"DOI"] = doi
                except:
                    print(row["PMID"])
    except:
        print(row["PMID"])

12202083.0
9025115.0
15721961.0
21427167.0


In [24]:
bm.to_csv(path_or_buf="brainmap/brainmap_metadata_180803.csv")

### PDFs

In [29]:
bm = pd.read_csv("brainmap/brainmap_metadata_180804.csv", header=0, index_col=None, encoding="cp858")
pmids = [pmid for pmid in bm["PMID"] if not np.isnan(pmid)]

In [33]:
# Download PloS One articles
http = httplib2.Http(".cache", disable_ssl_certificate_validation = True)
for i, row in bm.iterrows():
    if row["JOURNAL"] == "PloS one" and not np.isnan(row["PMID"]):
        print("Downloading PDF for {}".format(row["id"]))
        pdf_url = "http://journals.plos.org/plosone/article/file?id={}&type=printable".format(row["doi"])
        pdf_file = "pdf/{}.pdf".format(row["PMID"])
        comm = "wget -O {} {}".format(pdf_file, pdf_url)
        args = shlex.split(comm)
        proc = subprocess.call(args)

In [28]:
# Attempt download with ruby script
for pmid in pmids:
    try:
        cogneuro_file = "../texts/pdf/{}.pdf".format(int(pmid))
        pubmed_file = "../../pubmed/vetted/{}.pdf".format(int(pmid))
        if not os.path.isfile(cogneuro_file):
            if os.path.isfile(pubmed_file):
                shutil.move(pubmed_file, cogneuro_file)
            else:
                comm = "ruby /Users/ehbeam/Dropbox/Stanford/Research/Projects/Psychiatlas/scripts/borrowed/Pubmed-Batch-Download-master/pubmedid2pdf.rb {}".format(int(pmid))
                args = shlex.split(comm)
                proc = subprocess.call(args)
    except:
        pass

In [12]:
# Second-pass attempt to download
http = httplib2.Http(".cache", disable_ssl_certificate_validation = True)
import requests
for pmid in pmids:
    if not os.path.isfile("../texts/pdf/{}.pdf".format(int(pmid))):

        # Insert the Medline ID into the PubMed entrez url
        try:
            url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(int(pmid))
            pdf_url = ""

            # Attempt to download pdf
            response = requests.get(url).text
            for line in str(response).split():
                if "http" in line and "pdf" in line:
                    pdf_url = line.split('"')[1]
                    pdf_file = "pdf/{}.pdf".format(int(pmid))
                    comm = "wget -O {} {}".format(pdf_file, pdf_url)
                    args = shlex.split(comm)
                    proc = subprocess.call(args)
            sleep(2)
        except:
            pass

        # Wait for PDF to download, then remove if corrupt
        try:
            if os.path.getsize(pdf_file) == 0:
                os.remove(pdf_file)
                print("Removed empty PDF for {}".format(int(pmid)))
                continue
            else:
                try:
                    PyPDF2.PdfFileReader(open(pdf_file, "rb"))
                except PyPDF2.utils.PdfReadError:
                    os.remove(pdf_file)
                    print("Removed corrupt PDF for {}".format(int(pmid)))
                    continue
        except:
            pass

Removed empty PDF for 8989012
Removed empty PDF for 9051780
Removed empty PDF for 9141092
Removed empty PDF for 9462480
Removed empty PDF for 9665617
Removed empty PDF for 9674604
Removed empty PDF for 9696465
Removed corrupt PDF for 10372081
Removed empty PDF for 10380965
Removed empty PDF for 10568854
Removed empty PDF for 10923655
Removed empty PDF for 11201097




Removed empty PDF for 12151759
Removed empty PDF for 12195096
Removed empty PDF for 12634477
Removed empty PDF for 12692460




Removed empty PDF for 12858037
Removed empty PDF for 12858037
Removed empty PDF for 14561934
Removed empty PDF for 14625454
Removed empty PDF for 14625459
Removed empty PDF for 15073516
Removed empty PDF for 15570157
Removed empty PDF for 15597038
Removed empty PDF for 15973144
Removed empty PDF for 16237317
Removed empty PDF for 16237324
Removed empty PDF for 16513004
Removed empty PDF for 16603917
Removed empty PDF for 16790655
Removed empty PDF for 17471059
Removed empty PDF for 17545731
Removed empty PDF for 18158370




Removed empty PDF for 18797307
Removed empty PDF for 18797310
Removed empty PDF for 19218875
Removed empty PDF for 19339907
Removed empty PDF for 19512976
Removed empty PDF for 19617860
Removed empty PDF for 20300040
Removed empty PDF for 20508544




Removed empty PDF for 22048836
Removed empty PDF for 26962820


In [30]:
# Identify missing PDFs
downloaded = [int(file.replace(".pdf", "")) for file in os.listdir("../texts/pdf") if not file.startswith(".")]
missing = [int(pmid) for pmid in pmids if int(pmid) not in downloaded]
print(len(missing))

294


In [31]:
# Copy over missing pdfs from prior dataset
prior_pdfs = [int(file.replace(".pdf", "")) for file in os.listdir("../../brainmap/texts/pdfs") if not file.startswith(".")]
for pmid in missing:
    if pmid in prior_pdfs:
        copyfile("../../brainmap/texts/pdfs/{}.pdf".format(pmid), "pdf/{}.pdf".format(pmid))

FileNotFoundError: [Errno 2] No such file or directory: '../../brainmap/texts/pdfs'

In [32]:
# Save list of missing PDFs with URL for download
missing_url = ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(pmid) for pmid in missing]
missing_df = pd.DataFrame({"URL": missing_url, "PMID": missing})
missing_df.to_csv(path_or_buf="../texts/download/failed_pdfs_bm.csv", index=None, columns=["URL","PMID"])

# Neurosynth

### Metadata

In [5]:
# Load most recent data release (July 2018)
ns18 = pd.read_csv("neurosynth/neurosynth_180712.csv", header=0, index_col=None)
len(list(set(list(ns18["id"]))))

14371

In [104]:
# Load prior data release, which includes manually updated space information (July 2015)
ns15 = pd.read_csv("neurosynth/neurosynth_180623.csv", header=0, index_col=None)
len(list(set(list(ns15["id"]))))

11233

In [105]:
# Load prior data release, without updated space information (July 2015)
ns15_raw = pd.read_csv("neurosynth/neurosynth_180623_raw.csv", header=0, index_col=None)
len(list(set(list(ns15_raw["id"]))))

11406

In [106]:
# Load ACE data
ace = pd.read_csv("ace/ace_180711.csv", header=0, index_col=None)
len(list(set(list(ace["id"]))))

3826

In [107]:
# Count up rows with unknown space in new data
len(ns18[ns18["space"] == "UNKNOWN"])

53878

In [108]:
# Add missing space information to new data
for i, r in ns18.iterrows():
    if r["space"] == "UNKNOWN":
        space = ns15.loc[(ns15["id"] == r["id"]) & (ns15["x"] == r["x"]) & (ns15["y"] == r["y"]) & (ns15["z"] == r["z"]), "space"].values
        if len(space) > 0:
            ns18.set_value(i, "space", space[0])
        else:
            ace_space = ace.loc[(ace["id"] == r["id"]) & (ace["x"] == r["x"]) & (ace["y"] == r["y"]) & (ace["z"] == r["z"]), "space"].values
            raw_space = ns15_raw.loc[(ns15_raw["id"] == r["id"]) & (ns15_raw["x"] == r["x"]) & (ns15_raw["y"] == r["y"]) & (ns15_raw["z"] == r["z"]), "space"].values
            if len(ace_space) > 0:
                if ace_space[0] != "UNKNOWN":
                    ns18.set_value(i, "space", ace_space[0])
                elif ace_space[0] == "UNKNOWN":
                    ns18.set_value(i, "space", "DISCARD")
            elif len(raw_space) > 0:
                if raw_space[0] == "UNKNOWN":
                    ns18.set_value(i, "space", "DISCARD")

In [109]:
# Recount rows with unknown space
len(ns18[ns18["space"] == "UNKNOWN"])

6533

In [110]:
# Count rows with known unknown space to be discarded
len(ns18[ns18["space"] == "DISCARD"])

3349

In [111]:
# Save new data with filled in space information
ns18.to_csv("neurosynth/neurosynth_180713.csv", index=None)

### PDFs

In [125]:
# Download PloS One articles by DOI
http = httplib2.Http(".cache", disable_ssl_certificate_validation = True)
added = []
for i, row in ns18.iterrows():
    if row["journal"] == "PloS one":
        if row["id"] not in added and "{}.pdf".format(row["id"]) not in os.listdir("pdf"):
            print("Downloading PDF for {}".format(row["id"]))
            pdf_url = "http://journals.plos.org/plosone/article/file?id={}&type=printable".format(row["doi"])
            pdf_file = "pdf/{}.pdf".format(row["id"])
            comm = "wget -O {} {}".format(pdf_file, pdf_url)
            args = shlex.split(comm)
            proc = subprocess.call(args)
            added.append(row["id"])

Downloading PDF for 17183636
Downloading PDF for 17327919
Downloading PDF for 17389911
Downloading PDF for 17579718
Downloading PDF for 17712410
Downloading PDF for 17849020
Downloading PDF for 17971871
Downloading PDF for 18231575
Downloading PDF for 18301756
Downloading PDF for 18335036
Downloading PDF for 18365029
Downloading PDF for 18493621
Downloading PDF for 18509462
Downloading PDF for 18523591
Downloading PDF for 18682729
Downloading PDF for 18698355
Downloading PDF for 18728773
Downloading PDF for 18769538
Downloading PDF for 18797499
Downloading PDF for 18827897
Downloading PDF for 18846222
Downloading PDF for 18958158
Downloading PDF for 18958161
Downloading PDF for 18985147
Downloading PDF for 19018279
Downloading PDF for 19030105
Downloading PDF for 19050764
Downloading PDF for 19517024
Downloading PDF for 19584921
Downloading PDF for 19636426
Downloading PDF for 19672296
Downloading PDF for 19680553
Downloading PDF for 19707568
Downloading PDF for 19750227
Downloading PD

Downloading PDF for 25603126
Downloading PDF for 25625285
Downloading PDF for 25629899
Downloading PDF for 25659130
Downloading PDF for 25671563
Downloading PDF for 25774886
Downloading PDF for 25774979
Downloading PDF for 25790002
Downloading PDF for 25793718
Downloading PDF for 25811453
Downloading PDF for 25859660
Downloading PDF for 25875000
Downloading PDF for 25875594
Downloading PDF for 25885897
Downloading PDF for 25938442
Downloading PDF for 25945925
Downloading PDF for 25996480
Downloading PDF for 26053316
Downloading PDF for 26061877
Downloading PDF for 26079805
Downloading PDF for 26301900
Downloading PDF for 26727514
Downloading PDF for 27010196
Downloading PDF for 27414048
Downloading PDF for 27560361


In [102]:
# Attempt download with ruby script
for pmid in list(set(ns18["id"])):
    if not os.path.isfile("../texts/pdf/{}.pdf".format(pmid)):
        comm = "ruby /Users/ehbeam/Dropbox/Stanford/Research/Projects/Psychiatlas/scripts/borrowed/Pubmed-Batch-Download-master/pubmedid2pdf.rb {}".format(pmid)
        args = shlex.split(comm)
        proc = subprocess.call(args)

In [2]:
# Identify missing PDFs (note data were updated below)
ns = pd.read_csv("neurosynth/neurosynth_180716.csv", index_col=None, header=0)
downloaded = [int(file.replace(".pdf", "")) for file in os.listdir("../texts/pdf") if not file.startswith(".")]
missing = [pmid for pmid in list(set(ns["id"])) if pmid not in downloaded]
print(len(missing))

1


In [3]:
# Save list of missing PDFs with URL for download
missing_url = ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(pmid) for pmid in missing]
missing_df = pd.DataFrame({"URL": missing_url, "PMID": missing})
missing_df.to_csv(path_or_buf="../texts/failed_pdfs_ns.csv", index=None, columns=["URL","PMID"])

### Coordinates

In [12]:
ns = pd.read_csv("neurosynth/neurosynth_180716.csv", header=0, index_col=None)
len(list(set(list(ns["id"]))))

14147

In [13]:
# Drop coordinates in an unknown space
ns = ns[ns["space"] != "UNKNOWN"]
ns = ns[ns["space"] != "DISCARD"]
len(list(set(list(ns["id"]))))

14147

In [14]:
# Reset index to eliminate dropped lines
ns.index = range(len(ns))

In [15]:
# Add column for MNI coordinates
ns["mni_coord"] = ns["x"].map(str) + "," + ns["y"].map(str) + "," + ns["z"].map(str)
ns.loc[ns.space != "MNI", ["mni_coord"]] = ""

In [16]:
# Functions to convert Talairach coordinates to MNI space
# Adapted from https://github.com/neurosynth/neurosynth/blob/master/neurosynth/base/transformations.py

def transform(foci, mat):
    """ Convert coordinates from one space to another using provided
    transformation matrix. """
    t = linalg.pinv(mat)
    foci = np.hstack((foci, np.ones((foci.shape[0], 1))))
    return np.dot(foci, t)[:, 0:3]

def t88_to_mni():
    """ Convert Talairach to MNI coordinates using the Lancaster transform.
    Adapted from BrainMap scripts; see http://brainmap.org/icbm2tal/
    Details are described in Lancaster et al. (2007)
    (http://brainmap.org/new/pubs/LancasterHBM07.pdf). """
    return np.array([[0.9254, 0.0024, -0.0118, -1.0207],
                     [-0.0048, 0.9316, -0.0871, -1.7667],
                     [0.0152, 0.0883, 0.8924, 4.0926],
                     [0.0, 0.0, 0.0, 1.0]]).T

In [17]:
# Convert Talairach coordinates to MNI space by Lancaster transform
tal2mni = transform(ns[['x', 'y', 'z']].values, t88_to_mni())
tal2mni_ser = pd.Series([str(row[0]) + "," + str(row[1]) + "," + str(row[2]) for row in tal2mni])
ns.loc[ns["space"] == "TAL", "mni_coord"] = tal2mni_ser
ns[ns["space"] == "TAL"].head(5)

Unnamed: 0,id,doi,x,y,z,space,peak_id,table_id,table_num,title,authors,year,journal,mni_coord
397839,9185551,,17.0,37.0,-20.0,TAL,548701,28699,3,A role for the right anterior temporal lobe in...,"Small DM, Jones-Gotman M, Zatorre RJ, Petrides...",1997,The Journal of neuroscience : the official jou...,"18.975472613463868,38.79753000971181,-31.15962..."
397840,9185551,,28.0,48.0,-9.0,TAL,548702,28699,3,A role for the right anterior temporal lobe in...,"Small DM, Jones-Gotman M, Zatorre RJ, Petrides...",1997,The Journal of neuroscience : the official jou...,"30.967127226778764,51.681125243160615,-20.3123..."
397841,9185551,,9.0,41.0,-27.0,TAL,548703,28699,3,A role for the right anterior temporal lobe in...,"Small DM, Jones-Gotman M, Zatorre RJ, Petrides...",1997,The Journal of neuroscience : the official jou...,"10.218962457576001,42.29431987922428,-39.20048..."
397842,9185551,,-21.0,36.0,-12.0,TAL,548704,28699,3,A role for the right anterior temporal lobe in...,"Small DM, Jones-Gotman M, Zatorre RJ, Petrides...",1997,The Journal of neuroscience : the official jou...,"-21.963194872504793,38.420005679102495,-21.460..."
397843,9185551,,-26.0,29.0,-18.0,TAL,548705,28699,3,A role for the right anterior temporal lobe in...,"Small DM, Jones-Gotman M, Zatorre RJ, Petrides...",1997,The Journal of neuroscience : the official jou...,"-27.41963359173434,30.33283250836168,-27.29068..."


In [18]:
# Save metadata with transformed coordinates
ns.to_csv(path_or_buf="neurosynth/neurosynth_180805.csv", index=None)

In [11]:
# Write scripts to preprocess coordinates
ns = pd.read_csv("neurosynth/neurosynth_180805.csv", index_col=None, header=0)
long = []
for sigma in [5]:#[0, 5]:
    
    # Create directory for current smoothing sigma
    if not os.path.exists("neurosynth/neurosynth_preproc_coords/{}mm".format(sigma)):
        os.makedirs("neurosynth/neurosynth_preproc_coords/{}mm".format(sigma))
    if not os.path.exists("neurosynth/neurosynth_preproc_coords/{}mm/logs".format(sigma)):
        os.makedirs("neurosynth/neurosynth_preproc_coords/{}mm/logs".format(sigma))
    
    for pmid in list(set(list(ns["id"]))):

        # Format preprocessing command
        coords = list(ns[ns["id"] == pmid]["mni_coord"])
        comm = "pc.run_preproc(path, {}, {}, smoothing_sigma={}, mask_path=mask_path)".format(coords, pmid, sigma)

        # Write python script with command for executing preprocessing
        pyfile = open("neurosynth/neurosynth_preproc_coords/{}mm/preproc_{}.py".format(sigma, pmid), "w+")
        pyfile.write("#!/bin/python\n\nimport preproc_coords as pc\n\npath = '/scratch/PI/aetkin/ebeam/cogneuro/neurosynth'\nmask_path = '/scratch/PI/aetkin/ebeam/cogneuro/masks'\n\n{}".format(comm))
        pyfile.close()
        
        # Scale script duration to number of coordinates
        mins = 10*len(set(coords))
        qos = "#"
        partition = "normal"
        if mins > 600:
            qos = ""
            partition = "normal"
            long.append(pmid)

        # Write bash script for slurm submission
        bashfile = open("neurosynth/neurosynth_preproc_coords/{}mm/preproc_{}.sbatch".format(sigma, pmid), "w+")
        lines = ["#!/bin/bash\n",
                 "#SBATCH --job-name={}_{}_ns".format(pmid, sigma),
                 "#SBATCH --output=logs/{}.%j.out".format(pmid),
                 "#SBATCH --error=logs/{}.%j.err".format(pmid),
                 "#SBATCH --time={}".format(str(timedelta(minutes=mins)).replace("1 day, ", "01-")),
                 "#SBATCH -p {}".format(partition),
                 "{}#SBATCH --qos=long".format(qos),
                 "#SBATCH --nodes=1",
                 "#SBATCH --mem=350",
                 "#SBATCH -c 1",
                 "#SBATCH --mail-type=FAIL # notifications for job failure only",
                 "#SBATCH --mail-user=ebeam@stanford.edu\n",
                 "module load python/2.7.13 biology fsl/5.0.10",
                 "srun python preproc_{}.py".format(pmid)]
        for line in lines:
            bashfile.write(line + "\n")
        bashfile.close()
        
        # Copy over preprocessing and wrap scripts
        copyfile("neurosynth/neurosynth_preproc_coords/preproc_coords.py", "neurosynth/neurosynth_preproc_coords/{}mm/preproc_coords.py".format(sigma))
        copyfile("neurosynth/neurosynth_preproc_coords/wrap.sh", "neurosynth/neurosynth_preproc_coords/{}mm/wrap.sh".format(sigma))

In [20]:
chunks = chunkify(list(set(ns["id"])), 150)
for i, chunk in enumerate(chunks):
    lines = ["#!/bin/sh", 
             'IDS="{}"'.format(" ".join([str(id) for id in chunk])),
            "for ID in $IDS; do",
            "if [ ! -f '/scratch/PI/aetkin/ebeam/cogneuro/neurosynth/coordinates/5mm/${ID}.txt' ]", 
            "then", 
            "echo `sbatch preproc_${ID}.sbatch`", 
            "sleep 1", 
            "fi", 
            "done"]
    file = open("neurosynth/neurosynth_preproc_coords/5mm/wrap_{}.sh".format(i), "w+")
    for line in lines:
        file.write(line + "\n")

# ACE

### Metadata

In [2]:
# Metadata with coordinates in unknown space removed
ac_old = pd.read_csv("ace/ace_180711.csv", header=0, index_col=None)
len(list(set(list(ac_old["id"]))))

3826

In [3]:
# Metadata with coordinates in unknown space removed
ac_new = pd.read_csv("ace/ace_180804.csv", header=0, index_col=None)
len(list(set(list(ac_new["id"]))))

4132

In [4]:
ac_new = ac_new[~(ac_new["id"].isin(list(ac_old["id"])))]

In [5]:
ac_new = ac_new[~(ac_new["space"] == "UNKNOWN")]

In [6]:
len(list(set(list(ac_new["id"]))))

237

In [7]:
ac = ac_old.append(ac_new)
ac.to_csv("ace_180805.csv", index=None)

In [8]:
len(list(set(list(ac["id"]))))

4063

### PDFs and HTML

In [9]:
# Attempt download with ruby script
for pmid in list(set(ac_new["id"])):
    try:
        cogneuro_file = "../texts/pdf/{}.pdf".format(int(pmid))
        pubmed_file = "../../pubmed/vetted/{}.pdf".format(int(pmid))
        if not os.path.isfile(cogneuro_file):
            if os.path.isfile(pubmed_file):
                shutil.move(pubmed_file, "pdf/{}.pdf".format(pmid))
            else:
                comm = "ruby /Users/ehbeam/Dropbox/Stanford/Research/Projects/Psychiatlas/scripts/borrowed/Pubmed-Batch-Download-master/pubmedid2pdf.rb {}".format(int(pmid))
                args = shlex.split(comm)
                proc = subprocess.call(args)
    except:
        pass

In [11]:
# Download PDFs of PloS One articles
http = httplib2.Http()
downloaded = [int(file.replace(".pdf","")) for file in os.listdir("../texts/pdf") if not file.startswith(".")]
for i, row in ac_new.iterrows():
    if row["journal"] == "PloS one" and row["id"] not in downloaded:
        pdf_url = "http://journals.plos.org/plosone/article/file?id={}&type=printable".format(row["doi"])
        pdf_file = "pdf/{}.pdf".format(row["id"])
        comm = "wget -O {} {}".format(pdf_file, pdf_url)
        args = shlex.split(comm)
        proc = subprocess.Popen(args)
        sleep(0.5)

In [43]:
# Save PubMed links that redirect to publisher site for PDF download 
downloaded = [int(file.replace(".pdf", "")) for file in os.listdir("../texts/pdf") if not file.startswith(".")]
missing = [pmid for pmid in list(set(ac["id"])) if pmid not in downloaded]
missing_url = ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(pmid) for pmid in missing]
missing_df = pd.DataFrame({"URL": missing_url, "PMID": missing})
missing_df.to_csv(path_or_buf="../texts/download/failed_pdfs_ace.csv", index=None, columns=["URL","PMID"])

In [None]:
# Get list of PMIDs for HTML download
journal = "Journal of neuroimaging"
min_pmid = 0
query = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=({}[Journal]+journal+article[pt]+fmri)&retmax=15000".format(journal.replace(" ", "+"))
req = requests.get(query, timeout=5.0).text
soup = BeautifulSoup(req, "lxml")
ids = [t.string for t in soup.find_all('id')]
downloaded = [file.replace(".html", "") for file in os.listdir("ace/ACE/articles/html/{}".format(journal))]
to_download = [id for id in ids if id not in downloaded and int(id) > min_pmid]
print("Number to download: {}".format(len(to_download)))
i = 1
for pmid in to_download:
    print("{} {} http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(i, pmid, pmid))
    i += 1

### Coordinates

In [61]:
# Add column for MNI coordinates
ac = pd.read_csv("ace/ace_180805.csv", header=0, index_col=None)
ac["mni_coord"] = ac["x"].map(str) + "," + ac["y"].map(str) + "," + ac["z"].map(str)
ac.loc[ac.space != "MNI", ["mni_coord"]] = ""

In [62]:
# Functions to convert Talairach coordinates to MNI space
# Adapted from https://github.com/neurosynth/neurosynth/blob/master/neurosynth/base/transformations.py

def transform(foci, mat):
    """ Convert coordinates from one space to another using provided
    transformation matrix. """
    t = linalg.pinv(mat)
    foci = np.hstack((foci, np.ones((foci.shape[0], 1))))
    return np.dot(foci, t)[:, 0:3]

def t88_to_mni():
    """ Convert Talairach to MNI coordinates using the Lancaster transform.
    Adapted from BrainMap scripts; see http://brainmap.org/icbm2tal/
    Details are described in Lancaster et al. (2007)
    (http://brainmap.org/new/pubs/LancasterHBM07.pdf). """
    return np.array([[0.9254, 0.0024, -0.0118, -1.0207],
                     [-0.0048, 0.9316, -0.0871, -1.7667],
                     [0.0152, 0.0883, 0.8924, 4.0926],
                     [0.0, 0.0, 0.0, 1.0]]).T

In [63]:
# Convert Talairach coordinates to MNI space by Lancaster transform
tal2mni = transform(ac[['x', 'y', 'z']].values, t88_to_mni())
tal2mni_str = []
for row in tal2mni:
    tal2mni_str.append(str(row[0]) + "," + str(row[1]) + "," + str(row[2]))
tal2mni_ser = pd.Series(tal2mni_str)
ac.loc[ac["space"] == "TAL", "mni_coord"] = tal2mni_ser
ac[ac["space"] == "TAL"].head(5)

Unnamed: 0,id,doi,x,y,z,space,peak_id,table_id,table_num,title,authors,year,journal,mni_coord
79974,10585521,10.1016/S0010-0277(99)00060-8,15.0,16.0,19.0,TAL,20094,1337,2,Temporal cortex activation during speech recog...,"Sato H, Takeuchi T, Sakai KL",1999,Cognition,"17.442356860158288,20.505386127425055,14.37881..."
79975,10585521,10.1016/S0010-0277(99)00060-8,15.0,16.0,19.0,TAL,20095,1337,2,Temporal cortex activation during speech recog...,"Sato H, Takeuchi T, Sakai KL",1999,Cognition,"17.442356860158288,20.505386127425055,14.37881..."
79976,10585521,10.1016/S0010-0277(99)00060-8,15.0,19.0,20.0,TAL,20096,1337,2,Temporal cortex activation during speech recog...,"Sato H, Takeuchi T, Sakai KL",1999,Cognition,"17.443944140124717,23.799948025882983,15.17337..."
79977,10666562,10.1016/S0926-6410(99)00029-4,-44.0,5.0,37.0,TAL,20256,1348,1,Prefrontal cortex activation in task switching...,"Dove A, Pollmann S, Schubert T, Wiggins CJ, vo...",2000,Brain research. Cognitive brain research,"-46.00411768534114,10.45071901107778,36.624679..."
79978,10666562,10.1016/S0926-6410(99)00029-4,40.0,8.0,36.0,TAL,20257,1348,1,Prefrontal cortex activation in task switching...,"Dove A, Pollmann S, Schubert T, Wiggins CJ, vo...",2000,Brain research. Cognitive brain research,"44.720313185949635,13.85767378403707,33.621715..."


In [64]:
# Drop coordinates in an unknown space
ac = ac[ac["space"] != "UNKNOWN"]
ac.shape

(117264, 14)

In [65]:
# Save ACE metadata with transformed coordinates
ac.to_csv(path_or_buf="ace/ace_180805.csv", index=None)

In [66]:
# Reload data from file
ac = pd.read_csv("ace/ace_180805.csv", header=0, index_col=None)

In [67]:
len(list(set(list(ac["id"]))))

3868

In [21]:
# Write scripts to preprocess coordinates
long = []
for sigma in [0, 5]:
    
    # Create directory for current smoothing sigma
    if not os.path.exists("ace/ace_preproc_coords/{}mm".format(sigma)):
        os.makedirs("ace/ace_preproc_coords/{}mm".format(sigma))
    if not os.path.exists("ace/ace_preproc_coords/{}mm/logs".format(sigma)):
        os.makedirs("ace/ace_preproc_coords/{}mm/logs".format(sigma))
    
    for pmid in list(set(list(ac["id"]))):

        # Format preprocessing command
        coords = list(ac[ac["id"] == pmid]["mni_coord"])
        comm = "pc.run_preproc(path, {}, {}, smoothing_sigma={}, mask_path=mask_path)".format(coords, pmid, sigma)
        
        # Write python script with command for executing preprocessing
        pyfile = open("ace/ace_preproc_coords/{}mm/preproc_{}.py".format(sigma, pmid), "w+")
        pyfile.write("#!/bin/python\n\nimport preproc_coords as pc\n\npath = '/scratch/PI/aetkin/ebeam/cogneuro/ace'\nmask_path = '/scratch/PI/aetkin/ebeam/cogneuro/masks'\n\n{}".format(comm))
        pyfile.close()
        
        # Scale script duration to number of coordinates
        mins = 5*len(set(coords))
        qos = "#"
        partition = "normal"
        if mins > 600:
            qos = ""
            partition = "normal"
            long.append(pmid)
        
        # Write bash script for slurm submission
        bashfile = open("ace/ace_preproc_coords/{}mm/preproc_{}.sbatch".format(sigma, pmid), "w+")
        lines = ["#!/bin/bash\n",
                 "#SBATCH --job-name={}".format(pmid),
                 "#SBATCH --output=logs/{}.%j.out".format(pmid),
                 "#SBATCH --error=logs/{}.%j.err".format(pmid),
                 "#SBATCH --time={}".format(str(timedelta(minutes=mins))),
                 "#SBATCH -p {}".format(partition),
                 "{}#SBATCH --qos=long".format(qos),
                 "#SBATCH --nodes=1",
                 "#SBATCH --mem=350",
                 "#SBATCH -c 1",
                 "#SBATCH --mail-type=FAIL # notifications for job failure only",
                 "#SBATCH --mail-user=ebeam@stanford.edu\n",
                 "module load python/2.7.13 biology fsl/5.0.10",
                 "srun python preproc_{}.py".format(pmid)]
        for line in lines:
            bashfile.write(line + "\n")
        bashfile.close()
        
    # Copy over preprocessing and wrap scripts
    copyfile("ace/ace_preproc_coords/preproc_coords.py", "ace/ace_preproc_coords/{}mm/preproc_coords.py".format(sigma))
    copyfile("ace/ace_preproc_coords/wrap.sh", "ace/ace_preproc_coords/{}mm/wrap.sh".format(sigma))

In [22]:
len(long) / len(list(set(list(ac["id"]))))

0.04446742502585315

In [23]:
# Function to split list into list of n-sized chunks
def chunkify(l, n):
    chunks = []
    for i in range(0, len(l), n):
        chunks.append(l[i:i + n])
    leftover = len(l)-(len(l)*n)
    chunks.append(l[-leftover:])
    return chunks

In [25]:
chunks = chunkify(list(set(list(ac["id"]))), 100)
for i, chunk in enumerate(chunks):
    lines = ["#!/bin/sh", 
             'IDS="{}"'.format(" ".join([str(id) for id in chunk])),
            "for ID in $IDS; do",
            "if [ ! -f '/scratch/PI/aetkin/ebeam/cogneuro/ace/coordinates/0mm/${ID}.txt' ]", 
            "then", 
            "echo `sbatch preproc_${ID}.sbatch`", 
            "sleep 1", 
            "fi",
            "done"]
    file = open("ace/ace_preproc_coords/0mm/wrap_{}.sh".format(i), "w+")
    for line in lines:
        file.write(line + "\n")

# Combined data

In [42]:
# Attempt scraping of missing PDFs
df_filt = pd.read_csv("metadata_filt_180805.csv", 
                      index_col=None, header=0, encoding="cp858")
for pmid in df_filt["PMID"]:
    cogneuro_file = "../texts/pdf/{}.pdf".format(int(pmid))
    pubmed_file = "../../pubmed/vetted/{}.pdf".format(int(pmid))
    try:
        if not os.path.isfile(cogneuro_file):
            if os.path.isfile(pubmed_file):
                shutil.move(pubmed_file, "pdf/{}.pdf".format(pmid))
            else:
                comm = "ruby /Users/ehbeam/Dropbox/Stanford/Research/Projects/Psychiatlas/scripts/borrowed/Pubmed-Batch-Download-master/pubmedid2pdf.rb {}".format(int(pmid))
                args = shlex.split(comm)
                proc = subprocess.call(args)
    except:
        pass

  interactivity=interactivity, compiler=compiler, result=result)


In [45]:
# Save PubMed links that redirect to publisher site for PDF download 
downloaded = [int(file.replace(".pdf", "")) for file in os.listdir("../texts/pdf") if not file.startswith(".")]
missing = [pmid for pmid in df_filt["PMID"] if pmid not in downloaded]
missing_url = ["https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={}&cmd=prlinks&retmode=ref".format(pmid) for pmid in missing]
missing_df = pd.DataFrame({"URL": missing_url, "PMID": missing})
missing_df.to_csv(path_or_buf="../texts/download/failed_pdfs_combo.csv", index=None, columns=["URL","PMID"])

### Author manuscript XML

In [105]:
path = "/Volumes/Samsung_T5/open_access"
oa = pd.read_csv("{}/manuscript/oa_manuscript.csv".format(path), index_col="PMID", header=0)
oa.head()

Unnamed: 0_level_0,File,PMCID,MID
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17017123,PMC0016XXXXX/PMC1626270.xml,PMC1626270,NIHMS12889
12837037,PMC0019XXXXX/PMC1913286.xml,PMC1913286,NIHMS16708
19023455,PMC0021XXXXX/PMC2136438.xml,PMC2136438,NIHMS10936
18769527,PMC0021XXXXX/PMC2185066.xml,PMC2185066,NIHMS31656
17661176,PMC0022XXXXX/PMC2268633.xml,PMC2268633,NIHMS37031


In [106]:
pmids = [int(pmid) for pmid in df["PMID"] if int(pmid) in oa.index]
len(pmids)

2180

In [107]:
def extract_manuscript(raw_file, xml_file, pmid):
    soup = BeautifulSoup(open(xml_file, "r").read()).article
    text = " ".join(soup.findAll(text=True))
    with open(raw_file, "w+") as raw:
        raw.write(text)

In [108]:
added = []
for pmid in pmids:
    pmc = oa.loc[pmid, "PMCID"]
    pmc_pre = pmc.replace("PMC", "PMC00")
    raw_file = "{}/raw/cogneuro_180805/{}.txt".format(path, pmid)
    if not os.path.isfile(raw_file):
        if pmc[3] in ["1", "2", "3", "4", "5"]:
            pmc_pre = pmc.replace("PMC", "PMC00")[:7]
            xml_file = "{}/manuscript/{}XXXXXX.xml/{}/{}.xml".format(path, pmc_pre[:-1], pmc_pre + "XXXXX", pmc)
            extract_manuscript(raw_file, xml_file, pmid)
            added.append(pmid)
        elif pmc[3] == "6":
            xml_file = "{}/manuscript/PMC0060XXXXX/{}.xml".format(path, pmc)
            extract_manuscript(raw_file, xml_file, pmid)
            added.append(pmid)
len(added)

0

### Open access XML

In [98]:
oa = pd.read_csv("../../pubmed/open_access/oa_file_list.csv", index_col="PMID", header=0)
oa.head()



Unnamed: 0_level_0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),License
PMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11250746.0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,NO-CC CODE
11250747.0,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,NO-CC CODE
11250748.0,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,NO-CC CODE
11056684.0,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,NO-CC CODE
11400682.0,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,NO-CC CODE


In [99]:
pmids = [pmid for pmid in df["PMID"] if float(pmid) in oa.index]
len(pmids)

2905

In [100]:
import tarfile
path = "/Volumes/Samsung_T5/open_access"
for pmid in pmids:
    tar_file = "{}/oa_package/{}.tar.gz".format(path, pmid)
    pmid_dir = "{}/oa_package/{}".format(path, pmid)
    out_file = "{}/xml/cogneuro_180805/{}.xml".format(path, pmid)
    if not os.path.isfile(tar_file):
        tar_url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/{}".format(oa.loc[float(pmid),"File"])
        comm = "wget -O {} {}".format(tar_file, tar_url)
        args = shlex.split(comm)
        proc = subprocess.call(args)
        tar = tarfile.open(tar_file)
        tar.extractall(path=pmid_dir)
        tar.close()
    if not os.path.isfile(out_file):
        pmcid = oa.loc[float(pmid), "Accession ID"]
        xml_file = [file for file in os.listdir("{}/{}".format(pmid_dir, pmcid)) if file.endswith("xml")][0]
        shutil.copy("{}/{}/{}".format(pmid_dir, pmcid, xml_file), out_file)

In [102]:
from bs4 import BeautifulSoup
path = "/Volumes/Samsung_T5/open_access"
for pmid in pmids:
    xml_file = "{}/xml/cogneuro_180805/{}.xml".format(path, pmid)
    raw_file = "{}/raw/cogneuro_180805/{}.txt".format(path, pmid)
    if os.path.isfile(xml_file):
        soup = BeautifulSoup(open(xml_file, "r").read()).article
        text = " ".join(soup.findAll(text=True))
        with open(raw_file, "w+") as raw:
            raw.write(text)

### Identify duplicates

In [42]:
# Identify duplicates
from collections import Counter
duplicates = [k for k, v in Counter(df["PMID"]).items() if v > 1]
duplicates

[12135962, 17803835, 18209203, 18822455, 21958514]

In [44]:
df[df["PMID"].isin(duplicates)]

Unnamed: 0,PMID,DOI,KEY,SOURCE,AUTHORS,YEAR,MONTH,JOURNAL,TITLE,PAGES,...,ABSTRACT_URL,NUM_COORDINATES,MNI_COORDS,NUM_SUBJECTS,BRAINMAP_ID,BEHAVIORAL_DOMAIN,EXPERIMENT,DESCRIPTION,COORDS_AVAILABLE,TEXTS_AVAILABLE
753,12135962,,"Berthoz S, 2002",BrainMap,Berthoz S|Armony J L|Blair R J R|Dolan R J,2002,Jan,Brain,An fMRI study of intentional and unintentional...,1696-1708,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,64.0,,,18040043.0,"['Emotion.Negative.Anxiety', 'Emotion.Negative...",['Violation of social norms > Normal behaviour...,,1,1
754,12135962,,"Berthoz S, 2002",BrainMap,Berthoz S|Armony J L|Blair R J R|Dolan R J,2002,Jan,Brain,An fMRI study of intentional and unintentional...,1696-1708,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,64.0,,,17110231.0,"['Emotion.Negative.Anxiety', 'Emotion.Negative...",['Violation of social norms > Normal behaviour...,,1,1
1996,17803835,,"Lindauer R J, 2008",BrainMap,Lindauer R J|Booij J|Habraken J B|van Meijel E...,2008,Apr,Psychological Medicine,Effects of psychotherapy on regional cerebral ...,543-554,...,,7.0,,,16120224.0,"['Emotion.Other,Cognition.Memory.Explicit', 'E...","['PTSD > Traumatized Controls, Trauma vs. Base...",,1,1
1997,17803835,10.1017/S0033291707001432,"Liu L, 2012",BrainMap,Liu L|Wang W|You W|Li Y|Awati N|Zhao X|Booth J...,2012,Jun,Neuropsychologia,Similar alterations in brain function for phon...,2224-2232,...,,6.0,,,17010014.0,"['Cognition.Attention,Cognition.Language.Phono...","['Control > Reading Disabled, All tasks', 'Sem...",,1,1
2103,18209203,10.1212/01.wnl.0000287115.85956.87,"Raboyeau G, 2008",BrainMap,Raboyeau G|De Boissezon X|Marie N|Balduyck S|P...,2008,Jan,Neurology,Right hemisphere activation in recovery from A...,290-298,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,25.0,"-8,-90,6;8,6,28;6,-20,-14;-42,2,22;-38,-12,58;...",,11010022.0,"['Action.Execution.Speech,Cognition.Language.S...","['Naming - Rest, After - Before Computer Assis...",,1,1
2104,18209203,10.1212/01.wnl.0000287115.85956.87,"Bookheimer S Y, 1995",BrainMap,Bookheimer S Y|Zeffiro T A|Blaxton T A|Gaillar...,1995,Sep,Human Brain Mapping,Regional cerebral blood flow during object nam...,93-106,...,,133.0,"-8,-90,6;8,6,28;6,-20,-14;-42,2,22;-38,-12,58;...",,30241.0,"['Cognition.Language.Orthography', 'Cognition....","['Read Words Silently - Words Control', 'Read ...",Subjects underwent 6 conditions in which they ...,1,1
2279,18822455,10.1016/j.bandl.2008.07.003,"De Nil L F, 2017",BrainMap,De Nil L F|Beal D S|Lafaille S J|Kroll R M|Cra...,2017,Jan,Brain and Language,The effects of simulated stuttering and prolon...,114-123,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,70.0,"-58,-20,4;-54,-16,2;-46,-2,-10;-6,-34,-8;0,16,...",,17040087.0,"['Perception.Audition', 'Perception.Audition',...","['Listen - Baseline, Controls', 'Listen - Base...",PURPOSE: Functional magnetic resonance imaging...,1,1
2280,18822455,10.1016/j.bandl.2008.07.003,"De Nil L F, 2008",BrainMap,De Nil L F|Beal D S|Lafaille S J|Kroll R M|Cra...,2008,Nov,Brain and Language,The effects of simulated stuttering and prolon...,114-123,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,70.0,"-58,-20,4;-54,-16,2;-46,-2,-10;-6,-34,-8;0,16,...",,13010002.0,"['Perception.Audition,Cognition.Language.Speec...","['Listen minus Baseline, Healthy Controls', 'L...",,1,1
2823,21958514,10.1016/j.pscychresns.2011.05.001,"Barros-Loscertales A, 2011",BrainMap,Barros-Loscertales A|Bustamante JC|Ventura-Cam...,2011,Nov,Psychiatry Research,Lower activation in the right frontoparietal n...,111-118,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,50.0,,,17110230.0,"['Cognition.Attention', 'Cognition.Attention',...","['Counting stroop task > Control, Healthy Cont...",,1,1
2824,21958514,10.1016/j.pscychresns.2011.05.001,"Barros-Loscertales A, 2011",BrainMap,Barros-Loscertales A|Bustamante JC|Ventura-Cam...,2011,Nov,Psychiatry Research: Neuroimaging,Lower activation in the right frontoparietal n...,111-118,...,http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?...,22.0,,,14050043.0,"[nan, nan]","['Stroop > Control, Healthy Controls', 'Stroop...",,1,1


In [35]:
df.to_csv("metadata_filt_180809.csv", index=None)

### Copy over preprocessed texts for studies with coordinates

In [174]:
df = pd.read_csv("metadata_filt_180805.csv", index_col=None, header=0)

In [175]:
for pmid in df["PMID"]:
    preproc_file = "../../synonyms/corpus/{}.txt".format(pmid)
    if not os.path.isfile(preproc_file):
        print(pmid)
    else:
        shutil.copy(preproc_file, "../texts/rdoc_180716/preproc/{}.txt".format(pmid))

8896772
19093617


### DTM of coordinates

In [36]:
# Reload dataframe
df = pd.read_csv("metadata_filt_180721.csv", header=0, index_col=None, encoding="cp858")

In [54]:
# Load region labels
inlab = open("../labels/harvard-oxford_148struct.csv", "r").readlines()[1:]
labels_bilateral = sorted(set([line.split(",")[2] for line in inlab]))
labels_unilateral = sorted(set([line.split(",")[2].replace("left_", "").replace("right_", "") for line in inlab]))

In [55]:
# Functions to reformat atlasquery output labels
def gen_label_bilateral(label):
    parts_to_replace = ["_iiv", "_v", "_vi", "_vermis_vi", "_crus_i", "_vermis_crus_i", "_crus_ii", "_vermis_crus_ii", "_viib", "_vermis_viib", "_viiia", "_vermis_viiia", "_viiib", "_vermis_viiib", "_ix", "_vermis_ix", "_x", "_vermis_x"]
    for part in parts_to_replace:
        if label.endswith(part):
            label = label.replace(part, "_cerebellum")
    parts_to_remove = ["juxtapositional_lobule_cortex_(formerly_", ")", "_(includes_h1_and_h2"]
    for part in parts_to_remove:
        label = label.replace(part, "")
    return label.strip()

def gen_label_unilateral(label):
    parts_to_replace = ["_iiv", "_v", "_vi", "_vermis_vi", "_crus_i", "_vermis_crus_i", "_crus_ii", "_vermis_crus_ii", "_viib", "_vermis_viib", "_viiia", "_vermis_viiia", "_viiib", "_vermis_viiib", "_ix", "_vermis_ix", "_x", "_vermis_x"]
    for part in parts_to_replace:
        if label.endswith(part):
            label = label.replace(part, "_cerebellum")
    parts_to_remove = ["left_", "right_", "left", "right", "juxtapositional_lobule_cortex_(formerly_", ")", "_(includes_h1_and_h2"]
    for part in parts_to_remove:
        label = label.replace(part, "")
    return label.strip()

In [56]:
# Functions to compute winner-takes-all (WTA) features for document-coordinate matrix (DCM)
def wta_dcm(df, featurizer, suffix="", sigma=0):
    dcm = {}
    for pmid in sorted(list(df["PMID"])):
        dcm[pmid] = {"PMID": pmid}
        lines = open("../coordinates/{}mm/{}.txt".format(sigma, pmid), "r").readlines()
        hits = []
        for line in lines:
            if len(line.split()) > 0:
                label = gen_label_unilateral(line.split()[0])
                if label in labels_unilateral:
                    hits.append(label)
        for label in labels_unilateral:
            dcm[pmid][label] = featurizer(hits, label)
    output = "../coordinates/dcm/unilateral/dcm_{}mm{}.csv".format(sigma, suffix)
    with open(output, "w+"):
        out = pd.DataFrame(dcm)
        out_trans = pd.DataFrame.transpose(out)
        out_trans.to_csv(output, index = False, quoting = 1, columns = ["PMID"] + labels_unilateral)

In [8]:
# Compute winner-takes all DCMs
wta_dcm(df, lambda hits, label: hits.count(label), suffix="_wta_count", sigma=0)
wta_dcm(df, lambda hits, label: int(hits.count(label) > 0), suffix="_wta_binary", sigma=0)

In [57]:
# Function to compute binarized DCM above a probability threshold
# Probability is given as a percentage between 0 and 100
def prob_dcm(df, prob, sigma=0, atlas="unilateral", labs=[]): 
    dcm = {}
    for pmid in sorted(list(df["PMID"])):
        dcm[pmid] = {}
        dcm[pmid]["PMID"] = pmid
        lines = open("../coordinates/{}mm/{}.txt".format(sigma, pmid), "r").readlines()
        hits = []
        for line in lines:
            for struct in line.split(","):
                if len(struct.split()) == 2:
                    label, p = struct.split()
                    if float(p) > prob:
                        if atlas == "unilateral":
                            hits += [gen_label_unilateral(label)]
                        elif atlas == "bilateral":
                            hits += [gen_label_bilateral(label)]
        for label in labs:
            count = hits.count(label)
            if count > 0:
                dcm[pmid][label] = 1
            else:
                dcm[pmid][label] = 0
    outfile = "../coordinates/dcm/{}/dcm_{}mm_thres_{}.csv".format(atlas, sigma, prob)
    with open(outfile, "w+"):
        out = pd.DataFrame(dcm).transpose()
        out.to_csv(outfile, index=False, quoting=1, columns=['PMID'] + labs)

In [59]:
# Compute DTM without smoothing for unilateral atlas
prob_dcm(df, 0, sigma=0, atlas="unilateral", labs=labels_unilateral)

### Filtering by coordinates

In [60]:
# Get list of studies that failed to map onto any structures at the probability=0 level
dcm = pd.read_csv("../coordinates/dcm/unilateral/dcm_0mm_thres_0.csv", index_col=0, header=0)
all0 = dcm[(dcm.T == 0).all()].index
all0.shape

(0,)

In [61]:
# Drop structures with no mapped coordinates from data frame
df_filt = df[~df["PMID"].isin(all0)]
df_filt.shape

(17186, 19)

In [62]:
# Sort the columns
columns = ["PMID", "DOI", "KEY", "SOURCE", "AUTHORS", "YEAR", "MONTH", "JOURNAL", "TITLE", "PAGES", "VOLUME", "ABSTRACT_URL", "NUM_COORDINATES", "MNI_COORDS", "NUM_SUBJECTS", "BRAINMAP_ID", "BEHAVIORAL_DOMAIN", "EXPERIMENT", "DESCRIPTION"]

In [63]:
# Save the filtered data frame
df_filt.to_csv(path_or_buf="metadata_filt_180721.csv", index=None, columns=columns)

In [70]:
# Recompute DCMs with filtered data
df_filt = pd.read_csv("metadata_filt_180721.csv", index_col=None)
for prob in range(0,55,5):
    for sigma in [0, 5]:
        if not os.path.isfile("../coordinates/dcm/unilateral/dcm_{}mm_thres_{}.csv".format(sigma, prob)):
            prob_dcm(df_filt, prob, sigma=sigma, atlas="unilateral", labs=labels_unilateral)
        if not os.path.isfile("../coordinates/dcm/bilateral/dcm_{}mm_thres_{}.csv".format(sigma, prob)):
            prob_dcm(df_filt, prob, sigma=sigma, atlas="bilateral", labs=labels_bilateral)

In [134]:
# List extra 5-mm coordinates
for file in os.listdir("../coordinates/5mm"):
    if file not in os.listdir("../coordinates/0mm"):
        print(file)

24867712.txt
999999992.txt


In [11]:
# Fix left/right in smoothed coordinates
dif_length, not_in_meta = [], []
df_filt = pd.read_csv("metadata_filt_180721.csv", index_col=None)
for pmid in [file.replace(".txt", "") for file in os.listdir("../coordinates/5mm_raw") if not file.startswith(".")]:
    mappings = [line.strip() for line in open("../coordinates/5mm_raw/{}.txt".format(pmid), "r").readlines()]
    if int(pmid) in df_filt["PMID"].values:
        coords = list(df_filt[df_filt["PMID"] == int(pmid)]["MNI_COORDS"])[0].split(";")
        if len(mappings) != len(coords):
            coords = sorted(set(coords), key=coords.index)
            if len(mappings) != len(coords):
                dif_length.append(pmid)
            else:
                for i, coord in enumerate(coords):
                    if coord[0] == "-":
                        mappings[i] = mappings[i].replace("right", "left")
                with open("../coordinates/5mm/{}.txt".format(pmid), "w+") as outfile:
                    for line in mappings:
                          outfile.write(line + "\n")
        else:
            for i, coord in enumerate(coords):
                if coord[0] == "-":
                    mappings[i] = mappings[i].replace("right", "left")
            with open("../coordinates/5mm/{}.txt".format(pmid), "w+") as outfile:
                for line in mappings:
                      outfile.write(line + "\n")
    else:
        not_in_meta.append(pmid)

In [26]:
ns_ids = list(ns["id"])
bm_ids = list(bm['PMID'])
ac_ids = list(ac["id"])
source = {}
for pmid in dif_length:
#     mappings = [line.strip() for line in open("../coordinates/5mm_raw/{}.txt".format(pmid), "r").readlines()]
#     coords = list(df_filt[df_filt["PMID"] == int(pmid)]["MNI_COORDS"])[0].split(";")
    source[pmid] = []
    if int(pmid) in bm_ids:
        source[pmid].append("brainmap")
    if int(pmid) in ns_ids:
        source[pmid].append("neurosynth")
    if int(pmid) in ac_ids:
        source[pmid].append("ace")
    
    #print("{}: {} mappings, {} coordinates".format(pmid, len(mappings), len(coords)))

In [29]:
for pmid in dif_length + not_in_meta:
    mappings = [line.strip() for line in open("../coordinates/5mm_raw/{}.txt".format(pmid), "r").readlines()]
    for i, mapping in enumerate(mappings):
        if "left" in mapping:
            mappings[i] = mapping.replace("right", "left")
    with open("../coordinates/5mm/{}.txt".format(pmid), "w+") as outfile:
        for line in mappings:
            outfile.write(line + "\n")