The entities are now on Wikidata, but Wikidata has no `has_positive_marker` property yet. In the meantime, we can build a local RDF file that identifies the cell types and genes by their Wikidata IDs.

In [1]:
import pandas as pd 

# Raw PanglaoDB marker table (tab-separated).
markers = pd.read_csv(
    "../data/PanglaoDB_markers_27_Mar_2020.tsv",
    sep="\t",
)

# Lookup tables produced earlier that map PanglaoDB names to Wikidata IDs:
# one for cell types, one for human genes.
cell_type_reference = pd.read_csv(
    "../results/cell_type_reference_from_panglao_to_wikidata_31_10_2020.csv"
)
gene_reference = pd.read_csv(
    "../results/human_gene_reference_from_panglao_to_wikidata_04_11_2020.csv"
)

However, we want the species-specific cell types (here, the human ones) rather than the generic ones, so let's fetch them from Wikidata.

In [2]:
from wikidata2df import wikidata2df

# SPARQL query for the human-specific cell type items on Wikidata.
# It returns every ?item that is an instance (P31) of cell type (Q189118),
# is linked through P361 to a ?superclass that is itself a cell type, and
# carries P703 = Q15978631 (marked "human cell type" in the query below).
# The ?superclass column is the generic cell type the item derives from.

query = """
SELECT ?item ?itemLabel ?superclass
WHERE
{
?item wdt:P31 wd:Q189118. 
?item wdt:P361 ?superclass. 
?superclass  wdt:P31 wd:Q189118. # cell type
?item wdt:P703 wd:Q15978631. # human cell type
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

# Runs the query (network call) and stores the result for the join below,
# which uses its "superclass" column as the key.
dataframe_to_join = wikidata2df(query) 

In [3]:
# Join each generic cell type (column "wikidata") to its human-specific item
# (matched on "superclass"). The default inner join keeps only the cell types
# that have a human-specific counterpart in the query result.
# NOTE: this overwrites cell_type_reference, so re-running the cell without
# re-loading the inputs joins an already-joined frame.
cell_type_reference = pd.merge(
    cell_type_reference,
    dataframe_to_join,
    left_on="wikidata",
    right_on="superclass",
)

In [4]:
# Persist the human cell type lookup table. to_csv writes the index by
# default, so the file gains an extra unnamed leading column.
cell_type_reference.to_csv(
    "../results/human_cell_type_reference_13_11_2020.csv"
)

# Preview the first five rows as the cell output.
cell_type_reference.head(5)

Unnamed: 0.1,Unnamed: 0,panglao,wikidata,item,superclass,itemLabel
0,18,Decidual cells,Q5249153,Q101404881,Q5249153,human decidual cell
1,36,Smooth muscle cells,Q66508979,Q101404901,Q66508979,human smooth muscle cell
2,55,Epsilon cells,Q4532277,Q101404922,Q4532277,human epsilon cell
3,72,Myoblasts,Q1956694,Q101404940,Q1956694,human myoblast
4,73,Stromal cells,Q4381253,Q101404941,Q4381253,human stromal cell


In [5]:
# Keep the markers whose "species" field mentions human ("Hs").
# The vectorised substring test replaces a Python-level loop; regex=False
# keeps it a literal match, and na=False treats a missing species as
# "not human" instead of raising TypeError on `"Hs" in nan`.
human_markers = markers[markers["species"].str.contains("Hs", regex=False, na=False)]

In [6]:
# Keep only the two columns needed for the marker statements:
# the gene symbol and the PanglaoDB cell type name.
human_markers_lean = human_markers.loc[:, ["official gene symbol", "cell type"]]

# Save this lean table (index included, the to_csv default) and preview it.
human_markers_lean.to_csv(
    "../results/human_markers_reconciled_13_11_2020.csv"
)
human_markers_lean.head(5)

Unnamed: 0,official gene symbol,cell type
0,CTRB1,Acinar cells
1,KLK1,Acinar cells
2,RBPJL,Acinar cells
3,PTF1A,Acinar cells
5,CELA3A,Acinar cells


In [7]:
# Attach the human-specific cell type item to every marker row by matching
# the PanglaoDB cell type name, then expose that item as "cell type id".
# The inner join drops markers whose cell type has no human-specific item.
human_markers_lean = (
    human_markers_lean
    .merge(cell_type_reference, left_on="cell type", right_on="panglao")
    .loc[:, ["official gene symbol", "cell type", "item"]]
    .rename(columns={"item": "cell type id"})
)

In [8]:
# Attach the Wikidata gene item to every marker row by matching the gene
# symbol against the "panglao" column of the gene lookup, then expose it as
# "gene id". The inner join drops markers whose gene has no match.
human_markers_lean = (
    human_markers_lean
    .merge(gene_reference, left_on="official gene symbol", right_on="panglao")
    .loc[:, ["official gene symbol", "cell type", "cell type id", "wikidata"]]
    .rename(columns={"wikidata": "gene id"})
)

# Show the final (cell type id, gene id) table as the cell output.
human_markers_lean

Unnamed: 0,official gene symbol,cell type,cell type id,gene id
0,XCL1,Decidual cells,Q101404881,Q8041711
1,IGFBP1,Decidual cells,Q101404881,Q18027673
2,XCL2,Decidual cells,Q101404881,Q8041712
3,CTSW,Decidual cells,Q101404881,Q5052493
4,IL32,Decidual cells,Q101404881,Q18033933
...,...,...,...,...
362,COL15A1,Stromal cells,Q101404941,Q5145899
363,GDF10,Stromal cells,Q101404941,Q14912142
364,COL4A1,Stromal cells,Q101404941,Q5145886
365,WNT2,Stromal cells,Q101404941,Q18032359


In [12]:
import rdflib

# Build a Turtle file with one triple per (cell type, gene) marker pair:
#   <wd:cell type id>  <ctp:P9>  <wd:gene id>

# Namespaces are the same for every row, so create them once instead of
# on every loop iteration.
ctp = rdflib.Namespace("http://celltypes.wiki.opencura.com/entity/")
wd = rdflib.Namespace("http://www.wikidata.org/entity/")
wdt = rdflib.Namespace("http://www.wikidata.org/prop/direct/")

g = rdflib.Graph()

# Register the prefixes once. "wdt" is not used by any triple below, but it
# stays bound as in the original cell.
g.bind("ctp", ctp)
g.bind("wd", wd)
g.bind("wdt", wdt)

# Predicate for every statement: property P9 of the celltypes wikibase.
# NOTE(review): presumably the local "has positive marker" property,
# which Wikidata itself lacks; confirm against that wikibase.
marker_property = rdflib.term.URIRef(ctp + "P9")

# Only the two ID columns are needed, so iterate over them directly rather
# than materialising a Series per row with iterrows().
# The explicit `wd + id` concatenation is kept on purpose: it raises if an
# ID is not a string instead of silently producing a bare namespace URI.
for cell_type_id, gene_id in zip(
    human_markers_lean["cell type id"], human_markers_lean["gene id"]
):
    subject = rdflib.term.URIRef(wd + cell_type_id)
    marker_gene = rdflib.term.URIRef(wd + gene_id)
    g.add((subject, marker_property, marker_gene))

g.serialize(destination='../results/human_cell_type_markers_13_11_2020.ttl', format='turtle')