In [1]:
import pandas as pd 

# Lookup tables mapping PanglaoDB names to Wikidata identifiers
# (genes and cell types), read from earlier result files.
gene_reference = pd.read_csv(
    "../results/human_gene_reference_from_panglao_to_wikidata_04_11_2020.csv"
)

cell_type_reference = pd.read_csv(
    "../results/cell_type_reference_from_panglao_to_wikidata_31_10_2020.csv"
)

# The PanglaoDB marker dump is tab-separated.
markers = pd.read_csv(
    "../data/PanglaoDB_markers_27_Mar_2020.tsv",
    delimiter="\t",
)

In [2]:
from wikidata2df import wikidata2df

# SPARQL query: select every item that is typed (P31) as Q189118 ("cell type"),
# carries P703 -> Q15978631 (the "human cell type" restriction noted in the query),
# and has a P279 superclass that is itself typed as Q189118.
# Each result row holds the item, its label, and that superclass.

query = """
SELECT ?item ?itemLabel ?superclass
WHERE
{
?item wdt:P31 wd:Q189118. 
?item wdt:P279 ?superclass. 
?superclass  wdt:P31 wd:Q189118. # cell type
?item wdt:P703 wd:Q15978631. # human cell type
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

# Run the query via wikidata2df; the returned table is merged into
# `cell_type_reference` in a later cell, keyed on its "superclass" column.
dataframe_to_join = wikidata2df(query)

In [3]:
# Pair every reference row with the query rows whose "superclass" equals the
# row's "wikidata" value. The default inner join drops reference rows that
# have no matching query row.
# NOTE: this overwrites `cell_type_reference`; re-run the loading cell before
# re-running this one, otherwise the already-merged frame is merged again.
cell_type_reference = pd.merge(
    cell_type_reference,
    dataframe_to_join,
    left_on="wikidata",
    right_on="superclass",
)

In [4]:
# Keep only the marker rows whose "species" field mentions "Hs".
# `.str.contains(..., regex=False, na=False)` does a plain substring match, is
# vectorized, and treats a missing species value as "not human" instead of
# raising TypeError the way `"Hs" in val` does on NaN.
human_markers = markers[markers["species"].str.contains("Hs", regex=False, na=False)]

# Slim view with just the gene symbol and the PanglaoDB cell type name.
human_markers_lean = human_markers[["official gene symbol", "cell type"]]


In [5]:
# Build the final marker table starting from `human_markers` rather than from
# the previous value of `human_markers_lean`, so this cell can be re-run safely
# (the old form failed with KeyError on "cell type" once the columns had been
# renamed by an earlier run of this same cell).

# Step 1: map each PanglaoDB cell type name to the cell type item from the
# reference table. Inner join: markers with an unmapped cell type are dropped.
markers_with_cell_type_id = (
    human_markers[["official gene symbol", "cell type"]]
    .merge(cell_type_reference, left_on="cell type", right_on="panglao")
    .loc[:, ["official gene symbol", "cell type", "item"]]
    .rename(columns={"item": "cell type id"})
)

# Step 2: map each gene symbol to its Wikidata id from the gene reference.
# Inner join again: markers with an unmapped gene symbol are dropped.
markers_with_gene_id = (
    markers_with_cell_type_id
    .merge(gene_reference, left_on="official gene symbol", right_on="panglao")
    .loc[:, ["official gene symbol", "cell type", "cell type id", "wikidata"]]
    .rename(
        columns={
            "cell type": "cell type (general)",
            "cell type id": "cell type id (human)",
            "wikidata": "gene id",
        }
    )
)

# The joins can yield repeated (gene, cell type) pairs; keep each one once.
# The index is intentionally not reset, matching the previously exported file.
human_markers_lean = markers_with_gene_id.drop_duplicates()
human_markers_lean

Unnamed: 0,official gene symbol,cell type (general),cell type id (human),gene id
0,CEBPA,Adipocyte progenitor cells,Q101404942,Q17861031
1,CEBPA,Adipocyte progenitor cells,Q101404943,Q17861031
2,CEBPA,Basophils,Q101405089,Q17861031
3,CEBPA,Hepatoblasts,Q101404910,Q17861031
4,CEBPA,Hepatocytes,Q101405101,Q17861031
...,...,...,...,...
7991,SLC14A1,Urothelial cells,Q101404883,Q18031563
7992,UPK3A,Urothelial cells,Q101404883,Q18032295
7993,UPK1A,Urothelial cells,Q101404883,Q18036055
7994,UPK2,Urothelial cells,Q101404883,Q18032294


In [6]:
# Save the marker table to disk; the DataFrame index is written as the first,
# unnamed column because `index` is left at its default.
human_markers_lean.to_csv(
    path_or_buf="../src/human_markers_to_add_to_wikidata_27_11_2020.csv"
)

In [10]:
# Write one pipe-separated QuickStatements command per marker row:
#   <cell type QID>|P8872|<gene QID>|S813|<date>|S854|<url>|S248|<QID>
# P8872 is presumably the "has marker" property and the S-prefixed fields are
# the statement's source (retrieval date, reference URL, stated-in item) --
# confirm against Wikidata before uploading.
#
# Fix: each command is now terminated with "\n". Previously nothing separated
# the commands, so the whole batch ended up on a single line and could not be
# parsed as individual statements.

# The source part is identical for every statement, so build it once.
reference_suffix = (
    "|S813|" + "+2020-11-27T00:00:00Z/11"
    + "|S854|" + '"https://panglaodb.se/markers.html"'
    + "|S248|" + "Q99936939"
)

# Plain "w" is enough: the file is only written, never read back.
with open("../quickstatements/markers_to_wikidata_27_11_2020.qs", "w") as f:
    # Iterate over the two needed columns directly instead of building a
    # Series per row with iterrows().
    for cell_type_id, gene_id in zip(
        human_markers_lean["cell type id (human)"],
        human_markers_lean["gene id"],
    ):
        f.write(cell_type_id + "|P8872|" + gene_id + reference_suffix + "\n")