In [4]:
from collections import defaultdict
import pandas as pd
import re
import sys
import time
import mwclient

import cleaning
import querying

sys.path.append("../oats")
from oats.utils.utils import flatten

In [2]:
# Get a list of the genes on SNPedia.
site = mwclient.Site('bots.snpedia.com', path="/")
snpedia_gene_names = [page.name for page in site.Categories["Is_a_gene"]]
print(len(snpedia_gene_names))
print(snpedia_gene_names[:10])

2161
['A4GALT', 'AANAT', 'AARS', 'AARS2', 'ABCA1', 'ABCA12', 'ABCA3', 'ABCA4', 'ABCA7', 'ABCB1']


In [3]:
# Looking at just a subset of them for now.
snpedia_gene_names = snpedia_gene_names[1:20]

In [6]:
# We might need to scrape for all the gene names in SNPEedia, because we can only use the ones mentioned in KEGG.
kegg_filename = "/Users/irbraun/phenologs-with-oats/outputs/06_30_2020_h15m05s52_r1082/part_1_kegg_groupings.csv"
kegg_df = pd.read_csv(kegg_filename)
kegg_df = kegg_df[kegg_df["species"]=="hsa"]
kegg_gene_names = flatten([x.split("|") for x in kegg_df["gene_names"].values])
kegg_gene_names = [g.upper() for g in kegg_gene_names]
genes_in_snpedia_and_kegg = list(set(kegg_gene_names).intersection(set(snpedia_gene_names)))
print(len(genes_in_snpedia_and_kegg))

18


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# The web scraping step.
gene_num_limit = 500
pause_after = 50
genes_to_snps_to_text = defaultdict(dict)
for i,gene_name in enumerate(genes_in_snpedia_and_kegg,1):
    genes_to_snps_to_text[gene_name] = querying.gene_to_snp_texts(site, gene_name)
    if i%pause_after == 0:
        time.sleep(10)
        print(i)
    if i%gene_num_limit == 0:
        break
print("Completed the web scraping step.")

done


In [12]:
# Producing a dataset in CSV format that shows genes, SNPs, and the text that was cleaned from each page.

# Create each row one at a time.
row_tuples = []
for gene in genes_to_snps_to_text.keys():
    for snp,raw_text in genes_to_snps_to_text[gene].items():
        cleaned_text = cleaning.clean_raw_page_text(raw_text)
        row_tuples.append((gene,snp,cleaned_text))
        
# Generate the dataframe and subset to only include SNPs that had some amount of text extracted, and save as CSV file.
df = pd.DataFrame(row_tuples, columns = ["gene","snp","text"])
df = df[df["text"] != ""]
df.to_csv("dataset_sample.csv", index=False)
df.head(10)

Unnamed: 0,gene,snp,text
1,ABCB1,Rs10248420,rs10248420 is a SNP in the ABCB1 gene (also kn...
4,ABCB1,Rs1045642,"rs1045642, also known as C3435T, is a SNP loca..."
6,ABCB1,Rs1128503,rs1128503 is a SNP in the transporter P-glycop...
7,ABCB1,Rs11983225,rs11983225 is a SNP in the ABCB1 gene (also kn...
10,ABCB1,Rs12720067,rs12720067 is a SNP in the ABCB1 gene (also kn...
12,ABCB1,Rs2032582,"rs2032582, also known as G2677T, is a nonsynon..."
13,ABCB1,Rs2032583,rs2032583 is a SNP in the ABCB1 gene (also kno...
20,ABCB1,Rs2235015,rs2235015 is a SNP in the ABCB1 gene (also kno...
21,ABCB1,Rs2235035,C allele linked to colorectal cancers Identify...
22,ABCB1,Rs2235040,rs2235040 is a SNP in the ABCB1 gene (also kno...


In [14]:
# Produce a dataset in a format that can be used by the oats package.
concatenated_text_dict = {g:" ".join([text for text in genes_to_snps_to_text[g].values()]) for g in genes_to_snps_to_text}
cleaned_text_dict = {g:cleaning.clean_raw_page_text(text) for g,text in concatenated_text_dict.items()}
len(cleaned_text_dict)

18

In [16]:
# Make a dataframe that has this information in it.
row_tuples = []
for gene,text in cleaned_text_dict.items():
    row_tuples.append(("hsa",gene,text))

# Generate the dataframe and save as a CSV file.
df = pd.DataFrame(row_tuples, columns=["species","gene_names","description"])
df["gene_synonyms"] = ""
df["term_ids"] = ""
df["sources"] = "SNPedia"
df.to_csv("dataset_sample_reshaped_for_oats.csv", index=False)
df.head(10)        

Unnamed: 0,species,gene_names,description,gene_synonyms,term_ids,sources
0,hsa,ABCB1,rs10248420 is a SNP in the ABCB1 gene (also kn...,,,SNPedia
1,hsa,ABCC2,"rs2273697, also known as c1249G>A or p.V471I, ...",,,SNPedia
2,hsa,ABCC4,The human multidrug resistance protein 4 (MRP4...,,,SNPedia
3,hsa,ABCC8,Familial Hyperinsulinism (ABCC8-related)/ Fami...,,,SNPedia
4,hsa,ABCA3,"aka c.316C>T rs149989682, also known as c.875A...",,,SNPedia
5,hsa,ABCC11,"rs17822931, also known as c.538G>A or G180R, i...",,,SNPedia
6,hsa,ABCC6,"A systems genetics approach implicates USF1, F...",,,SNPedia
7,hsa,ABCC9,A K(ATP) channel gene effect on sleep duration...,,,SNPedia
8,hsa,ABCB4,"rs45575636, also known as c.1769G>A, p.Arg590G...",,,SNPedia
9,hsa,ABCD1,"aka c.488G>A (p.Arg163His), c.488G>T (p.Arg163...",,,SNPedia
