In [1]:
from collections import defaultdict
import pandas as pd
import re
import sys
import time
import mwclient


sys.path.append("../oats")
from oats.utils.utils import flatten

In [2]:
def gene_to_snp_texts(site, gene_name):
    snp_name_to_raw_text = {}
    gene_page = mwclient.page.Page(site,gene_name)
    snp_category_str = "Category:Is a snp"
    for linked_page in gene_page.links():
        if snp_category_str in [c.name for c in linked_page.categories()]:
            snp_name = linked_page.name
            raw_text = linked_page.text()
            snp_name_to_raw_text[snp_name] = raw_text
    return(snp_name_to_raw_text)


In [3]:
def remove_with_regex(pattern, text, replace_with=""):
    matches = re.findall(pattern, text)
    for match in matches:
        text = text.replace(match[0], replace_with)
    return(text.strip())


In [4]:
def clean_raw_page_text(text):
    
    # Remove all newline characters from the text
    text = text.replace("\n","").replace("\r","").replace("\t","")
    
    # Remove all the text that is inside of double curly braces, i.e from tables.
    text = remove_with_regex(r'(\{\{(.|\n)*?\}\})', text)
        
    # Remove the square brackets that indicate links, but keep the interior strings that aren't links.
    text = remove_with_regex(r'((www|http:|https:)+[^\s]+[\w])', text)
    text = text.replace("[","").replace("]","")
    
    
    return(text.strip())


In [5]:
# Get a list of the genes on SNPedia.
site = mwclient.Site('bots.snpedia.com', path="/")
snpedia_gene_names = [page.name for page in site.Categories["Is_a_gene"]]
print(len(snpedia_gene_names))
print(snpedia_gene_names[:10])

2161
['A4GALT', 'AANAT', 'AARS', 'AARS2', 'ABCA1', 'ABCA12', 'ABCA3', 'ABCA4', 'ABCA7', 'ABCB1']


In [6]:
#for k,v in gene_to_snp_texts(site, "CCR5").items():
#    print()
#    print(k)
#    print(clean_raw_page_text(v))

In [7]:
# We might need to scrape for all the gene names in SNPEedia, because we can only use the ones mentioned in KEGG.
kegg_filename = "/Users/irbraun/phenologs-with-oats/outputs/06_30_2020_h15m05s52_r1082/part_1_kegg_groupings.csv"
kegg_df = pd.read_csv(kegg_filename)
kegg_df = kegg_df[kegg_df["species"]=="hsa"]
kegg_gene_names = flatten([x.split("|") for x in kegg_df["gene_names"].values])
kegg_gene_names = [g.upper() for g in kegg_gene_names]
genes_in_snpedia_and_kegg = list(set(kegg_gene_names).intersection(set(snpedia_gene_names)))
print(len(genes_in_snpedia_and_kegg))

1371


  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# The web scraping step.
genes_to_snps_to_text = defaultdict(dict)
for i,gene_name in enumerate(genes_in_snpedia_and_kegg,1):
    genes_to_snps_to_text[gene_name] = gene_to_snp_texts(site, gene_name)
    if i%50 == 0:
        time.sleep(10)
        print(i)
    if i%500 == 0:
        break
print("done")

50
100
150
200
250
300
350
400
450
500
done


In [10]:
concatenated_text_dict = {g:" ".join([text for text in genes_to_snps_to_text[g].values()]) for g in genes_to_snps_to_text}
cleaned_text_dict = {g:clean_raw_page_text(text) for g,text in concatenated_text_dict.items()}
len(cleaned_text_dict)

500

In [16]:
# Make a dataframe that has this information in it.
row_tuples = []
for gene,text in cleaned_text_dict.items():
    row_tuples.append(("hsa",gene,text))
        

df = pd.DataFrame(row_tuples, columns=["species","gene_names","description"])
df["gene_synonyms"] = ""
df["term_ids"] = ""
df["sources"] = "SNPedia"

df.to_csv("snpedia_reshaped_data.csv")

df.head(20)        

Unnamed: 0,species,gene_names,description,gene_synonyms,term_ids,sources
0,hsa,HLA-G,"rs12722477, also known as Leu134Ile, a SNP of ...",,,SNPedia
1,hsa,HDC,Relation of polymorphism of the histidine deca...,,,SNPedia
2,hsa,ADCY3,"rs541941351, also known as c.191A>T or p.Asn64...",,,SNPedia
3,hsa,RAD51D,Mentioned as a pathogenic/likely pathogenic mu...,,,SNPedia
4,hsa,OR7D4,"rs5020278, a SNP in the OR7D4 olfactory recept...",,,SNPedia
5,hsa,FAAH,"rs324420, also known as c.385C>A or Pro129Thr,...",,,SNPedia
6,hsa,DHRS3,,,,SNPedia
7,hsa,SULT2A1,,,,SNPedia
8,hsa,POLE,"aka c.955G>A, p.Asp319AsnHeterozygotes are pre...",,,SNPedia
9,hsa,HHEX,rs10811661 replicated as significant for type-...,,,SNPedia
