In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from Bio import Entrez
from tqdm import tqdm
import pickle

In [9]:
# Load the gene table from data/SFARI-Genes.csv and keep its column labels
# and dimensions around for later cells.
data = pd.read_csv(r'data/SFARI-Genes.csv')
cols = data.columns
N = data.shape[0]  # number of rows
D = data.shape[1]  # number of columns

In [10]:
def save_to_txt_file(contents, filename):
    """Write the string `contents` to `<cwd>/results/<filename>`.

    The file is opened in text mode "w", so an existing file of the same name
    is overwritten. The `results` directory is not created by this function.
    """
    target = "/".join((os.getcwd(), "results", filename))

    with open(target, "w") as out:
        out.write(contents)
        
def save_to_pickle_file(contents, filename):
    """Pickle `contents` into `<cwd>/data/<filename>`.

    The object is serialized with `pickle.HIGHEST_PROTOCOL`; an existing file
    of the same name is overwritten.
    """
    target = "/".join((os.getcwd(), "data", filename))

    with open(target, "wb") as out:
        pickle.dump(contents, out, pickle.HIGHEST_PROTOCOL)
        
def open_pickle_file(filename):
    """Return the object unpickled from `<cwd>/data/<filename>`.

    NOTE: unpickling can execute arbitrary code, so only open files that come
    from a trusted source.
    """
    source = "/".join((os.getcwd(), "data", filename))

    with open(source, "rb") as src:
        return pickle.load(src)

def save_to_npy_file(contents, filename):
    """Store `contents` with `np.save` at `<cwd>/data/<filename>`.

    Per the NumPy documentation, `np.save` appends ".npy" to a file name that
    does not already end with it.
    """
    target = "/".join((os.getcwd(), "data", filename))
    np.save(target, contents)
    
def open_npy_file(filename):
    """Load and return the array stored at `<cwd>/data/<filename>`.

    Args:
        filename: Name of the file inside the `data` directory. If no file
            with that exact name exists but `<filename>.npy` does, the latter
            is loaded instead.

    Returns:
        Whatever `np.load(..., allow_pickle=True)` yields for the file
        (previously the loaded value was discarded and None was returned).

    Raises:
        OSError: if neither candidate file can be opened.
    """
    path = os.getcwd() + "/data/" + filename

    # np.save appends ".npy" to names lacking that suffix, so something written
    # via save_to_npy_file(x, "foo") lives at "foo.npy"; accept the bare name too.
    if not os.path.exists(path) and os.path.exists(path + ".npy"):
        path += ".npy"

    # SECURITY: allow_pickle=True lets object arrays load, but unpickling can
    # run arbitrary code -- only use this on files from a trusted source.
    return np.load(path, allow_pickle=True)
    
def parse_eid():
    """Parse `<cwd>/results/task2-1-eid.txt` into a gene -> ID-list mapping.

    Each non-empty line is expected to look like ``GENE: id1, id2, ...``.

    Returns:
        dict: the text before ": " mapped to the list of strings obtained by
        splitting the text after it on ", ". A line without a ": " separator
        maps the whole line to an empty list. Blank lines are skipped.

    Raises:
        OSError: if the file cannot be opened.
    """
    path = os.getcwd() + "/results/task2-1-eid.txt"
    eid = {}

    with open(path) as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:
                # A blank line carries no gene; skip it.
                continue

            parts = line.split(": ")
            gene = parts[0]

            # Lines without a separator used to hit parts[1] and raise
            # IndexError; record them as a gene with no IDs so every value in
            # the mapping is a list.
            eid[gene] = parts[1].split(", ") if len(parts) > 1 else []

    return eid

def get_entrezs(symbols):
    """Map each item of `symbols` to its entry in the module-level `eid` dict.

    Returns a new dict keyed by symbol, in iteration order of `symbols`.
    Raises KeyError if a symbol is not present in `eid`.
    """
    return {symbol: eid[symbol] for symbol in symbols}

In [11]:
# Build the gene -> ID-list lookup from results/task2-1-eid.txt.
# get_entrezs reads this module-level `eid`, so this cell must run before it is called.
eid = parse_eid()

# Task 1

In [12]:
# Select the gene symbols of every row whose "gene-score" equals 1.
gene_scores = data["gene-score"]
top_score_mask = gene_scores == 1
score_indexes = gene_scores.index[top_score_mask]  # row labels of the matching rows
symbols = data["gene-symbol"].filter(items=score_indexes, axis=0)

print(symbols)

6         ACTB
14        ADNP
16        ADSL
17        AFF2
27       AHDC1
         ...  
984       UBR1
998     ZBTB20
1004    ZNF292
1006    ZNF462
1019    ZMYND8
Name: gene-symbol, Length: 206, dtype: object


In [14]:
# Look up the IDs stored in `eid` for each selected symbol.
# get_entrezs raises KeyError if a symbol has no entry in `eid`.
ncbi_ids = get_entrezs(symbols)

print(ncbi_ids)

{'ACTB': ['60', '728378'], 'ADNP': ['23394'], 'ADSL': ['158'], 'AFF2': ['2334'], 'AHDC1': ['27245'], 'ALDH5A1': ['7915'], 'ANK2': ['287'], 'ANK3': ['288'], 'ANKRD11': ['29123'], 'ARHGEF9': ['23229'], 'ARID1B': ['57492'], 'ARX': ['170302', '10054', '170640'], 'ASH1L': ['55870'], 'ASXL3': ['80816'], 'ATRX': ['546'], 'AUTS2': ['26053', '282553'], 'AP2S1': ['1175'], 'BAZ2B': ['29994'], 'BCKDK': ['10295'], 'BCL11A': ['53335'], 'BRAF': ['673'], 'BRSK2': ['9024'], 'CACNA1A': ['773'], 'CACNA1C': ['775'], 'CACNA1E': ['777'], 'CACNA2D3': ['55799'], 'CAPRIN1': ['4076'], 'CASK': ['8573'], 'CASZ1': ['54897'], 'CDKL5': ['6792'], 'CELF4': ['56853'], 'CHAMP1': ['283489'], 'CHD2': ['1106', '1826'], 'CHD3': ['1107'], 'CHD7': ['55636', '338420'], 'CHD8': ['57680'], 'CIC': ['23152', '93977'], 'CNOT3': ['4849'], 'CREBBP': ['1387'], 'CSDE1': ['7812'], 'CTCF': ['10664'], 'CTNNB1': ['1499'], 'CUL3': ['8452'], 'CORO1A': ['11151'], 'DDX3X': ['1654'], 'DEAF1': ['10522'], 'DHCR7': ['1717'], 'DIP2A': ['23181'], 'D