In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from Bio import Entrez
from tqdm import tqdm
import pickle

In [9]:
# Read data/SFARI-Genes.csv into a DataFrame, then keep its dimensions and
# column labels available under short names.
data = pd.read_csv(r'data/SFARI-Genes.csv')
N, D = data.shape  # number of rows, number of columns
cols = data.columns

In [10]:
def save_to_txt_file(contents, filename):
    """Write a string to ``<current working directory>/results/<filename>``.

    The file is opened in text write mode, so an existing file with the same
    name is overwritten.

    Args:
        contents: Text to write.
        filename: Name of the file inside the ``results`` directory.
    """
    results_dir = os.getcwd() + "/results/"

    with open(results_dir + filename, "w") as out_file:
        out_file.write(contents)
        
def save_to_pickle_file(contents, filename):
    """Pickle an object to ``<current working directory>/data/<filename>``.

    The object is serialized with ``pickle.HIGHEST_PROTOCOL``; an existing
    file with the same name is overwritten.

    Args:
        contents: Any picklable object.
        filename: Name of the file inside the ``data`` directory.
    """
    data_dir = os.getcwd() + "/data/"

    with open(data_dir + filename, "wb") as out_file:
        pickle.dump(contents, out_file, protocol=pickle.HIGHEST_PROTOCOL)
        
def open_pickle_file(filename):
    """Load and return the object pickled at ``<current working directory>/data/<filename>``.

    Note: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.

    Args:
        filename: Name of the pickle file inside the ``data`` directory.

    Returns:
        The unpickled object.
    """
    source_path = os.getcwd() + "/data/" + filename

    with open(source_path, "rb") as in_file:
        return pickle.load(in_file)

def save_to_npy_file(contents, filename):
    """Save an array with ``np.save`` under ``<current working directory>/data/<filename>``.

    Args:
        contents: Array (or array-like) data to store.
        filename: Name of the target file inside the ``data`` directory.
    """
    destination = os.getcwd() + "/data/" + filename
    np.save(destination, contents)
    
def open_npy_file(filename):
    """Load and return the array stored at ``<current working directory>/data/<filename>``.

    The file is read with ``allow_pickle=True`` so that object arrays can be
    loaded. Unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.

    Args:
        filename: Name of the ``.npy`` file inside the ``data`` directory.

    Returns:
        Whatever ``np.load`` produces for the file (previously the result was
        discarded and ``None`` was returned).
    """
    return np.load(os.getcwd() + "/data/" + filename, allow_pickle=True)
    
def parse_eid():
    """Parse ``<current working directory>/results/task2-1-eid.txt`` into a dict.

    Each non-blank line is expected to look like ``GENE: id1, id2, ...``.
    The text before the first ``": "`` is the key; the text after it is split
    on ``", "`` into a list of ID strings.

    A line without a ``": "`` separator (a gene with no IDs listed) maps to an
    empty list. The previous implementation indexed the missing second field
    in that case and raised ``IndexError``. Blank lines are skipped.

    Returns:
        dict mapping each gene string to a list of ID strings.
    """
    DIR = os.getcwd() + "/results/task2-1-eid.txt"
    eid = {}

    with open(DIR) as f:
        for raw_line in f:
            line = raw_line.replace("\n", "")

            # Blank lines carry no gene; skip them rather than storing "" as a key.
            if not line.strip():
                continue

            fields = line.split(": ")
            gene = fields[0]

            # Always store a list so callers can join/iterate uniformly.
            eid[gene] = fields[1].split(", ") if len(fields) > 1 else []

    return eid

def get_entrezs(symbols):
    """Look up every symbol in the module-level ``eid`` mapping.

    Args:
        symbols: Iterable of keys present in ``eid``.

    Returns:
        dict mapping each symbol to ``eid[symbol]``.

    Raises:
        KeyError: If a symbol is not a key of ``eid``.
    """
    return {symbol: eid[symbol] for symbol in symbols}

In [11]:
# Parse the gene -> ID-list mapping once; get_entrezs() reads this module-level `eid`.
eid = parse_eid()

# Task 1

In [12]:
# Select the gene symbols of all rows whose "gene-score" equals 1.
gene_scores = data["gene-score"]
score_indexes = gene_scores.index[gene_scores == 1]  # row labels with score 1
symbols = data["gene-symbol"].loc[score_indexes]

print(symbols)

6         ACTB
14        ADNP
16        ADSL
17        AFF2
27       AHDC1
         ...  
984       UBR1
998     ZBTB20
1004    ZNF292
1006    ZNF462
1019    ZMYND8
Name: gene-symbol, Length: 206, dtype: object


In [19]:
ncbi_ids = get_entrezs(symbols)

# One output line per symbol: that symbol's IDs separated by single spaces.
id_lines = []
for s in symbols:
    id_lines.append(" ".join(ncbi_ids[s]) + "\n")
txt = "".join(id_lines)

save_to_txt_file(txt, "task3.1-gs1-ids.txt")