In [1]:
"""A module that makes downloading and using many GSE datasets fast and easy.

Author: Joshua Blanchard, last update: 4/28/2021

A typical workflow:

    Download wanted GSE family files using the download() function.

    Extract content from these files using the extract() function.

    Use the family_dict() function to convert data to pandas.DataFrame objects.

    With the info() function, perform data analysis on the previously created pandas.DataFrame objects.
"""

'A module that makes downloading and using many GSE datasets fast and easy.\n\nAuthor: Joshua Blanchard, last update: 4/28/2021\n\nA typical workflow:\n\n    Download wanted GSE family files using the download() function.\n\n    Extract content from these files using the extract() function.\n\n    Use the family_dict() function to convert data to pandas.DataFrame objects.\n\n    With the info() function, perform data analysis on the previously created pandas.DataFrame objects.\n'

In [28]:
import pandas as pd
import os
import numpy as np
import re as re
import xmltodict
import shutil
import ftplib
import tarfile
import pickle
import gzip
# import requests
# import wget

In [3]:
# Directory (relative to the current working directory) where downloaded GSE archives are
# written and where each family's extracted files live, in a sub-directory named after the family.
__base_path = "data"

In [4]:
# Directory (relative to the current working directory) where family_dict() stores its pickled cache files.
__hidden_path = ".aidp_files"

In [5]:
def family_dict(GSE_family):
    """
    Build (or load from cache) the per-sample DataFrames of one GSE family.

    Parameters
    ----------
    GSE_family : str
        Family ID, e.g. "GSE41037". Sample files are read from the directory
        returned by __family_path(GSE_family).

    Returns
    -------
    dict
        Keys are sample IDs ("GSM" followed by digits, taken from the start of each
        file name); values are the pandas.DataFrame objects returned by __load_file().

    Notes
    -----
    The dictionary is pickled to "<__hidden_path>/<GSE_family>_dict" the first time it
    is built and is loaded from there on every later call. Delete that file to force
    a rebuild (for example after re-downloading the family).
    """

    os.makedirs(__hidden_path, exist_ok=True)

    dict_path = __hidden_path + "/" + GSE_family + "_dict"
    if os.path.exists(dict_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only cache files
        # written by this module should ever be placed in the cache directory.
        with open(dict_path, 'rb') as dict_file:
            return pickle.load(dict_file)

    family_directory = __family_path(GSE_family)
    samples = {}
    for file_name in os.listdir(family_directory):
        # A sample file is one whose name starts with "GSM<digits>"; anything else
        # (including "GSM"-prefixed names without digits) is skipped.
        id_match = re.match(r"GSM\d+", file_name)
        if id_match is None:
            continue
        samples[id_match.group(0)] = __load_file(os.path.join(family_directory, file_name))

    # Write to a temporary file first so an interrupted dump never leaves a
    # truncated cache that later calls would try (and fail) to load.
    tmp_path = dict_path + ".tmp"
    try:
        with open(tmp_path, 'wb') as dict_file:
            pickle.dump(samples, dict_file)
        os.replace(tmp_path, dict_path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    return samples


In [6]:
def __family_path(GSE_family):
    """
    Return the path of the directory that holds a family's extracted files.

    The result is __base_path joined with the family ID and ends with a "/".
    Files are expected to have been placed there by download() followed by extract().
    """

    family_dir_name = GSE_family + "/"
    return os.path.join(__base_path, family_dir_name)


In [7]:
def __load_file(file_directory):
    """
    Load one sample file into a pandas.DataFrame.

    Parameters
    ----------
    file_directory : str
        Path of the sample's text file (despite the parameter name, this must be a
        file, not a directory).

    Returns
    -------
    pandas.DataFrame
        Indexed by the first column of the file, with a single "measurement" column
        holding the second column. Values are kept exactly as __clean() returns them
        (strings matched from the file); no numeric conversion is done here.

    Raises
    ------
    PermissionError, IsADirectoryError
        Re-raised (after printing a hint) when the path cannot be opened as a file,
        which typically means a directory path was passed.
    """

    try:
        clean_dict = __clean(file_directory)
    except (PermissionError, IsADirectoryError):
        # Opening a directory raises PermissionError on Windows and
        # IsADirectoryError on POSIX. Re-raise so the caller gets the real error
        # instead of a confusing failure on an unassigned variable below.
        print("You likely inputted the path of a directory, not a file.")
        raise

    return pd.DataFrame(data= clean_dict["col_2"], index= clean_dict["col_1"], columns= ["measurement"])


In [8]:
def __clean(file_path):
    """
    Split a tab-separated text file into its first two columns.

    A line is valid when it starts with two runs of non-whitespace characters
    separated by a single tab; any further columns on that line are ignored.
    Every other line (headers containing spaces, blank lines, ...) is collected
    unchanged as a bad row.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    dict
        "col_1": list of first-column strings of the valid lines, in file order
        "col_2": list of second-column strings of the valid lines, in file order
        "bad_rows": list of the invalid lines exactly as read (newline included)
    """

    two_columns = re.compile(r"(\S+)\t(\S+)")

    col_1 = []
    col_2 = []
    not_valid_rows = []

    # The context manager closes the file even if reading fails part-way.
    with open(file_path, 'r') as file:
        for line in file:
            # match() anchors at the start of the line, so only the first two columns count
            line_match = two_columns.match(line)
            if line_match:
                col_1.append(line_match.group(1))
                col_2.append(line_match.group(2))
            else:
                not_valid_rows.append(line)

    return {"col_1":col_1, "col_2":col_2, "bad_rows":not_valid_rows}


In [9]:
def info(GSE_family, sample_id, info):
    
    """
    Intended to return a piece of information about one sample of a GSE family.

    NOTE: this function is not implemented yet. Its body consists of this docstring
    only, so every call currently returns None regardless of the arguments.

    Parameters (as planned):

    GSE_family: ID of the family the sample belongs to.
    sample_id: ID of the wanted sample.
    info: name of the wanted piece of information. Documented value so far:

    "age": The unit of the outputted age is the same as how it is documented by the study (typically in years).
    """
    
#             match = re.search(r"(a|A)(g|G)(e|E)", tag)


In [52]:
# NOTE(review): series() is defined in a cell further down, so on a fresh kernel
# that cell (and the helpers it uses) must be run before this one.
series("GSE20236")

Unnamed: 0_level_0,GSM507152,GSM507153,GSM507154,GSM507155,GSM507156,GSM507157,GSM507158,GSM507159,GSM507160,GSM507161,...,GSM507235,GSM507236,GSM507237,GSM507238,GSM507239,GSM507240,GSM507241,GSM507242,GSM507243,GSM507244
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg00000292,0.792011,0.796509,0.763023,0.820765,0.773299,0.747340,0.713949,0.813502,0.778094,0.766275,...,0.827582,0.778556,0.806697,0.832015,0.803638,0.832121,0.757736,0.761311,0.797745,0.828347
cg00002426,0.840962,0.826457,0.815775,0.843155,0.795394,0.807331,0.798747,0.813411,0.810941,0.840154,...,0.801613,0.803991,0.807978,0.804708,0.801489,0.774900,0.822104,0.832229,0.752009,0.707571
cg00003994,0.055870,0.059189,0.058670,0.050900,0.055990,0.055881,0.052813,0.056153,0.054572,0.049189,...,0.070055,0.048356,0.044994,0.042610,0.042428,0.049025,0.061797,0.057777,0.074021,0.070846
cg00005847,0.271481,0.121249,0.166173,0.154110,0.135165,0.119422,0.131562,0.134494,0.209015,0.142466,...,0.204003,0.158159,0.135777,0.168138,0.216327,0.211452,0.167889,0.189549,0.204359,0.149633
cg00006414,0.046894,0.102483,0.078157,0.077850,0.119930,0.094493,0.094629,0.058979,0.097523,0.077423,...,0.098393,0.093267,0.087434,0.120243,0.078206,0.113893,0.078242,0.098270,0.090512,0.113279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg27657283,0.038590,0.043396,0.057902,0.055350,0.052622,0.053440,0.053204,0.058580,0.086130,0.039717,...,0.092522,0.076717,0.081453,0.064019,0.075889,0.088107,0.104608,0.081338,0.134938,0.086519
cg27661264,0.312317,0.284173,0.374900,0.368735,0.271154,0.423313,0.384813,0.257877,0.313200,0.330983,...,0.342140,0.397453,0.349603,0.295005,0.405466,0.381595,0.440640,0.297063,0.410779,0.372395
cg27662379,0.028933,0.022489,0.030850,0.021640,0.028420,0.027967,0.032077,0.027049,0.025549,0.029613,...,0.033037,0.026356,0.027554,0.026756,0.032688,0.026404,0.027877,0.028456,0.027097,0.025086
cg27662877,0.063923,0.041120,0.034233,0.025494,0.033892,0.038796,0.036746,0.040125,0.046239,0.039154,...,0.044613,0.045068,0.045846,0.045139,0.050488,0.057111,0.046197,0.045450,0.034840,0.052291


In [10]:
def series(GSE):
    """
    Fetch the series matrix of one GSE and return it as a pandas.DataFrame.

    Calls download() with file_type "series_matrix", prints a progress message,
    calls extract(), then parses "<__base_path>/<GSE>/<GSE>_series_matrix.txt"
    with __matrix_to_df().
    """

    download(GSE, file_type= "series_matrix")
    print("downloaded")
    extract()

    matrix_name = GSE + "_series_matrix.txt"
    return __matrix_to_df(os.path.join(__base_path, GSE, matrix_name))
    
    
# download and extract the series_matrix file using the download() and extract() functions
# convert the extracted file to a dataframe

In [11]:
def __matrix_to_df(file_path):
    """
    Return the pandas.DataFrame corresponding to a series_matrix file.

    Parameters
    ----------
    file_path : str
        Path of the extracted "*_series_matrix.txt" file.

    Returns
    -------
    pandas.DataFrame
        The table found between the "!series_matrix_table_begin" and
        "!series_matrix_table_end" lines, indexed by its "ID_REF" column.
    """

    start_row, num_rows = __matrix_helper(file_path)

    # start_row is a physical line number (blank lines included). read_csv's
    # `header` counts rows while ignoring blank lines, whereas `skiprows` counts
    # physical lines, so skip everything up to and including the begin marker and
    # take the next line as the header. This stays correct however many blank
    # lines precede the table.
    df = pd.read_csv(file_path, skiprows= start_row + 1, header= 0, sep= "\t", low_memory= False, nrows= num_rows)

    return df.set_index("ID_REF")

In [12]:
def __matrix_helper(file_path):
    """
    Locate the data table inside a series_matrix file.

    Parameters
    ----------
    file_path : str
        Path of the series_matrix text file.

    Returns
    -------
    tuple
        (start_row, num_rows): start_row is the 0-based line number of the
        "!series_matrix_table_begin" line; num_rows is the number of lines between
        the table's header line (the line right after the begin marker) and the
        "!series_matrix_table_end" line, i.e. the number of data rows.

    Raises
    ------
    ValueError
        If either marker line is missing from the file.
    """

    start_row = None
    end_row = None

    # The context manager guarantees the file is closed; the whole file is scanned
    # so that, should a marker occur more than once, the last occurrence wins.
    with open(file_path) as file:
        for line_num, line in enumerate(file):
            # Strip the line terminator so a final marker line without a trailing
            # newline is still recognised.
            marker = line.rstrip("\r\n")
            if marker == "!series_matrix_table_begin":
                start_row = line_num
            elif marker == "!series_matrix_table_end":
                end_row = line_num - 1

    if start_row is None or end_row is None:
        raise ValueError("Could not find the series matrix table markers in " + str(file_path))

    # end_row is the last data line; subtract the begin-marker offset and the header line
    num_rows = end_row - start_row - 1

    return start_row, num_rows

In [13]:
def __xml_path(GSE_family):
    """
    Return the path of a family's "_family.xml" file.

    The path is __base_path joined with "<GSE_family>/<GSE_family>_family.xml".
    The file is expected to have been put there by download() followed by extract().
    """

    relative_xml = GSE_family + "/" + GSE_family + "_family.xml"
    return os.path.join(__base_path, relative_xml)
    

In [14]:
def __xml_to_dict(xml_path):
    """
    Parse the .xml file at xml_path with xmltodict and return the result.

    Parameters
    ----------
    xml_path : str
        Path of the XML file to parse.

    Returns
    -------
    The object returned by xmltodict.parse() for the file's contents.
    """

    # Read-only binary mode is enough for parsing and also works on read-only
    # files; the context manager closes the file even if parsing raises.
    with open(xml_path, 'rb') as family_file:
        return xmltodict.parse(family_file)


In [15]:
def __dict_index(GSE_family, sample_id):
    """
    Look up the index assigned to one sample by __sample_indices().

    Raises KeyError when sample_id is not among the family's samples.
    """

    indices_by_sample = __sample_indices(GSE_family)
    return indices_by_sample[sample_id]


In [16]:
def __sample_indices(GSE_family):
    """
    Map each sample ID of a family to a running index.

    The family's directory (see __family_path()) is listed; every entry whose name
    starts with "GSM" followed by digits contributes its sample ID ("GSM<digits>").
    IDs are numbered 0, 1, 2, ... in the order the entries are listed. If the same
    ID occurs more than once, the index of its last occurrence is kept.

    Returns
    -------
    dict
        Keys are sample IDs, values are their indices.

    NOTE(review): the indices are meant to locate each sample's information inside
    the family dictionary, but os.listdir() returns entries in arbitrary order --
    confirm that this order really matches the family dictionary before relying on it.
    """

    sample_ids = []
    for file_name in os.listdir(__family_path(GSE_family)):
        # Names that start with "GSM" but carry no digits are skipped rather than
        # crashing on a failed match.
        id_match = re.match(r"GSM\d+", file_name)
        if id_match:
            sample_ids.append(id_match.group(0))

    return {sample_id: index for index, sample_id in enumerate(sample_ids)}


In [22]:
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSEnnn/" + "GSE41037" + "/miniml/" + "GSE41027" + "_family.xml.tgz"
# url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSEnnn/GSE1/miniml/GSE1_family.xml.tgz"

def download(GSE_family_list, file_type= "miniml"):
    """
    Download GSE family files from the NCBI FTP server into the "./data/" directory.

    Parameters
    ----------
    GSE_family_list : str or iterable of str
        One family ID (e.g. "GSE41037") or several of them.
    file_type : str
        "miniml" downloads "<GSE>_family.xml.tgz"; any other value (documented
        value: "series_matrix") downloads "<GSE>_series_matrix.txt.gz".

    Notes
    -----
    A family is skipped when its compressed file is already in the data directory
    or when its extracted file ("<GSE>/<GSE>_family.xml" for miniml,
    "<GSE>/<GSE>_series_matrix.txt" for the series matrix) already exists.
    The FTP server is contacted only when at least one file is actually missing.
    If a transfer fails, the partially written file is removed so that a later
    call retries it instead of treating it as already downloaded.
    """

    if isinstance(GSE_family_list, str):
        GSE_family_list = [GSE_family_list]

    os.makedirs(__base_path, exist_ok=True)

    # First work out what is missing, so nothing touches the network needlessly.
    pending = []
    for GSE_family in GSE_family_list:
        series_dir = "/geo/series/" + __sub_directory(GSE_family) + "/" + GSE_family

        if file_type == "miniml":
            remote_dir = series_dir + "/miniml/"
            filename = GSE_family + "_family.xml.tgz"
            # Check for the extracted XML itself: the family directory alone may
            # exist merely because a series matrix was extracted into it.
            extracted_path = os.path.join(__base_path, GSE_family, GSE_family + "_family.xml")
        else:
            remote_dir = series_dir + "/matrix/"
            filename = GSE_family + "_series_matrix.txt.gz"
            extracted_path = os.path.join(__base_path, GSE_family, GSE_family + "_series_matrix.txt")

        local_path = os.path.join(__base_path, filename)
        if not (os.path.exists(local_path) or os.path.exists(extracted_path)):
            pending.append((remote_dir, filename, local_path))

    if not pending:
        return

    # The context manager sends QUIT (or closes the socket) even when a transfer fails.
    with ftplib.FTP("ftp.ncbi.nlm.nih.gov") as ftp:
        ftp.login()

        for remote_dir, filename, local_path in pending:
            ftp.cwd(remote_dir)
            try:
                with open(local_path, 'wb') as local_file:
                    ftp.retrbinary('RETR ' + filename, local_file.write, blocksize= 16_384)
            except BaseException:
                # Also covers an interrupted kernel: never leave a truncated download behind.
                if os.path.exists(local_path):
                    os.remove(local_path)
                raise
    

In [18]:
def __sub_directory(GSE_ID):
    """
    Given a GSE ID will return the corresponding sub-directory on the GEO FTP site.

    The sub-directory name is the accession with its last three digits replaced by
    "nnn"; accessions with fewer than four digits all live in "GSEnnn".

    Examples:

    "GSE1"      -> "GSEnnn"
    "GSE1234"   -> "GSE1nnn"
    "GSE41037"  -> "GSE41nnn"
    "GSE172000" -> "GSE172nnn"
    """

    # Going through the integer drops any leading zeros before slicing.
    gse_digits = str(__ID_to_int(GSE_ID))

    # Everything except the last three digits; empty for IDs below 1000.
    return "GSE" + gse_digits[:-3] + "nnn"


In [19]:
def __ID_to_int(GSE_ID):
    """
    Given some GSE ID will return the corresponding integer as an int.

    The first run of digits found in GSE_ID is used.

    Example:

    __ID_to_int("GSE41037") will return 41037.

    Raises
    ------
    ValueError
        If GSE_ID contains no digits.
    """

    match = re.search(r"\d+", GSE_ID)

    if match is None:
        # Fail with the offending ID in the message instead of an AttributeError on None.
        raise ValueError("No digits found in GSE ID: " + repr(GSE_ID))

    return int(match.group(0))


In [50]:
def extract():
    """
    Extract every downloaded GSE archive found in the data directory.

    "*.tgz" archives are unpacked into "./data/GSE***/". "*.gz" files are
    decompressed to "./data/GSE***/GSE***_series_matrix.txt". In both cases GSE***
    is the first "GSE<digits>" found in the file name; files without one are ignored.

    A compressed file is deleted only after it was extracted successfully. When a
    .tgz archive cannot be extracted, a message is printed, the archive is left in
    place and processing continues with the next file.
    """

    tgz_list = []
    gz_list = []

    for filename in os.listdir(__base_path):
        if re.search(r"GSE\d+", filename) is None:
            # not one of our downloads; there is no family directory to extract into
            continue

        if filename.endswith(".tgz"):
            tgz_list.append(filename)
        elif filename.endswith(".gz"):
            gz_list.append(filename)

    # The archives come from an unauthenticated FTP download, so where this Python
    # version offers tarfile extraction filters, use the "data" filter to refuse
    # members that would be written outside the destination directory.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    for filename in tgz_list:
        full_path = os.path.join(__base_path, filename)
        family_id = re.search(r"GSE\d+", filename).group(0)
        out_path = os.path.join(__base_path, family_id)

        try:
            with tarfile.open(full_path) as archive:
                archive.extractall(out_path, **extract_kwargs)
        except Exception as error:
            # Best effort: report the problem, keep the archive for inspection and move on.
            print("Something weird happened while extracting from the " + family_id + " compressed file. Ended extraction early for " + family_id + ".")
            print("Reason: " + repr(error))
            continue

        os.remove(full_path)

    for filename in gz_list:
        full_path = os.path.join(__base_path, filename)
        family_id = re.search(r"GSE\d+", filename).group(0)

        family_dir = os.path.join(__base_path, family_id)
        os.makedirs(family_dir, exist_ok=True)

        out_path = os.path.join(family_dir, family_id + "_series_matrix.txt")

        # Stream the decompressed bytes to a temporary file and rename it at the
        # end: large matrices never have to fit in memory, and a failed
        # decompression cannot leave a truncated file that looks already extracted.
        tmp_path = out_path + ".part"
        try:
            with gzip.open(full_path, 'rb') as compressed_file, open(tmp_path, 'wb') as out_file:
                shutil.copyfileobj(compressed_file, out_file)
            os.replace(tmp_path, out_path)
        except BaseException:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        os.remove(full_path)