In [2]:
# TODO: convert all data files to either .soft or .csv

from GEOparse import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings

In [3]:
# Load the table of genotyped strains (tab-separated file)
strains_genotyped = pd.read_table("./data/jbloom_strains_genotyped.tab")
# Locate the column "17_1_a"; per the original note, metadata columns
# start after it and are discarded further down
last_segregant_col = list(strains_genotyped.columns).index("17_1_a")


# Sort key for strain names
def strain_to_list(item):
    """Convert a strain name such as ``"17_1_a"`` into a sortable key.

    The name is split on underscores; the first two fields are parsed as
    integers and the third (a single character) is mapped to its code
    point, so that names sort numerically rather than lexicographically.

    Parameters
    ----------
    item : str
        Strain name of the form ``"<int>_<int>_<char>"``.

    Returns
    -------
    list of int
        ``[first_number, second_number, ord(letter)]``.
    """
    fields = item.split('_')
    converters = (int, int, ord)
    return [convert(fields[pos]) for pos, convert in enumerate(converters)]

# Strain columns run from position 1 up to and including the last
# segregant column; order them with the numeric key defined above
strain_names_full = list(strains_genotyped.columns[1:last_segregant_col + 1])
strain_names_full.sort(key=strain_to_list)

In [4]:
# %%javascript
# # suppressing annoying "Unknown subset type" warnings
# # (btw, where do they come from?)
# {$("div.output_stderr").toggle();}

# Parse each GEO dataset from its local SOFT file, then remove the sample
# column belonging to strain 23_2_d: no genotype data was provided for
# that strain, so its expression values cannot be used.
GDS1115 = GEOparse.parse_GDS(filepath="./data/GDS1115.soft")
GDS1115.table.drop(labels="GSM62170", axis=1, inplace=True)

GDS1116 = GEOparse.parse_GDS(filepath="./data/GDS1116.soft")
GDS1116.table.drop(labels="GSM62171", axis=1, inplace=True)

 - DATABASE : Geo
 - DATASET : GDS1115
 - SUBSET : GDS1115_1
 - SUBSET : GDS1115_2
 - SUBSET : GDS1115_3
 - DATASET : GDS1115
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_2
Unknown subset type: strain for subset GDS1115_1
Unknown subset type: strain for subset GDS1115_1
Unknown subset type: strain for subset GDS1115_1
Unknown subset type: strain for subset GDS1115_1
Unknown subset type: strain for subset GDS1115_1
Unknown subset type: strain for subset GD

 - DATABASE : Geo
 - DATASET : GDS1116
 - SUBSET : GDS1116_1
 - SUBSET : GDS1116_2
 - SUBSET : GDS1116_3
 - DATASET : GDS1116
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_1
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GDS1116_3
Unknown subset type: strain for subset GD

In [5]:
# Collect the GEO sample (GSM) column names used in the brem2005 paper.
# Both GDS1115 and GDS1116 are needed; they are equally important.
# The first two table columns hold the "ID_REF" and "IDENTIFIER"
# annotations, so the sample columns start at position 2 (the previous
# slice `[:2]` returned the annotation columns instead of the samples).
GDS1115_GSMs = GDS1115.table.columns.tolist()[2:]
GDS1116_GSMs = GDS1116.table.columns.tolist()[2:]

In [6]:
# Show the number of genotyped strains followed by their sorted names
print("{}: {}".format(len(strain_names_full), strain_names_full))

In [7]:
# Read the strain ids used in the paper
strain_data_brem = pd.read_csv('./data/brem2005_strains.csv')
strain_names_brem = strain_data_brem['Strain name'].tolist()
# Extract expression data from the two microarray datasets into float
# matrices. The first two columns (annotations) are skipped and every
# value that cannot be parsed as a number becomes NaN.
# pd.to_numeric replaces DataFrame.convert_objects, which is deprecated
# and no longer exists in current pandas releases.
GDS1115_m = (
    GDS1115.table.iloc[:, 2:]
    .apply(pd.to_numeric, errors='coerce')
    .values.astype(float)
)
GDS1116_m = (
    GDS1116.table.iloc[:, 2:]
    .apply(pd.to_numeric, errors='coerce')
    .values.astype(float)
)
# The element-wise combination below pairs values by position, so the
# two matrices must have identical shapes.
# NOTE(review): rows and columns of both datasets are assumed to be in
# the same order -- confirm against the SOFT files.
if GDS1115_m.shape != GDS1116_m.shape:
    raise ValueError(
        "GDS1115 and GDS1116 expression matrices differ in shape: "
        "{} vs {}".format(GDS1115_m.shape, GDS1116_m.shape)
    )
# Extract averaged expression data:
# -- If both values are available, take their mean
# -- If only one value is available, take that value
# -- Otherwise, the value stays numpy.nan
# (when GDS1115 is NaN the GDS1116 value is used as-is, which is also
# NaN if both are missing)
expr_avg_m = np.where(
    np.isnan(GDS1115_m),
    GDS1116_m,
    np.where(
        np.isnan(GDS1116_m),
        GDS1115_m,
        (GDS1115_m + GDS1116_m) / 2.
    )
)
# Construct resulting dataframe with expression data,
# averaged and cleaned up. Firstly, replace irrelevant
# GSMs with strain names.
# NOTE(review): this assumes the strain names in the CSV are listed in
# the same order as the GSM columns -- verify.
expr_df = pd.DataFrame(
    data=expr_avg_m,
    columns=strain_names_brem
)

  


  import sys


In [8]:
# Prepend the "ID_REF" and "IDENTIFIER" annotation columns, taken from
# the GDS1115 table, so the new dataframe is laid out like the old ones
expr_df.insert(loc=0, column="ID_REF", value=GDS1115.table["ID_REF"])
expr_df.insert(loc=1, column="IDENTIFIER", value=GDS1115.table["IDENTIFIER"])
# Drop the rows whose identifier is 'blank' or 'empty'
expr_df = expr_df[~expr_df.IDENTIFIER.isin(['blank', 'empty'])]
# And save everything to file (tab-separated, despite the .csv extension)
expr_df.to_csv('./data/brem2005_RNA_expression.csv', sep='\t')