In [39]:
import contextlib
import os
import sys

from GEOparse import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [40]:
# Load the table of genotyped strains
strains_genotyped = pd.read_table("./data/jbloom_strains_genotyped.tab")
# "17_1_a" is the last segregant column in the file; everything after it
# is metadata, so remember its position to cut the metadata off later
last_segregant_col = list(strains_genotyped.columns).index("17_1_a")


# sort the strain names
def strain_to_list(item):
    """Build a sort key from a strain name such as ``'17_1_a'``.

    The name is split on ``'_'``; the first two fields are converted to
    integers and the third (a single character) to its code point, so that
    names sort numerically instead of lexicographically.

    Returns a three-element list ``[int, int, int]``.
    """
    fields = item.split('_')
    return [int(token) for token in fields[:2]] + [ord(fields[2])]

# Take the segregant columns (column 0 is skipped, metadata columns after
# the last segregant are cut off) and order them by their numeric key
strain_names_full = strains_genotyped.columns.tolist()[1:last_segregant_col + 1]
strain_names_full.sort(key=strain_to_list)

# Show how many strains were genotyped, followed by their names
print("{}: {}".format(len(strain_names_full), strain_names_full))

124: ['1_1_d', '1_2_d', '1_3_d', '1_4_d', '1_5_c', '2_1_d', '2_2_d', '2_3_d', '2_4_a', '2_5_d', '2_6_d', '2_7_a', '2_7_b', '2_7_c', '2_7_d', '3_1_d', '3_2_d', '3_3_d', '3_4_d', '3_5_d', '4_1_c', '4_2_a', '4_3_d', '4_4_d', '5_1_d', '5_2_d', '5_3_d', '5_4_d', '5_5_d', '6_1_d', '6_2_b', '6_3_c', '6_4_d', '6_5_d', '6_6_d', '6_7_d', '7_1_d', '7_2_c', '7_3_d', '7_4_c', '7_5_d', '7_6_c', '7_7_c', '7_8_d', '8_1_a', '8_2_d', '8_3_a', '8_4_c', '8_5_b', '8_6_c', '8_7_b', '9_1_d', '9_2_d', '9_3_d', '9_4_d', '9_5_d', '9_6_d', '9_7_d', '10_1_c', '10_2_d', '10_3_c', '10_4_d', '11_1_a', '11_2_d', '11_3_b', '12_1_d', '12_2_b', '13_1_a', '13_2_c', '13_3_b', '13_4_a', '13_5_c', '14_1_b', '14_2_c', '14_3_d', '14_4_a', '14_5_b', '14_6_d', '14_7_c', '15_2_d', '15_3_b', '15_4_d', '15_5_b', '15_6_c', '16_1_d', '17_1_a', '17_2_d', '17_4_a', '17_5_b', '18_1_d', '18_2_d', '18_3_d', '18_4_c', '18_6_d', '19_1_c', '19_2_c', '19_3_c', '19_4_b', '19_5_b', '20_1_d', '20_2_d', '20_3_c', '20_4_c', '20_5_d', '21_1_d', '2

In [41]:
# Parse the two GDS SOFT files holding the experiment data.
# The parser emits many "Unknown subset type" warnings, so everything written
# to sys.stderr is sent to the null device for the duration of the parse.
# The context managers close the null device and restore sys.stderr even if
# parsing raises, so a failure cannot leave the notebook without error output.
# (The previous version reassigned sys.__stderr__, which is only the saved
# original stream and is not the one Python code writes to.)
# NOTE(review): this silences only writers that look up sys.stderr at call
# time -- confirm that covers the GEOparse version in use.
with open(os.devnull, 'w') as null_device, \
        contextlib.redirect_stderr(null_device):
    GDS1115 = GEOparse.parse_GDS(
        filepath="./data/GDS1115.soft"
    )
    GDS1116 = GEOparse.parse_GDS(
        filepath="./data/GDS1116.soft"
    )

# No genotype data was provided for strain 23_2_d,
# hence the corresponding sample columns are dropped
GDS1115.table.drop("GSM62170", axis=1, inplace=True)
GDS1116.table.drop("GSM62171", axis=1, inplace=True)

In [42]:
# Collect the GEO sample ids (GSM columns) used in the brem2005 paper,
# from both GDS1115 and GDS1116 -- they are equally important.
# The first two columns of each table are annotations, not samples, so the
# sample columns start at position 2 (the previous `[:2]` slice returned the
# annotation columns instead of the GSMs).
GDS1115_GSMs = GDS1115.table.columns.tolist()[2:]
GDS1116_GSMs = GDS1116.table.columns.tolist()[2:]

In [43]:
# Read the strain ids used in the paper
strain_data_brem = pd.read_csv('./data/brem2005_strains.csv')
strain_names_brem = strain_data_brem['Strain name'].tolist()

# Extract expression data from the two microarrays into float matrices.
# The first two columns are skipped (annotation columns, presumably ID_REF
# and IDENTIFIER -- verify against the parsed tables).
# pd.to_numeric(errors='coerce') turns every unparseable value into NaN; it
# replaces DataFrame.convert_objects, which no longer exists in pandas.
GDS1115_m = GDS1115.table.iloc[:, 2:]\
                   .apply(pd.to_numeric, errors='coerce')\
                   .values.astype(float)
GDS1116_m = GDS1116.table.iloc[:, 2:]\
                   .apply(pd.to_numeric, errors='coerce')\
                   .values.astype(float)

# The element-wise combination below is only meaningful when both matrices
# describe the same probes and samples.
# NOTE(review): rows are matched by position only -- confirm both tables
# list their probes in the same order.
if GDS1115_m.shape != GDS1116_m.shape:
    raise ValueError(
        "GDS1115 and GDS1116 expression matrices differ in shape: "
        "{} vs {}".format(GDS1115_m.shape, GDS1116_m.shape)
    )

# Extract averaged expression data:
# -- If both values are available, take their mean
# -- If only one value is available, take that value
# -- Otherwise, set expression value to numpy.nan
# (when GDS1115 is missing, the GDS1116 value is used as-is, which is NaN
# exactly when both are missing)
expr_avg_m = np.where(
    np.isnan(GDS1115_m),
    GDS1116_m,
    np.where(
        np.isnan(GDS1116_m),
        GDS1115_m,
        (GDS1115_m + GDS1116_m) / 2.
    )
)

# Construct resulting dataframe with expression data,
# averaged and cleaned up. Firstly, replace irrelevant
# GSMs with strain names
expr_df = pd.DataFrame(
    data=expr_avg_m,
    columns=strain_names_brem
)

In [44]:
# Prepend the "ID_REF" and "IDENTIFIER" annotation columns, taken from
# GDS1115, so the new dataframe is laid out like the original tables.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run on an already annotated frame.
for position, column in enumerate(("ID_REF", "IDENTIFIER")):
    if column not in expr_df.columns:
        expr_df.insert(loc=position, column=column,
                       value=GDS1115.table[column])

# Drop the empty rows (identifiers 'blank' and 'empty')
expr_df = expr_df[~expr_df["IDENTIFIER"].isin(['blank', 'empty'])]

# And save everything to file
# (tab-separated, although the file name ends in .csv)
expr_df.to_csv('./data/brem2005_RNA_expression.csv', sep='\t')