In [1]:
### 2021-10-14 File Created ###
### Keagan G Moo            ###
### BIOINF 575              ###
### Group Project #3        ###

# This notebook uses the GEOparse module to load data from the GEO accession database. For this test
# we will use GSE10245, a dataset on non-small cell lung cancer (NSCLC) specimens. The output should be
# reformatted for general use if it is not already in such a form.

In [1]:
# installation method lifted from https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
# to ensure that this works on different machines. If you are having an issue, try using conda instead of pip.
# If you still have issues, run "!conda env list" in an empty cell and ensure you are using the right Python

# sys exposes details of the running interpreter, which we need for the install command below
import sys
# sys.executable ensures that you are installing the package into the Python instance being used by Jupyter
# -m ensures that you are using the correct instance of pip to perform the installation
!{sys.executable} -m pip install GEOparse
import GEOparse
import pandas as pd

# slowpoke gates the verbose metadata printout further down; set it to False to skip that output
slowpoke = True



In [2]:
# Ask the GEO API for the default version of the data stored under this accession. For this series that is
# the gzipped SOFT family file, which is the format we want. The file is placed in the current directory.
rawDataGSE10245 = GEOparse.get_GEO(
    geo="GSE10245",
    destdir="./",
)

30-Oct-2021 11:58:22 DEBUG utils - Directory ./ already exists. Skipping.
30-Oct-2021 11:58:22 INFO GEOparse - File already exist: using local version.
30-Oct-2021 11:58:22 INFO GEOparse - Parsing ./GSE10245_family.soft.gz: 
30-Oct-2021 11:58:22 DEBUG GEOparse - DATABASE: GeoMiame
30-Oct-2021 11:58:22 DEBUG GEOparse - SERIES: GSE10245
30-Oct-2021 11:58:22 DEBUG GEOparse - PLATFORM: GPL570
  return parse_GSE(filepath, open_kwargs=open_kwargs)
30-Oct-2021 11:58:23 DEBUG GEOparse - SAMPLE: GSM258551
30-Oct-2021 11:58:23 DEBUG GEOparse - SAMPLE: GSM258552
30-Oct-2021 11:58:23 DEBUG GEOparse - SAMPLE: GSM258553
30-Oct-2021 11:58:23 DEBUG GEOparse - SAMPLE: GSM258554
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM258555
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM258556
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM258557
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM258558
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM258559
30-Oct-2021 11:58:24 DEBUG GEOparse - SAMPLE: GSM2

In [3]:
# The loaded value is a complex bespoke object with some finicky properties, so let's unpack it
if slowpoke:
    # the object's type, followed by the first entry of the series-level summary
    print(type(rawDataGSE10245))
    seriesSummary = rawDataGSE10245.metadata['summary']
    print(seriesSummary[0])

    # every metadata entry of one example sample, printed as a (key, value) tuple
    print("\nSample GSM258551")
    sampleMetadata = rawDataGSE10245.gsms.get("GSM258551").metadata
    for metaEntry in sampleMetadata.items():
        print(metaEntry)

    # the same dump for the platform entry
    print("\nPlatform GPL570")
    platformMetadata = rawDataGSE10245.gpls.get('GPL570').metadata
    for metaEntry in platformMetadata.items():
        print(metaEntry)

# Important details to take away from this printout:
# - 1 Platform, meaning that this was all done on the same machine and is therefore comparable
# - Homo Sapiens as species, informing what references to use for our own analysis
# - Year of original analysis, informing what references to use for comparative analysis

<class 'GEOparse.GEOTypes.GSE'>
Non-small cell lung cancer (NSCLC) can be classified into the major subtypes adenocarcinoma (AC) and squamous cell carcinoma (SCC) subtypes. Although explicit molecular, histological and clinical characteristics have been reported for both subtypes, no specific therapy exists so far. However, the characterization of suitable molecular targets holds great promises to develop novel therapies in NSCLC. In the present study, global gene expression profiling of 58 human high grade NSCLC specimens revealed large transcriptomic differences between AC and SCC subtypes: More than 1.700 genes were found to be differentially expressed.

Sample GSM258551
('title', ['NSCLC_AC_10'])
('geo_accession', ['GSM258551'])
('status', ['Public on Oct 01 2009'])
('submission_date', ['Jan 23 2008'])
('last_update_date', ['Aug 28 2018'])
('type', ['RNA'])
('channel_count', ['1'])
('source_name_ch1', ['human non-small cell lung cancer tumor tissue'])
('organism_ch1', ['Homo sapien

In [5]:
# The values we actually want to analyze, however, are stored separately inside each sample.
# Pull out one example sample and leave its table as the cell's last expression so Jupyter displays it.
exampleSample = rawDataGSE10245.gsms.get('GSM258551')
exampleSample.table

Unnamed: 0,ID_REF,VALUE
0,1007_s_at,9.843349
1,1053_at,7.973332
2,117_at,4.994852
3,121_at,5.197306
4,1255_g_at,2.248520
...,...,...
54670,AFFX-r2-Ec-bioC-5_at,9.719266
54671,AFFX-r2-Ec-bioD-3_at,12.847711
54672,AFFX-r2-Ec-bioD-5_at,12.250033
54673,AFFX-r2-P1-cre-3_at,14.440756


In [57]:
# To get a usable dataframe we join every sample's VALUE column together on its ID_REF row labels.
# Another, possibly faster, method would be to check if all of the rows are the same in each file, but this
# solution is more generalizable in the case where some rows might be filtered out of some samples
dfList = list()
# First we will iterate over all of the samples available in the object
for sampleIndex in rawDataGSE10245.gsms:
    # Get the current sample table from the larger object
    sampleData = rawDataGSE10245.gsms.get(sampleIndex).table
    # get the row names for assignment to the dataframe
    rowNames = sampleData['ID_REF']
    # get the actual data for assignment to the dataframe
    colData = sampleData['VALUE']
    # initialize a temporary dataframe just for the sample and include that sample's data with
    # the sample number as the column head
    sampleDataFrame = pd.DataFrame({sampleIndex:colData})
    # reattach the row names as the index for this temporary dataframe to preserve this sample's
    # reference structure on the off chance they are different between samples
    sampleDataFrame.index = rowNames
    # add this temporary dataframe to the list of all samples' temporary dataframes
    dfList.append(sampleDataFrame)
# Fail with a clear message instead of an opaque IndexError on dfList[0] when there is nothing to join
if not dfList:
    raise ValueError("rawDataGSE10245 contains no samples to join")
# Join all temporary dataframes together by index in one step, starting with the first and then
# joining all others in the list. how="outer" keeps the union of row labels across samples; the default
# left join would silently drop any row that is missing from the first sample, which defeats the purpose
# of joining by index. Rows absent from a given sample show up as NaN in that sample's column.
rawDataFrameGSE10245 = dfList[0].join(dfList[1:], how="outer")

In [59]:
# This completes step 1 of Group Project 3 'Load the Data'
# Leaving the frame as the cell's last expression lets Jupyter render it as a rich table (the same way the
# single-sample table is displayed earlier) instead of the line-wrapped plain text that print() produces
rawDataFrameGSE10245

                      GSM258551  GSM258552  GSM258553  GSM258554  GSM258555  \
ID_REF                                                                        
1007_s_at              9.129905   9.843349   9.730661   9.032165  10.281793   
1053_at                8.034022   7.973332   8.834045   7.723965   9.040800   
117_at                 3.564520   4.994852   5.066018   4.958580   4.951835   
121_at                 4.746490   5.197306   5.234618   6.078180   5.205632   
1255_g_at              2.320698   2.248520   2.259504   2.262787   2.207531   
...                         ...        ...        ...        ...        ...   
AFFX-r2-Ec-bioC-5_at  10.730983   9.719266   9.101115   9.763076   9.651785   
AFFX-r2-Ec-bioD-3_at  13.599488  12.847711  12.384142  12.969199  12.924465   
AFFX-r2-Ec-bioD-5_at  13.031726  12.250033  11.798363  12.307684  12.243207   
AFFX-r2-P1-cre-3_at   15.028729  14.440756  14.439887  14.557363  14.612223   
AFFX-r2-P1-cre-5_at   14.586347  14.072366  14.01139