# SGPS BioStatistics
## Exercise 2: Getting publicly available data

#### Gene Expression data from Gene Expression Omnibus
+ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE123336

#### Specifically from this paper
1.	LaRocca D, Barns S, Hicks SD, Brindle A et al. Comparison of serum and saliva miRNAs for identification and characterization of mTBI in adult mixed martial arts fighters. *PLoS One* 2019;14(1):e0207785.


In [1]:
# Import the packages we need
import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

In [2]:
# Get the data in
# Read the gzipped count matrix (first column becomes the row index), then force
# every column to a numeric dtype; entries that cannot be parsed become NaN.
human_expression = (
    pd.read_csv('GSE123336_MMA_CountMatrix.csv.gz', index_col=0, compression='gzip')
    .apply(pd.to_numeric, errors='coerce')
)

In [3]:
# Get the data straight (off the omnibus)
# Fetch the GSE123336 series record into the current directory and parse it
# (per the log below, an existing local copy is reused instead of re-downloading).
gse = GEOparse.get_GEO(
    geo="GSE123336",
    destdir=".",
)

25-Aug-2025 11:36:58 DEBUG utils - Directory . already exists. Skipping.
25-Aug-2025 11:36:58 INFO GEOparse - File already exist: using local version.
25-Aug-2025 11:36:58 INFO GEOparse - Parsing ./GSE123336_family.soft.gz: 
25-Aug-2025 11:36:58 DEBUG GEOparse - DATABASE: GeoMiame
25-Aug-2025 11:36:58 DEBUG GEOparse - SERIES: GSE123336
25-Aug-2025 11:36:58 DEBUG GEOparse - PLATFORM: GPL18573
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500956
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500957
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500958
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500959
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500960
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500961
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500962
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500963
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500964
25-Aug-2025 11:36:58 DEBUG GEOparse - SAMPLE: GSM3500965
25-Aug-2025 11:36:58 DEBUG GEOparse

In [4]:
# Inspect the parsed samples: a dict keyed by GSM accession, one entry per sample
gse.gsms

{'GSM3500956': <SAMPLE: GSM3500956>,
 'GSM3500957': <SAMPLE: GSM3500957>,
 'GSM3500958': <SAMPLE: GSM3500958>,
 'GSM3500959': <SAMPLE: GSM3500959>,
 'GSM3500960': <SAMPLE: GSM3500960>,
 'GSM3500961': <SAMPLE: GSM3500961>,
 'GSM3500962': <SAMPLE: GSM3500962>,
 'GSM3500963': <SAMPLE: GSM3500963>,
 'GSM3500964': <SAMPLE: GSM3500964>,
 'GSM3500965': <SAMPLE: GSM3500965>,
 'GSM3500966': <SAMPLE: GSM3500966>,
 'GSM3500967': <SAMPLE: GSM3500967>,
 'GSM3500968': <SAMPLE: GSM3500968>,
 'GSM3500969': <SAMPLE: GSM3500969>,
 'GSM3500970': <SAMPLE: GSM3500970>,
 'GSM3500971': <SAMPLE: GSM3500971>,
 'GSM3500972': <SAMPLE: GSM3500972>,
 'GSM3500973': <SAMPLE: GSM3500973>,
 'GSM3500974': <SAMPLE: GSM3500974>,
 'GSM3500975': <SAMPLE: GSM3500975>,
 'GSM3500976': <SAMPLE: GSM3500976>,
 'GSM3500977': <SAMPLE: GSM3500977>,
 'GSM3500978': <SAMPLE: GSM3500978>,
 'GSM3500979': <SAMPLE: GSM3500979>,
 'GSM3500980': <SAMPLE: GSM3500980>,
 'GSM3500981': <SAMPLE: GSM3500981>,
 'GSM3500982': <SAMPLE: GSM3500982>,
 

In [5]:
# Look at one sample's metadata to see which fields are available; note that
# 'description' and 'characteristics_ch1' are the fields used to build the table below
gse.gsms['GSM3500956'].metadata

{'title': ['MMA001-2533'],
 'geo_accession': ['GSM3500956'],
 'status': ['Public on Dec 05 2018'],
 'submission_date': ['Dec 04 2018'],
 'last_update_date': ['Dec 05 2018'],
 'type': ['SRA'],
 'channel_count': ['1'],
 'source_name_ch1': ['Saliva'],
 'organism_ch1': ['Homo sapiens'],
 'taxid_ch1': ['9606'],
 'characteristics_ch1': ['tissue: Saliva',
  'timepoint: 0d post',
  'hits to the head: 2',
  'subject: MMA001'],
 'molecule_ch1': ['total RNA'],
 'extract_protocol_ch1': ['Saliva was collected in an RNA stabilizing container (Oragene RNA RE-100, DNA Genotek) and RNA was harvested using Qiazol reagent followed by RNeasy (Qiagen).',
  'llumina TruSeq RNA Sample Prep Kit (Cat#FC-122-1001) was used with 250-500 nanograms of total RNA for the construction of sequencing libraries.'],
 'description': ['Sample 1'],
 'data_processing': ['Raw sequencing intensities were obtained on a NextSeq500 instrument and converted to base calls using bcl2fastq software (Illumina)',
  'TruSeq Small RNA ad

In [6]:
# And collect the metadata
# Build one record per sample, keyed by the GEO 'description' field (e.g. 'Sample 1'),
# with one entry per 'name: value' string found in 'characteristics_ch1'.
meta = {}
for gsm in gse.gsms.values():
    sample_meta = gsm.metadata
    samp = sample_meta['description'][0]

    characteristics = {}
    for item in sample_meta['characteristics_ch1']:
        # Split on the FIRST ': ' only, so a value that itself contains ': ' is kept whole.
        # An entry without the separator is stored with an empty value instead of raising.
        field, _, value = item.partition(': ')
        characteristics[field] = value

    # If two samples share a description, the later one replaces the earlier one
    meta[samp] = characteristics

# Rows = samples, columns = characteristic names
metadata = pd.DataFrame(meta).T
metadata.to_csv('GSE123336_metadata.csv')

In [7]:
# Display the assembled sample metadata table (one row per sample description)
metadata

Unnamed: 0,tissue,timepoint,hits to the head,subject
Sample 1,Saliva,0d post,2,MMA001
Sample 2,Serum,0d post,2,MMA001
Sample 3,Saliva,0d pre,0,MMA001
Sample 4,Serum,0d pre,0,MMA001
Sample 5,Saliva,1wk post,2,MMA001
...,...,...,...,...
Sample 214,Serum,0d pre,0,MMA040
Sample 215,Serum,0d post,35,MMA041
Sample 216,Serum,0d pre,0,MMA041
Sample 217,Serum,0d post,7,MMA042


In [8]:
# Can we see which miRNA are differentially expressed in Saliva pre and post fight?
# Keep only saliva samples taken on fight day, just before ('0d pre') or after ('0d post')
is_saliva = metadata['tissue'] == 'Saliva'
is_fight_day = metadata['timepoint'].isin(['0d pre', '0d post'])
saliva_samples = metadata[is_saliva & is_fight_day]

# The metadata index (sample names) is used to select expression-matrix columns
saliva_sample_names = saliva_samples.index

# Subset the expression data to the selected saliva samples
saliva_expression = human_expression[saliva_sample_names]

# Sample names of the two groups that are compared in the next cell
pre_fight_samples = saliva_samples[saliva_samples['timepoint'] == '0d pre'].index
post_fight_samples = saliva_samples[saliva_samples['timepoint'] == '0d post'].index

# Expression columns for each group (rows remain miRNAs)
pre_fight_expression = saliva_expression[pre_fight_samples]
post_fight_expression = saliva_expression[post_fight_samples]

In [9]:
# Test every miRNA for a pre- vs post-fight difference in the saliva samples.
# NOTE(review): the pre/post samples appear to come from the same fighters (see the
# 'subject' column of the metadata), while Mann-Whitney U treats the two groups as
# independent - confirm whether a paired test would be more appropriate.

# Initialize a list to store results
diff_expression_results = []

# One two-sided Mann-Whitney U test per miRNA (row of the expression matrix)
for miRNA in saliva_expression.index:
    pre_values = pre_fight_expression.loc[miRNA]
    post_values = post_fight_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(pre_values, post_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# Convert the results to a DataFrame
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop miRNAs without a p-value before correcting: the matrix was loaded with
# errors='coerce', so NaN inputs are possible, and a NaN p-value would otherwise
# be carried into the multiple-testing correction.
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction; the first two returned items are the
# reject flags and the adjusted p-values.
diff_expression_df['Significant'], diff_expression_df['adjusted_p_value'], _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')

# Sort by raw p-value (smallest first); many adjusted p-values tie, so the raw
# value gives the more informative ordering.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df

Unnamed: 0,miRNA,statistic,p_value,Significant,adjusted_p_value
1287,hsa-miR-486-3p,96.0,0.001339,False,1.0
351,hsa-miR-204-5p,108.0,0.004007,False,1.0
960,hsa-miR-449c-5p,121.5,0.007006,False,1.0
992,hsa-miR-4524b-3p,132.0,0.010144,False,1.0
47,hsa-miR-1185-2-3p,142.0,0.011817,False,1.0
...,...,...,...,...,...
1992,hsa-miR-6805-3p,208.5,1.000000,False,1.0
547,hsa-miR-3155a,209.5,1.000000,False,1.0
322,hsa-miR-196b-3p,209.0,1.000000,False,1.0
325,hsa-miR-197-5p,208.5,1.000000,False,1.0


In [10]:
# Can we see which miRNA are differentially expressed in Serum pre and post fight?
# Keep only serum samples taken on fight day, just before ('0d pre') or after ('0d post')
is_serum = metadata['tissue'] == 'Serum'
is_fight_day = metadata['timepoint'].isin(['0d pre', '0d post'])
serum_samples = metadata[is_serum & is_fight_day]

# The metadata index (sample names) is used to select expression-matrix columns
serum_sample_names = serum_samples.index

# Subset the expression data to the selected serum samples
serum_expression = human_expression[serum_sample_names]

# Sample names of the two groups that are compared in the next cell.
# These names are shared with the saliva cell above, so this cell replaces those values.
pre_fight_samples = serum_samples[serum_samples['timepoint'] == '0d pre'].index
post_fight_samples = serum_samples[serum_samples['timepoint'] == '0d post'].index

# Expression columns for each group (rows remain miRNAs)
pre_fight_expression = serum_expression[pre_fight_samples]
post_fight_expression = serum_expression[post_fight_samples]

In [11]:
# Test every miRNA for a pre- vs post-fight difference in the serum samples.
# NOTE(review): the pre/post samples appear to come from the same fighters (see the
# 'subject' column of the metadata), while Mann-Whitney U treats the two groups as
# independent - confirm whether a paired test would be more appropriate.

# Initialize a list to store results
diff_expression_results = []

# One two-sided Mann-Whitney U test per miRNA (row of the expression matrix)
for miRNA in serum_expression.index:
    pre_values = pre_fight_expression.loc[miRNA]
    post_values = post_fight_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(pre_values, post_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# Convert the results to a DataFrame
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop miRNAs without a p-value before correcting: the matrix was loaded with
# errors='coerce', so NaN inputs are possible, and a NaN p-value would otherwise
# be carried into the multiple-testing correction.
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction; the first two returned items are the
# reject flags and the adjusted p-values.
diff_expression_df['Significant'], diff_expression_df['adjusted_p_value'], _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')

# Sort by raw p-value (smallest first); many adjusted p-values tie, so the raw
# value gives the more informative ordering.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df

Unnamed: 0,miRNA,statistic,p_value,Significant,adjusted_p_value
111,hsa-miR-125b-2-3p,473.5,0.000012,True,0.019786
829,hsa-miR-3934-5p,598.0,0.000017,True,0.019786
383,hsa-miR-214-3p,538.0,0.000032,True,0.024891
37,hsa-miR-10b-5p,520.0,0.000070,True,0.040821
35,hsa-miR-10a-5p,545.0,0.000160,False,0.074919
...,...,...,...,...,...
246,hsa-miR-1537-3p,1012.5,1.000000,False,1.000000
732,hsa-miR-3684,1012.5,1.000000,False,1.000000
1734,hsa-miR-639,1012.5,1.000000,False,1.000000
230,hsa-miR-147a,1012.5,1.000000,False,1.000000


### How about getting rat data for mTBI?
#### Gene Expression data from Gene Expression Omnibus
+ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE159011

#### Specifically from this paper
1.	Das Gupta S, Ciszek R, Heiskanen M, Lapinlampi N et al. Plasma miR-9-3p and miR-136-3p as Potential Novel Diagnostic Biomarkers for Experimental and Human Mild Traumatic Brain Injury. *Int J Mol Sci* 2021 Feb 4;22(4).


In [12]:
# Load the tab-separated raw count matrix; the first column becomes the row index
rat_expression = pd.read_csv(
    "GSE159011_Raw_counts_matrix.txt",
    sep='\t',
    index_col=0,
)

In [13]:
# Display the raw rat count matrix: rows are miRNAs, and each sample id appears
# twice in the columns, once with a '-UMIs' suffix and once with '-READs'
rat_expression

Unnamed: 0_level_0,20657-020-UMIs,20657-002-UMIs,20657-012-UMIs,20657-006-UMIs,20657-007-UMIs,20657-019-UMIs,20657-018-UMIs,20657-008-UMIs,20657-005-UMIs,20657-004-UMIs,...,20657-004-READs,20657-003-READs,20657-011-READs,20657-010-READs,20657-013-READs,20657-001-READs,20657-015-READs,20657-014-READs,20657-017-READs,20657-016-READs
miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rno-let-7a-1-3p/7c-2-3p,160,308,817,358,548,179,793,320,232,311,...,490,799,447,318,413,571,365,394,486,762
rno-let-7a-2-3p,48,55,48,40,55,38,35,56,43,50,...,83,219,83,82,113,120,96,107,105,31
rno-let-7a-5p,36267,38882,58248,45855,61422,41501,73998,41453,42114,46192,...,66583,64615,69738,41452,54041,53300,44454,56369,99764,51578
rno-let-7b-3p,441,650,994,770,772,579,1302,780,553,636,...,819,899,747,558,696,768,600,670,943,630
rno-let-7b-5p,88155,135591,214262,128710,178067,99188,243651,139027,95987,125310,...,181639,186234,163980,111789,143520,183815,114509,129363,213979,146829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rno-miR-99b-5p,1792,2422,5632,3005,3995,1925,5573,3163,2202,2921,...,4660,4829,4179,3320,4608,5087,4296,3322,5110,3231
rno-miR-9a-3p,23,18,33,655,163,303,159,42,647,34,...,53,225,141,76,733,761,66,45,221,20
rno-miR-9a-5p,52,75,56,395,127,387,150,86,421,85,...,136,401,176,178,519,654,216,137,401,75
rno-miR-9b-3p,0,0,4,21,6,19,0,13,24,5,...,5,95,12,4,35,29,0,0,18,5


In [14]:
# Total counts in each column; the totals differ between columns, which is why
# the counts are rescaled in the next cell
rat_expression.sum()

20657-020-UMIs     2894711
20657-002-UMIs     3322547
20657-012-UMIs     5041094
20657-006-UMIs     3584618
20657-007-UMIs     4221068
20657-019-UMIs     3232696
20657-018-UMIs     5866757
20657-008-UMIs     3625424
20657-005-UMIs     3524486
20657-004-UMIs     3757053
20657-003-UMIs     4446067
20657-011-UMIs     3779106
20657-010-UMIs     2886651
20657-013-UMIs     2968683
20657-001-UMIs     2851406
20657-015-UMIs     2409545
20657-014-UMIs     2908509
20657-017-UMIs     3389253
20657-016-UMIs     4090289
20657-020-READs    5005676
20657-002-READs    4828446
20657-012-READs    6826144
20657-006-READs    5844032
20657-007-READs    5002302
20657-019-READs    6482291
20657-018-READs    8876409
20657-008-READs    6066846
20657-005-READs    6257970
20657-004-READs    5729837
20657-003-READs    7700898
20657-011-READs    6504203
20657-010-READs    4803736
20657-013-READs    5576208
20657-001-READs    4764649
20657-015-READs    4871664
20657-014-READs    4898247
20657-017-READs    7588587
2

In [15]:
# Normalization: rescale every column to a common total of one million

# Per-column totals of the raw counts
column_sums = rat_expression.sum()

# Per-column scale factor that brings each total to 1 million
normalization_factor = 1_000_000 / column_sums

# Apply each column's own factor (factors are matched to columns by label)
normalized_df = rat_expression.mul(normalization_factor, axis='columns')

# Every column of `normalized_df` should now sum to 1 million
normalized_df

Unnamed: 0_level_0,20657-020-UMIs,20657-002-UMIs,20657-012-UMIs,20657-006-UMIs,20657-007-UMIs,20657-019-UMIs,20657-018-UMIs,20657-008-UMIs,20657-005-UMIs,20657-004-UMIs,...,20657-004-READs,20657-003-READs,20657-011-READs,20657-010-READs,20657-013-READs,20657-001-READs,20657-015-READs,20657-014-READs,20657-017-READs,20657-016-READs
miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rno-let-7a-1-3p/7c-2-3p,55.273221,92.699968,162.067996,99.871172,129.824964,55.371739,135.168373,88.265538,65.825201,82.777645,...,85.517267,103.754134,68.724792,66.198476,74.064669,119.840937,74.923065,80.436940,64.043543,152.002635
rno-let-7a-2-3p,16.581966,16.553566,9.521743,11.158790,13.029878,11.754894,5.965817,15.446469,12.200361,13.308303,...,14.485578,28.438242,12.760979,17.070047,20.264667,25.185486,19.705793,21.844550,13.836568,6.183834
rno-let-7a-5p,12528.711847,11702.468016,11554.634768,12792.158049,14551.293654,12837.891345,12613.101241,11433.972964,11948.976390,12294.742715,...,11620.400371,8390.579904,10721.990073,8629.117004,9691.352977,11186.553301,9125.013548,11507.994595,13146.584470,10288.703272
rno-let-7b-3p,152.346815,195.633049,197.179422,214.806710,182.892102,179.107469,221.928401,215.147249,156.902311,169.281615,...,142.936003,116.739632,114.848814,116.159589,124.816004,161.187109,123.161203,136.783629,124.265558,125.671470
rno-let-7b-5p,30453.817324,40809.354992,42503.075721,35906.196978,42185.295285,30682.749012,41530.780975,38347.790493,27234.325800,33353.269171,...,31700.552738,24183.413415,25211.390235,23271.262201,25737.920824,38578.917356,23505.110369,26410.060579,28197.476026,29289.232089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rno-miR-99b-5p,619.060072,728.958838,1117.217810,838.304109,946.442938,595.478201,949.928555,872.449678,624.771953,777.471066,...,813.286661,627.069726,642.507622,691.128738,826.368026,1067.654721,881.834215,678.201814,673.379642,644.515109
rno-miR-9a-3p,7.945525,5.417531,6.546198,182.725189,38.615820,93.729816,27.101855,11.584852,183.572867,9.049646,...,9.249827,29.217372,21.678290,15.821019,131.451338,159.717956,13.547732,9.186960,29.122681,3.989570
rno-miR-9a-5p,17.963797,22.573044,11.108700,110.193053,30.087172,119.714319,25.567788,23.721363,119.450042,22.624115,...,23.735405,52.071849,27.059426,37.054493,93.074003,137.260898,44.338033,27.969190,52.842512,14.960889
rno-miR-9b-3p,0.000000,0.000000,0.793479,5.858365,1.421441,5.877447,0.000000,3.585787,6.809504,1.330830,...,0.872625,12.336224,1.844961,0.832685,6.276667,6.086492,0.000000,0.000000,2.371983,0.997393


In [16]:
# We also want to put this on a log2 scale
# (a pseudocount of 1 is added first, so a value of 0 maps to 0 rather than -inf)
log2_df = np.log2(normalized_df.add(1))
log2_df

Unnamed: 0_level_0,20657-020-UMIs,20657-002-UMIs,20657-012-UMIs,20657-006-UMIs,20657-007-UMIs,20657-019-UMIs,20657-018-UMIs,20657-008-UMIs,20657-005-UMIs,20657-004-UMIs,...,20657-004-READs,20657-003-READs,20657-011-READs,20657-010-READs,20657-013-READs,20657-001-READs,20657-015-READs,20657-014-READs,20657-017-READs,20657-016-READs
miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rno-let-7a-1-3p/7c-2-3p,5.814377,6.549977,7.349330,6.656370,7.031494,5.816900,7.089248,6.480031,6.062320,6.388493,...,6.434916,6.710863,6.123600,6.070357,6.230062,6.916965,6.246466,6.347611,6.023334,7.257413
rno-let-7a-2-3p,4.136025,4.133692,3.395302,3.603928,3.810431,3.672979,2.800293,4.039706,3.722505,3.838781,...,3.952853,4.879620,3.782511,4.175528,4.410386,4.710695,4.371963,4.513778,3.891086,2.844754
rno-let-7a-5p,13.613066,13.514648,13.496309,13.643085,13.828959,13.648233,13.622750,13.481165,13.544720,13.585871,...,13.504496,13.034727,13.388420,13.075164,13.242631,13.449607,13.155769,13.490474,13.682510,13.328914
rno-let-7b-3p,7.260654,7.619362,7.630663,7.753596,7.522716,7.492714,7.800437,7.755871,7.302888,7.411779,...,7.169284,6.879456,6.856099,6.872331,6.975172,7.341515,6.956071,7.106261,6.968846,6.984948
rno-let-7b-5p,14.894383,15.316648,15.375314,15.131985,15.364487,14.905187,15.341928,15.226893,14.733192,15.025584,...,14.952266,14.561790,14.621845,14.506324,14.651664,15.235562,14.520748,14.688855,14.783330,14.838132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rno-miR-99b-5p,9.276264,9.511671,10.126986,9.713050,9.887895,9.220326,9.893193,9.770581,9.289493,9.604500,...,9.669393,9.294781,9.329813,9.434897,9.692385,10.061580,9.785999,9.407697,9.397417,9.334307
rno-miR-9a-3p,3.161166,2.682018,2.915750,7.521406,5.308005,6.565747,4.812593,3.653616,7.528047,3.329073,...,3.357528,4.917306,4.503240,4.072193,7.049319,7.328387,3.862722,3.348652,4.912778,2.318916
rno-miR-9a-5p,4.245176,4.559066,3.597972,6.796923,4.958247,6.915453,4.731606,4.627686,6.912291,4.562188,...,4.628506,5.729875,4.810414,5.249995,6.555724,7.111249,5.502650,4.856447,5.750674,3.996469
rno-miR-9b-3p,0.000000,0.000000,0.842760,2.777865,1.275866,2.781873,0.000000,2.197169,2.965231,1.220844,...,0.905062,3.737278,1.508409,0.873959,2.863278,2.825072,0.000000,0.000000,1.753597,0.998118


In [17]:
# Get the data straight (off the omnibus)
# Fetch the GSE159011 series record into the current directory and parse it.
# This rebinds `gse`, which held the GSE123336 series until now.
gse = GEOparse.get_GEO(
    geo="GSE159011",
    destdir=".",
)

25-Aug-2025 11:37:00 DEBUG utils - Directory . already exists. Skipping.
25-Aug-2025 11:37:00 INFO GEOparse - File already exist: using local version.
25-Aug-2025 11:37:00 INFO GEOparse - Parsing ./GSE159011_family.soft.gz: 
25-Aug-2025 11:37:00 DEBUG GEOparse - DATABASE: GeoMiame
25-Aug-2025 11:37:00 DEBUG GEOparse - SERIES: GSE159011
25-Aug-2025 11:37:00 DEBUG GEOparse - PLATFORM: GPL25029
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817312
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817313
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817314
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817315
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817316
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817317
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817318
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817319
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817320
25-Aug-2025 11:37:00 DEBUG GEOparse - SAMPLE: GSM4817321
25-Aug-2025 11:37:00 DEBUG GEOparse

In [18]:
# And collect the metadata
# Build one record per sample, keyed by the GEO 'description' field,
# with one entry per 'name: value' string found in 'characteristics_ch1'.
meta = {}
for gsm in gse.gsms.values():
    sample_meta = gsm.metadata
    samp = sample_meta['description'][0]

    characteristics = {}
    for item in sample_meta['characteristics_ch1']:
        # Split on the FIRST ': ' only, so a value that itself contains ': ' is kept whole.
        # An entry without the separator is stored with an empty value instead of raising.
        field, _, value = item.partition(': ')
        characteristics[field] = value

    # If two samples share a description, the later one replaces the earlier one
    meta[samp] = characteristics

# Rows = samples, columns = characteristic names
rat_metadata = pd.DataFrame(meta).T
rat_metadata.to_csv('GSE159011_metadata.csv')

In [19]:
# Display the rat sample metadata; the 'group' column is what the comparison below filters on
rat_metadata

Unnamed: 0,strain,age,group
20657-001,Sprague-Dawley,Adult,Severe TBI
20657-002,Sprague-Dawley,Adult,Sham
20657-003,Sprague-Dawley,Adult,Mild TBI
20657-004,Sprague-Dawley,Adult,Sham
20657-005,Sprague-Dawley,Adult,Severe TBI
20657-006,Sprague-Dawley,Adult,Severe TBI
20657-007,Sprague-Dawley,Adult,Mild TBI
20657-008,Sprague-Dawley,Adult,Sham
20657-010,Sprague-Dawley,Adult,Sham
20657-011,Sprague-Dawley,Adult,Mild TBI


In [20]:
# Filter metadata for Naive and Severe TBI groups
# Strip the '-UMIs' suffix so the UMI columns carry the bare sample id used as the
# metadata index (e.g. '20657-001'); the '-READs' columns keep their suffix.
log2_df.columns = log2_df.columns.str.replace('-UMIs', '')

# Sample ids (metadata index labels) belonging to each group
in_naive_group = rat_metadata['group'].eq('Naive')
in_severe_group = rat_metadata['group'].eq('Severe TBI')
naive_samples = rat_metadata.loc[in_naive_group].index
severe_tbi_samples = rat_metadata.loc[in_severe_group].index

In [21]:
# Pull the log2 expression columns of each group (all miRNA rows are kept)
naive_expression = log2_df.loc[:, naive_samples]
severe_tbi_expression = log2_df.loc[:, severe_tbi_samples]

In [22]:
# Compare Naive vs Severe TBI rats on the log2 normalized values

# Initialize a list to store results
diff_expression_results = []

# One two-sided Mann-Whitney U test per miRNA (row of the expression matrix)
for miRNA in log2_df.index:
    naive_values = naive_expression.loc[miRNA]
    severe_tbi_values = severe_tbi_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(naive_values, severe_tbi_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# Convert the results to a DataFrame
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop out NaN from the p-values so they are not carried into the correction
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction, run once; the first two returned items are
# the reject flags and the adjusted p-values.
diff_expression_df['Significant'], diff_expression_df['adjusted_p_value'], _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')

# Sort by raw p-value (smallest first); many adjusted p-values tie, so the raw
# value gives the more informative ordering.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df

Unnamed: 0,miRNA,statistic,p_value,adjusted_p_value,Significant
129,rno-miR-15a-3p,0.0,0.015873,0.177209,False
508,rno-miR-381-3p,0.0,0.015873,0.177209,False
510,rno-miR-382-3p,0.0,0.015873,0.177209,False
665,rno-miR-668,0.0,0.015873,0.177209,False
663,rno-miR-667-3p,0.0,0.015873,0.177209,False
...,...,...,...,...,...
194,rno-miR-196b-5p,10.0,1.000000,1.000000,False
621,rno-miR-568,10.0,1.000000,1.000000,False
180,rno-miR-193a-3p,10.0,1.000000,1.000000,False
453,rno-miR-3583-3p,10.0,1.000000,1.000000,False


In [23]:
# NOTE(review): this cell repeats the preceding Naive vs Severe TBI comparison
# verbatim. Presumably it was meant to compare a different pair of groups
# (the metadata also lists 'Sham' and 'Mild TBI') - confirm, or delete the cell.

# Initialize a list to store results
diff_expression_results = []

# One two-sided Mann-Whitney U test per miRNA (row of the expression matrix)
for miRNA in log2_df.index:
    naive_values = naive_expression.loc[miRNA]
    severe_tbi_values = severe_tbi_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(naive_values, severe_tbi_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# Convert the results to a DataFrame
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop out NaN from the p-values so they are not carried into the correction
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction, run once; the first two returned items are
# the reject flags and the adjusted p-values.
diff_expression_df['Significant'], diff_expression_df['adjusted_p_value'], _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')

# Sort by raw p-value (smallest first); many adjusted p-values tie, so the raw
# value gives the more informative ordering.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df

Unnamed: 0,miRNA,statistic,p_value,adjusted_p_value,Significant
129,rno-miR-15a-3p,0.0,0.015873,0.177209,False
508,rno-miR-381-3p,0.0,0.015873,0.177209,False
510,rno-miR-382-3p,0.0,0.015873,0.177209,False
665,rno-miR-668,0.0,0.015873,0.177209,False
663,rno-miR-667-3p,0.0,0.015873,0.177209,False
...,...,...,...,...,...
194,rno-miR-196b-5p,10.0,1.000000,1.000000,False
621,rno-miR-568,10.0,1.000000,1.000000,False
180,rno-miR-193a-3p,10.0,1.000000,1.000000,False
453,rno-miR-3583-3p,10.0,1.000000,1.000000,False
