# SGPS BioStatistics
## Exercise 2: Getting publicly available data

#### Gene Expression data from Gene Expression Omnibus
+ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE123336

#### Specifically from this paper
1.	LaRocca D, Barns S, Hicks SD, Brindle A et al. Comparison of serum and saliva miRNAs for identification and characterization of mTBI in adult mixed martial arts fighters. *PLoS One* 2019;14(1):e0207785.


In [None]:
# Import the packages we need
import pandas as pd
import numpy as np
# import GEOparse

In [None]:
# Load the gzipped count matrix; the first CSV column becomes the row index.
# Every column is then passed through pd.to_numeric, and any entry that
# cannot be parsed as a number is coerced to NaN instead of raising.
human_expression = pd.read_csv(
    'GSE123336_MMA_CountMatrix.csv.gz',
    compression='gzip',
    index_col=0,
).apply(pd.to_numeric, errors='coerce')

In [None]:
# Get the data straight (off the omnibus)
# gse = GEOparse.get_GEO(geo="GSE123336", destdir=".")

In [None]:
# gse.gsms

In [None]:
# gse.gsms['GSM3500956'].metadata

In [None]:
# # And collect the metadata
# meta = {}
# for key in gse.gsms:
#     # print(key)
#     samp = gse.gsms[key].metadata['description'][0]
#     characteristics = {}
#     for item in gse.gsms[key].metadata['characteristics_ch1']:
#         temp = item.split(': ')
#         characteristics[temp[0]] = temp[1]

#     if samp not in meta:
#         meta[samp] = {}
#         meta[samp] = characteristics
#     else:
#         meta[samp] = characteristics

# metadata = pd.DataFrame(meta).T
# metadata.to_csv('GSE123336_metadata.csv')

In [None]:
# Load the sample metadata table; the first CSV column is used as the row index.
# Later cells use that index as sample names and read the 'tissue' and
# 'timepoint' columns.
metadata = pd.read_csv('GSE123336_metadata.csv', index_col=0)
# Last expression in the cell, so the notebook renders the table.
metadata

In [None]:
# Question: which miRNAs differ in saliva between the pre-fight and
# post-fight samples taken on the day of the fight?

# Row masks over the metadata: saliva tissue, and one of the two timepoints.
is_saliva = metadata['tissue'] == 'Saliva'
is_fight_day = metadata['timepoint'].isin(['0d pre', '0d post'])
saliva_samples = metadata.loc[is_saliva & is_fight_day]

# The metadata index holds the sample names, which are also the column
# labels of the expression matrix.
saliva_sample_names = saliva_samples.index
saliva_expression = human_expression.loc[:, saliva_sample_names]

# Split the selected samples by timepoint.
saliva_timepoint = saliva_samples['timepoint']
pre_fight_samples = saliva_samples.loc[saliva_timepoint == '0d pre'].index
post_fight_samples = saliva_samples.loc[saliva_timepoint == '0d post'].index

# One expression sub-matrix per group (rows: miRNAs, columns: samples).
pre_fight_expression = saliva_expression.loc[:, pre_fight_samples]
post_fight_expression = saliva_expression.loc[:, post_fight_samples]

In [None]:
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# Test each miRNA (one row of the saliva matrix) for a difference between
# the pre-fight and post-fight samples with a two-sided Mann-Whitney U test.
diff_expression_results = []

for miRNA in saliva_expression.index:
    # Expression values of this miRNA across the samples of each group
    pre_values = pre_fight_expression.loc[miRNA]
    post_values = post_fight_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(pre_values, post_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# One row per miRNA: its name, U statistic and raw p-value
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop tests that produced no p-value (NaN) before correcting: the expression
# matrix was loaded with errors='coerce', so NaN inputs are possible, and a
# NaN among the p-values would otherwise contaminate the adjusted values.
# .copy() makes the filtered frame independent so new columns can be added
# without chained-assignment warnings.
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction; `reject` flags the tests that pass at
# multipletests' default alpha.
reject, adjusted_p, _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')
diff_expression_df['Significant'] = reject
diff_expression_df['adjusted_p_value'] = adjusted_p

# Rank by raw p-value, smallest first. BH-adjusted p-values are monotone in
# the raw p-values, so this is also the ranking by adjusted p-value.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df

In [None]:
# Can we see which miRNA are differentially expressed in Serum pre and post fight?
# Filter for serum samples only


### How about getting rat data for mTBI?
#### Gene Expression data from Gene Expression Omnibus
+ https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE159011

#### Specifically from this paper
1.	Das Gupta S, Ciszek R, Heiskanen M, Lapinlampi N et al. Plasma miR-9-3p and miR-136-3p as Potential Novel Diagnostic Biomarkers for Experimental and Human Mild Traumatic Brain Injury. *Int J Mol Sci* 2021 Feb 4;22(4).


In [None]:
# Load the tab-separated raw count matrix; the first column becomes the row index.
rat_expression = pd.read_csv("GSE159011_Raw_counts_matrix.txt", index_col=0, sep='\t')

In [None]:
# Display the raw count matrix to check that it loaded as expected.
rat_expression

In [None]:
# Total count per column; these totals are what the normalization below rescales.
rat_expression.sum()

In [None]:
# Normalization: rescale every column so that its counts add up to one
# million, which makes columns with different totals comparable.

# Total count of each column
column_sums = rat_expression.sum(axis=0)

# Per-column multiplier that brings each total to exactly 1,000,000
normalization_factor = 1_000_000 / column_sums

# Apply each multiplier to its own column (the factors are matched to the
# columns by label)
normalized_df = rat_expression.mul(normalization_factor, axis='columns')

# Display the result; each column of `normalized_df` should now sum to 1 million
normalized_df

In [None]:
# Put the normalized values on a log2 scale. Adding 1 first keeps zero
# counts finite (log2(0 + 1) == 0).
log2_df = np.log2(normalized_df.add(1))
log2_df

In [None]:
# Get the data straight (off the omnibus)
# gse = GEOparse.get_GEO(geo="GSE159011", destdir=".")

In [None]:
# # And collect the metadata
# meta = {}
# for key in gse.gsms:
#     # print(key)
#     samp = gse.gsms[key].metadata['description'][0]
#     characteristics = {}
#     for item in gse.gsms[key].metadata['characteristics_ch1']:
#         temp = item.split(': ')
#         characteristics[temp[0]] = temp[1]

#     if samp not in meta:
#         meta[samp] = {}
#         meta[samp] = characteristics
#     else:
#         meta[samp] = characteristics

# rat_metadata = pd.DataFrame(meta).T
# rat_metadata.to_csv('GSE159011_metadata.csv')

In [None]:
# Load the rat sample metadata; the first CSV column is used as the row index.
# Later cells read its 'group' column and use the index to select columns
# of the expression matrix.
rat_metadata = pd.read_csv('GSE159011_metadata.csv', index_col=0)
# Last expression in the cell, so the notebook renders the table.
rat_metadata

In [None]:
# Strip the literal '-UMIs' text from the expression column labels so they
# can be looked up with the sample names taken from the metadata index.
log2_df.columns = log2_df.columns.str.replace('-UMIs', '', regex=False)

# Sample names (metadata index) of the two groups being compared
rat_group = rat_metadata['group']
naive_samples = rat_metadata.loc[rat_group == 'Naive'].index
severe_tbi_samples = rat_metadata.loc[rat_group == 'Severe TBI'].index

In [None]:
# Pull the log2 expression columns belonging to each group; a sample name
# with no matching column raises a KeyError.
naive_expression = log2_df.loc[:, naive_samples]
severe_tbi_expression = log2_df.loc[:, severe_tbi_samples]

In [None]:
# Imported here as well so the cell runs without depending on an earlier one.
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# Test each miRNA (one row of the log2 matrix) for a difference between the
# Naive and Severe TBI groups with a two-sided Mann-Whitney U test.
diff_expression_results = []

for miRNA in log2_df.index:
    # Expression values of this miRNA across the samples of each group
    naive_values = naive_expression.loc[miRNA]
    severe_tbi_values = severe_tbi_expression.loc[miRNA]

    stat, p_value = mannwhitneyu(naive_values, severe_tbi_values, alternative='two-sided')

    diff_expression_results.append({
        'miRNA': miRNA,
        'statistic': stat,
        'p_value': p_value
    })

# One row per miRNA: its name, U statistic and raw p-value
diff_expression_df = pd.DataFrame(diff_expression_results)

# Drop tests that produced no p-value (NaN) before correcting, so they do
# not contaminate the adjusted values. .copy() makes the filtered frame
# independent so new columns can be added without chained-assignment warnings.
diff_expression_df = diff_expression_df.dropna(subset=['p_value']).copy()

# Benjamini-Hochberg FDR correction, computed once; `reject` flags the tests
# that pass at multipletests' default alpha.
reject, adjusted_p, _, _ = multipletests(diff_expression_df['p_value'], method='fdr_bh')
diff_expression_df['adjusted_p_value'] = adjusted_p
diff_expression_df['Significant'] = reject

# Rank by raw p-value, smallest first. BH-adjusted p-values are monotone in
# the raw p-values, so this is also the ranking by adjusted p-value.
diff_expression_df = diff_expression_df.sort_values('p_value')
diff_expression_df