# SGPS Molecular Techniques
## Exercise 3: Working with genome-wide gene expression data
### Comparing gene expression of naive, LPS and IFNy treated monocytes.
+ Data from this paper:
+ Fairfax, BP et al. Innate Immune Activity Conditions the Effect of Regulatory Variants upon Monocyte Gene Expression. Science 343,1246949(2014).
#### First look: just one gene - TNF from the genome wide expression dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn import manifold
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Load the expression table; the first CSV column becomes the row index
expr_all = pd.read_csv(
    'monocyte_all_expression.csv',
    index_col=0,
)
expr_all

In [None]:
# Extract TNF expression and derive each sample's treatment from the last
# '_'-separated field of its index label
expr_tnf = expr_all[['TNF']].copy()
expr_tnf['Treatment'] = [idx.split('_')[-1] for idx in expr_tnf.index]

# Group by treatment and calculate the mean and standard deviation
tnf_by_treatment = expr_tnf.groupby('Treatment')['TNF']
tnf_grouped = tnf_by_treatment.agg(['mean', 'std'])
tnf_grouped.columns = ['Mean', 'StDev']

# Number of replicates per treatment, counted from the data rather than
# assumed, so the standard error reflects the real size of each group
n_replicates = tnf_by_treatment.count()
tnf_grouped['StErr'] = tnf_grouped['StDev'] / np.sqrt(n_replicates)

# Define the plotting order and the colour of each treatment
labels = ['Untreated', 'IFN', 'LPS2h', 'LPS24h']
colours = ['red', 'green', 'blue', 'gold']

# Reorder the rows to follow the plotting order above
tnf_grouped = tnf_grouped.reindex(labels)

# Bar chart of mean TNF expression, with standard-error bars
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels,
       tnf_grouped['Mean'],
       yerr=tnf_grouped['StErr'],
       color=colours,
       capsize=5)

ax.set_xlabel('Treatment')
ax.set_ylabel('TNF Expression')
ax.set_title('TNF Gene Expression Across Treatments with StErr')

plt.show()

## Working with all the gene expression data
## Looking at dimension reduction, using principal component analysis

In this dataset we have:
    - 228 donors, ~15000 gene expression values

The gene expression is from the innate immune cells, monocytes under 4 conditions:
    - Untreated, as a control.
    - Treated with interferon-gamma (IFN) for 24 hours – a good model for viral infections.
    - Treated with Lipopolysaccharide (LPS) for 2 hours - LPS is a major component of the outer wall of gram negative bacteria, which our body registers as a toxin and elicits a strong immune response.
    - Treated with Lipopolysaccharide (LPS) for 24 hours.
    
So we have a dataset for 912 samples (from 228 donors for 4 conditions each), gene expression data for ~15,000 genes. 

How to understand this dataset?

No doubt there is high redundancy amongst the ~15,000 gene expression values, so reducing them to a much smaller number of dimensions could be really helpful for interpreting the dataset (in this case, principal component analysis projects the expression of all genes into one value per sample for each principal component).



In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.decomposition import PCA
sns.set_context("poster")

In [None]:
# Read the expression table again, with the first CSV column as the row index
expr_all = pd.read_csv(
    'monocyte_all_expression.csv',
    index_col=0,
)

In [None]:
# See what it looks like: as the last expression in the cell, the notebook
# renders the dataframe
expr_all


In [None]:
# Get the treatment for each sample, encoded as an integer class.
# The treatment is recognised by a tag contained in the sample's index label;
# tags are tried in this order and the first match wins.
treatment_codes = {'Untreated': 0, 'IFN': 1, 'LPS2h': 2, 'LPS24h': 3}

y = []
for item in expr_all.index.values:
    for tag, code in treatment_codes.items():
        if tag in item:
            y.append(code)
            break
    else:
        # Keep y aligned with the rows of expr_all: an unrecognised sample
        # gets the sentinel -1, which matches none of the plotted classes,
        # instead of being dropped (which would shift every later mask)
        print('Error: Data from non-recognisable experiment:', item)
        y.append(-1)
y = np.array(y)

# Display names and colours, indexed by the integer class above
labels = ['Untreated', 'IFN', 'LPS 2h', 'LPS 24h']
colours = ['red', 'green', 'blue', 'gold']

In [None]:
# Fit a PCA on the full expression table and plot the samples on
# components 1 and 2, coloured by treatment
pca = PCA()
X_pca = pca.fit_transform(expr_all)
var_expl = pca.explained_variance_ratio_
r_col = []

fig = plt.figure(figsize=(18, 12))
for i, (colour, target_name) in enumerate(zip(colours, labels)):
    # Boolean mask selecting the samples of treatment class i
    in_class = y == i
    plt.scatter(
        X_pca[in_class, 0],
        X_pca[in_class, 1],
        s=8,
        color=colour,
        lw=2,
        label=target_name,
    )

plt.title("PCA of monocytes")
plt.legend(loc=1, shadow=False)
# plt.axis([-4, 4, -1.5, 1.5])
# Axis labels report the share of variance explained by each component
plt.xlabel(f'1st Comp: {round(var_expl[0]*100,1)!s}% variance explained')
plt.ylabel(f'2nd Comp: {round(var_expl[1]*100,1)!s}% variance explained')
fig.savefig('PCA_Extreme_Data_comp1_comp2.pdf')
plt.show()

In [None]:
# Plot the samples on components 2 and 3, coloured by treatment
fig = plt.figure(figsize=(18, 12))
for i, (colour, target_name) in enumerate(zip(colours, labels)):
    # Boolean mask selecting the samples of treatment class i
    in_class = y == i
    plt.scatter(
        X_pca[in_class, 1],
        X_pca[in_class, 2],
        s=8,
        color=colour,
        lw=2,
        label=target_name,
    )

plt.title("PCA of monocytes")
plt.legend(loc=1, shadow=False)
# plt.axis([-4, 4, -1.5, 1.5])
plt.xlabel(f'2nd Comp: {round(var_expl[1]*100,1)!s}% variance explained')
plt.ylabel(f'3rd Comp: {round(var_expl[2]*100,1)!s}% variance explained')
fig.savefig('PCA_Extreme_Data_comp2_comp3.pdf')
plt.show()

In [None]:
# PCA of component 3 and 4
# Components are zero-indexed in X_pca and var_expl: the 3rd component is
# column 2 and the 4th is column 3, matching the axis labels below.
fig = plt.figure(figsize=(18,12))
for colour, i, target_name in zip(colours, range(len(labels)), labels):
    plt.scatter(X_pca[y == i, 2], X_pca[y == i, 3], s= 8, color=colour, lw=2, label=target_name)

plt.title("PCA of monocytes")
plt.legend(loc=1, shadow=False)
# plt.axis([-4, 4, -1.5, 1.5])
plt.xlabel('3rd Comp: ' + str(round(var_expl[2]*100,1)) + '% variance explained')
plt.ylabel('4th Comp: ' + str(round(var_expl[3]*100,1)) + '% variance explained')
# Save under its own name so the component 2/3 figure is not overwritten
fig.savefig('PCA_Extreme_Data_comp3_comp4.pdf')
plt.show()

In [None]:
# Loadings of every gene on every principal component.
# The component matrix is transposed so that rows line up with the gene
# columns of expr_all and columns with the components.
loadings = pca.components_.T

# Label the rows with gene names and the columns PC1, PC2, ...
pc_names = [f'PC{k}' for k in range(1, loadings.shape[1] + 1)]
loadings_df = pd.DataFrame(loadings, index=expr_all.columns, columns=pc_names)

# The 100 genes with the largest and the 100 with the smallest PC1 loadings
pc1_loadings = loadings_df['PC1']
top_100_pc1 = pc1_loadings.nlargest(100)
bottom_100_pc1 = pc1_loadings.nsmallest(100)

# Print the bottom-100 gene names, one per line
for gene in bottom_100_pc1.index:
    print(gene)

#### Can we see what gene pathways this list is enriched in?

Try pasting these genes into Enrichr - https://maayanlab.cloud/Enrichr/
and look at:

+ Ontologies -> GO Biological Process
+ Pathways -> KEGG

If a particular KEGG pathway is of interest, you can also go further and look at it in KEGG: https://www.genome.jp/kegg/

### Clustering the data

From the PCA (well, for the first three components), we can see a good separation between the four different treatments of the cells. Can we use clustering methods to fully classify them?

Read through this tutorial - https://realpython.com/k-means-clustering-python/ and have a go at clustering the PCA data.

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [None]:
# Let's set K-means for 4 clusters
kmeans = KMeans(
    init="random",
    n_clusters=4,
    n_init=10,
    max_iter=300,
    random_state=42
)

In [None]:
# Cluster the samples using their scores on the first three principal components
kmeans.fit(X_pca[:,:3])
# A bare expression in the middle of a cell is not displayed by the notebook,
# so print the inertia explicitly
print('Inertia:', kmeans.inertia_)
# Last expression in the cell: the notebook displays the cluster centres
kmeans.cluster_centers_
# Counter(kmeans.labels_)

In [None]:
# Let's see how the clusters look on our PC1/PC2 plot, with each cluster
# centre marked by a black cross
new_colours = ['aqua', 'coral', 'yellow', 'seagreen']
fig = plt.figure(figsize=(18, 12))
for i, colour in enumerate(new_colours):
    # Samples assigned to cluster i
    in_cluster = kmeans.labels_ == i
    plt.scatter(
        X_pca[in_cluster, 0],
        X_pca[in_cluster, 1],
        s=8,
        color=colour,
        lw=2,
        label=i,
    )
    centre = kmeans.cluster_centers_[i]
    plt.scatter(centre[0], centre[1], s=58, color='black', marker='x')

plt.title("PCA of monocytes")
plt.legend(loc=1, shadow=False)
# plt.axis([-4, 4, -1.5, 1.5])
plt.xlabel(f'1st Comp: {round(var_expl[0]*100,1)!s}% variance explained')
plt.ylabel(f'2nd Comp: {round(var_expl[1]*100,1)!s}% variance explained')
fig.savefig('PCA_Extreme_Data_comp1_comp2_with_clusters.pdf')
plt.show()

Instead of relying on the PCA, can we use clustering methods on all the data to fully classify them?

In [None]:
# Refit the same k-means estimator on the full expression table and count
# how many samples land in each cluster
Counter(kmeans.fit(expr_all).labels_)

In [None]:
# Let's see how the clusters from the all-genes fit look on our PC1/PC2 plot
new_colours = ['teal', 'orangered', 'goldenrod', 'chartreuse']
fig = plt.figure(figsize=(18,12))
for colour, i, target_name in zip(new_colours, range(4), range(4)):
    plt.scatter(X_pca[kmeans.labels_ == i, 0], X_pca[kmeans.labels_ == i, 1], s= 8, color=colour, lw=2, label=target_name)

plt.title("PCA of monocytes (k-means on all genes)")
plt.legend(loc=1, shadow=False)
# plt.axis([-4, 4, -1.5, 1.5])
plt.xlabel('1st Comp: ' + str(round(var_expl[0]*100,1)) + '% variance explained')
plt.ylabel('2nd Comp: ' + str(round(var_expl[1]*100,1)) + '% variance explained')
# Save under a distinct name so the figure from the PCA-based clustering
# is not overwritten
fig.savefig('PCA_Extreme_Data_comp1_comp2_with_clusters_all_genes.pdf')
plt.show()

In [None]:
# Display the per-sample cluster labels currently stored on the k-means object
kmeans.labels_