# Student Activity
> Cell Types: [Holmes et al. (2005)](https://www.pnas.org/content/102/15/5519) studied gene expression profiles of sorted T-cell populations from different subjects. The columns are a subset of the gene expression measurements; they correspond to 156 genes that show differential expression between cell types.

- Dataset: [../data/Msig3transp.csv](../data/Msig3transp.csv)
- Metadata:
    - Status = Healthy & Melanoma
    - Cell Types = Naive, Effector, Memory
    
- Your task:
    - Find out whether the gene expression profiles reflect status or cell type!
    - Do you find any samples that seem mislabelled?

In [None]:
# load library
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

## Cleaning & EDA

In [None]:
# load data
# Read the CSV file named in the task description (pointing read_csv at the
# bare '../data' directory cannot work).
raw = pd.read_csv('../data/Msig3transp.csv')
# Later cells look the sample names up as raw['sample']. The first CSV column
# is expected to have an empty header, which pandas reads as 'Unnamed: 0'
# (verify against the file), so give it a usable name.
raw = raw.rename(columns={'Unnamed: 0': 'sample'})
raw.head(1)

In [None]:
# extract metadata from first column: characters 0-2 of each sample name are
# taken as the status, characters 6-8 as the cell type
metadata = pd.DataFrame({
    'status': [name[:3] for name in raw['sample']],
    'cell_types': [name[6:9] for name in raw['sample']],
})
# metadata.head(1)

In [None]:
# merge into 1 dataframe: pair each expression row with its labels through
# the shared row index, then key the result by sample name
data = pd.merge(raw, metadata, left_index=True, right_index=True)
data = data.set_index('sample')
# data.head(1)

In [None]:
# move the sample names out of the data columns and into the row index
raw = raw.set_index(keys='sample')
# raw.head(1)

In [None]:
sns.set_theme()
# draw the table without the two label columns added from the metadata
sns.heatmap(data.drop(labels=['status', 'cell_types'], axis='columns'))
plt.show()

## PCA

In [None]:
# split data into features (X) and labels (y), both as plain NumPy arrays
X = raw.to_numpy()
Y = metadata.to_numpy()

In [None]:
# feature scaling
# StandardScaler rescales every column to zero mean and unit variance.
scaler = preprocessing.StandardScaler()

scaler.fit(X)
X_scaled_array = scaler.transform(X)
# Label the scaled columns with the gene names. X was taken from raw, so
# raw.columns lines up with it column for column (this relies on raw already
# being indexed by 'sample').
X_scaled = pd.DataFrame(X_scaled_array, columns=raw.columns)

X_scaled.head(5)

In [None]:
seed = 0
ndimensions = 2

pca = PCA(n_components=ndimensions, random_state=seed)
pca.fit(X_scaled)
X_pca_array = pca.transform(X_scaled)
# PC = principal component. Build the names from ndimensions so the frame
# still constructs if the number of components is changed above.
pc_names = ['PC%d' % (k + 1) for k in range(ndimensions)]
X_pca = pd.DataFrame(X_pca_array, columns=pc_names)
X_pca.sample(5)

## K-Means Clustering

In [None]:
# this is the k in kmeans: one cluster per annotated cell type
# (Naive, Effector, Memory); set it to 2 to test the status grouping instead
nclusters = 3

# n_init is given explicitly because its default differs between
# scikit-learn releases; 10 restarts is the long-standing default.
km = KMeans(n_clusters=nclusters, n_init=10, random_state=seed)
km.fit(X_scaled)

# predict the cluster for each data point
y_cluster_kmeans = km.predict(X_scaled)
y_cluster_kmeans

In [None]:
# Encode each label column as integer category codes.
Y1 = pd.Categorical(data['status'].values).codes
y_id_array1 = Y1
Y2 = pd.Categorical(data['cell_types'].values).codes
y_id_array2 = Y2

df_plot = X_pca.copy()
df_plot['ClusterKmeans'] = y_cluster_kmeans
# also add actual labels so we can use them in later plots
df_plot['status'] = y_id_array1
df_plot['cell_types'] = y_id_array2
df_plot.sample(5)

## Visualization

In [None]:
def plotData(df, groupby):
    """Scatter-plot PC1 against PC2, drawing each group in its own colour.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'PC1' and 'PC2' columns and a column named ``groupby``.
    groupby : str
        Name of the column whose distinct values define the groups.

    The plot is drawn on a new 7x7 inch figure; nothing is returned.
    """
    # a single subplot, so ax is one Axes object rather than an array
    _, ax = plt.subplots(figsize=(7, 7))

    # plt.get_cmap replaces mpl.cm.get_cmap, which newer Matplotlib removed
    cmap = plt.get_cmap('prism')

    groups = list(df.groupby(groupby))
    # Spread the colours over the groups actually present in this column
    # instead of the global k-means cluster count; max() avoids dividing by
    # zero when there is only one group.
    last = max(len(groups) - 1, 1)

    # we can use pandas to plot each group on the same graph.
    for position, (key, members) in enumerate(groups):
        members.plot(ax=ax,  # pass the axes so all scatterplots share one graph
                     kind='scatter',
                     x='PC1', y='PC2',
                     color=cmap(position / last),  # cmap maps a number to a color
                     label="%s %s" % (groupby, key),  # %s also accepts text keys
                     s=30)  # dot size
    ax.grid()
    ax.axhline(0, color='black')
    ax.axvline(0, color='black')
    ax.set_title("Principal Components Analysis")

In [None]:
# plot the clusters each datapoint was assigned to
plotData(df_plot, 'ClusterKmeans')
# plot the same points coloured by their actual labels for comparison
plotData(df_plot, 'status')
plotData(df_plot, 'cell_types')