# In [3]:
import numpy as np
import pandas as pd
import visualisation
import clustering
import plotly
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics import adjusted_rand_score

# Load the flow-cytometry table; the first CSV column is used as the index.
FlowDataAll = pd.read_csv('FlowData.csv', index_col=[0])
labelsAll = FlowDataAll.index  # The labels give the cell-type for each cell

# Make a data subset containing 5 of the major cell-type subsets and 400 cells.
# The frames are concatenated in the listed order, so the seeded sample below
# is drawn from the same row ordering every run.
frames = [
    FlowDataAll.iloc[labelsAll == cell_type, :]
    for cell_type in ('T cells', 'Neutrophils', 'Monocytes', 'NK cells',
                      'Eosinophils')
]
data = pd.concat(frames).sample(400, random_state=1)  # 400 cells at random
labels = data.index  # cell-type label of every sampled cell
N, D = data.shape
print('Cells: %s, Markers: %s' % (N, D))
data.head()

# Run PCA through the project-local helper. According to the notes kept with
# this cell:
#   W      - factor loading matrix
#   scores - PCA scores, usable to visualise the data in a lower-dimensional space
#   fracs  - eigenvalues normalised to sum to one, i.e. the fraction of variance
#            explained by each principal component (PC)
W, scores, fracs = visualisation.do_pca(data)

# Divide by the largest absolute score so that the plots look better.
largest_abs_score = scores.abs().max().max()
scores = scores.div(largest_abs_score)

# Attach the cell-type labels as an extra column used for colouring the plots.
scores['label'] = labels

# Notebook-style inspection of the PCA outputs. These are bare expressions:
# an interactive notebook displays each value when it ends a cell, but in a
# plain script they have no visible effect.
W  # factor loading matrix

scores.shape  # dimensions of the score table (including the added 'label' column)

scores  # scaled PCA scores plus the 'label' column

fracs  # fraction of variance explained per principal component

# Scree plot: fraction of variance explained by each principal component.
x = np.arange(len(fracs)) + 1  # component numbers 1 .. len(fracs)
y = np.array(fracs)  # the fractions as a numpy array
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y))
fig.update_layout(
    xaxis_title='Principal component',
    yaxis_title='Variance explained (fraction)',
    xaxis=dict(dtick=1.0),  # one tick per component
)
fig.show()

# Cumulative variance explained as more principal components are included.
x = np.arange(len(fracs)) + 1  # component numbers 1 .. len(fracs)
y = np.cumsum(fracs)  # running total of the per-component fractions
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y))
fig.update_layout(
    xaxis_title='Principal component',
    yaxis_title='Variance explained (cummulative)',
    xaxis=dict(dtick=1.0),  # one tick per component
)
fig.show()

# Scatter plot of the cells in the plane of two principal components,
# coloured by the 'label' column of the score table.
XPC, YPC = 'PC1', 'PC2'  # components shown on the x-axis and y-axis
fig = px.scatter(
    scores,
    x=XPC,
    y=YPC,
    color='label',
    hover_data=[XPC, YPC],
)
fig.update_traces(mode='markers', marker=dict(size=8, line=dict(width=1)))
fig.show()

# Percentage of variance explained by the first two principal components.
#
# `data` is a pandas DataFrame and has no `explained_variance_ratio_`
# attribute (that attribute belongs to a fitted scikit-learn PCA estimator),
# so reading it raised AttributeError. The per-component fractions are already
# available as `fracs`, returned by visualisation.do_pca above; converting to
# a numpy array makes positional indexing safe whatever container it is.
explained_variance_ratio = np.asarray(fracs)
for pc_number, ratio in enumerate(explained_variance_ratio[:2], start=1):
    print(
        f"Percentage of variance explained by PC{pc_number}: {ratio * 100:.2f}%"
    )

explained_variance_ratio

from sklearn.cluster import KMeans

# K-means on the raw marker values (not on the PCA scores).
# NOTE(review): the subset was built from five cell types but only three
# clusters are requested here - presumably intentional; confirm.
feature_matrix = data.values
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
kmeans.fit(feature_matrix)

kmeanslabels = kmeans.labels_  # cluster index assigned to each cell
print(kmeanslabels)

# PC1/PC2 scatter: colour encodes the cell-type label, marker symbol encodes
# the k-means cluster assignment.
fig = px.scatter(
    scores,
    x='PC1',
    y='PC2',
    color=labels,
    symbol=kmeanslabels,
)
fig.update_traces(mode='markers', marker=dict(size=8, line=dict(width=1)))
fig.show()

# Contingency table of k-means clusters (rows) against cell-type labels (columns).
df = pd.DataFrame({'Labels': labels, 'Clusters': kmeanslabels})
ct = pd.crosstab(index=df['Clusters'], columns=df['Labels'])
print(ct)

# Adjusted Rand index between the k-means clustering and the cell-type labels.
kmeansARI = adjusted_rand_score(kmeanslabels, labels)
print(kmeansARI)

from scipy.cluster.hierarchy import linkage

np.set_printoptions(precision=5,
                    suppress=True)  # suppress scientific float notation
%matplotlib inline

# generate the linkage matrix
X = data.values
print('Shape', X.shape)
Z = linkage(X, method='ward', metric='euclidean')
print(Z)

# Hierarchical clustering through the project-local helper, using the same
# Ward / Euclidean settings as the linkage matrix and a distance threshold of 0.3.
# NOTE(review): presumably returns one cluster label per row of `data` -
# confirm against clustering.Dendogram.
dendrogram_options = dict(
    method='ward',
    metric='euclidean',
    distanceThreshold=0.3,
)
clusterlabels = clustering.Dendogram(data, **dendrogram_options)

# Contingency table of hierarchical clusters (rows) against cell-type labels
# (columns).
#
# The 'Clusters' column must hold the `clusterlabels` variable returned by
# clustering.Dendogram. The original passed the quoted string 'clusterlabels',
# which filled every row with that same constant and collapsed the table to a
# single row.
# NOTE(review): assumes `clusterlabels` has one entry per cell, in the same
# order as `labels` - confirm against clustering.Dendogram.
df = pd.DataFrame({'Labels': labels, 'Clusters': clusterlabels})
ct = pd.crosstab(df['Clusters'], df['Labels'])
print(ct)

# Notebook output: ModuleNotFoundError: No module named 'visualisation'