Imports and data loading
-----------

In [6]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter('ignore')
import os
from scipy import stats
import colorcet as cc
import scipy
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
%matplotlib inline

In [7]:
# Load the syllable-usage table from disk.
# Assumes mean_df_female.csv sits in the notebook's current working directory.
cpath = os.getcwd()
csv_path = cpath + '/mean_df_female.csv'
mean_df = pd.read_csv(csv_path)

# ----------------------------------------------------------------------- 
# ANALYSIS
# -----------------------------------------------------------------------

In [8]:
# Prepare data: build one syllable-usage vector per (mouse, group, rep) row.
session_keys = ['mouse', 'group', 'rep']

# Average 'usage' within every (mouse, group, rep, syllable) combination.
# 'usage' is selected *before* aggregating so that only that column is averaged;
# calling .mean() on the whole grouped frame first raises a TypeError in
# pandas >= 2.0 if mean_df carries any non-numeric column besides the keys.
data = mean_df.groupby(by=session_keys + ['syllable'])['usage'].mean().reset_index()

# Reshape to wide form (rows: mouse/group/rep, columns: syllables);
# missing values left by the reshape are replaced with 0.
data = pd.pivot_table(data, values='usage', index=session_keys, columns=['syllable']).reset_index().fillna(0)

# Feature matrix (syllable columns only) and the mouse label of each row.
data_umap = data.drop(session_keys, axis=1).to_numpy()
data_umap_labels = data['mouse'].to_numpy()

## clustering analysis

In [9]:
# clustering for individuals
# K-means on the standardized syllable-usage vectors, then the adjusted Rand
# index (ARI) between the cluster assignment and mouse identity.
frequencies = data_umap

# scaling the data (StandardScaler standardizes each feature column)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(frequencies)

nclust = 16  # for 16 individuals
kmeans_kwargs = {"init": "random",
                 "n_init": 10,
                 "max_iter": 300,
                 "random_state": 10}

kmeans = KMeans(n_clusters=nclust, **kmeans_kwargs)

# Cluster assignment of every row; fit_predict returns the same array that is
# stored in kmeans.labels_, so the return value is used directly.
clust_labels_indv = kmeans.fit_predict(scaled_features)

# Cluster ids listed in order of first appearance along the rows.
_, idx = np.unique(clust_labels_indv, return_index=True)
clust_labels = clust_labels_indv[np.sort(idx)]

# True labels: per-row mouse label, its sorted unique values, and a working copy.
mouse_labels_all = data['mouse'].to_numpy()
mouse_true_labels = np.unique(mouse_labels_all)  # unique mice in the experiment
new_mouse_labels = mouse_labels_all.copy()       # one label per row, to be recoded

# Recode the i-th unique mouse as the i-th cluster id.
# The mask is computed on the untouched original labels, not on the array being
# overwritten: otherwise, when mouse labels are numeric and overlap the cluster
# ids, an id written in an earlier iteration would be matched again as a later
# "true" label and distinct mice would be merged.
for i, true_label in enumerate(mouse_true_labels):
    new_mouse_labels[mouse_labels_all == true_label] = clust_labels[i]

clust_labels_indv_str = list(map(str, clust_labels_indv))  # clusters as string labels

# Agreement between mouse identity and cluster assignment.
# ARI depends only on how rows are grouped, not on the label values themselves,
# so the recoding above does not affect the score as long as it is one-to-one.
ari_kmeans = adjusted_rand_score(new_mouse_labels, clust_labels_indv_str)
print('ARI for clustering by indvidual '+str(ari_kmeans))

ARI for clustering by indvidual 0.4344142768774923


In [10]:
## clustering for phase
# K-means with one cluster per estrous phase on the already standardized
# features (scaled_features comes from the previous cell), then the adjusted
# Rand index (ARI) between the cluster assignment and the 'group' label.
nclust = 4  # for 4 estrous phases
kmeans_kwargs = {"init": "random",
                 "n_init": 10,
                 "max_iter": 300,
                 "random_state": 3}

kmeans = KMeans(n_clusters=nclust, **kmeans_kwargs)

# Cluster assignment of every row; fit_predict returns the same array that is
# stored in kmeans.labels_, so the return value is used directly.
clust_labels_phase = kmeans.fit_predict(scaled_features)

# Cluster ids listed in order of first appearance along the rows.
_, idx = np.unique(clust_labels_phase, return_index=True)
clust_labels = clust_labels_phase[np.sort(idx)]

# True labels: per-row phase label, its sorted unique values, and a working copy.
phase_labels_all = data['group'].to_numpy()
phase_true_labels = np.unique(phase_labels_all)  # unique phases in the experiment
new_phase_labels = phase_labels_all.copy()       # one label per row, to be recoded

# Recode the i-th unique phase as the i-th cluster id.
# The mask is computed on the untouched original labels, not on the array being
# overwritten: otherwise, when phase labels are numeric and overlap the cluster
# ids, an id written in an earlier iteration would be matched again as a later
# "true" label and distinct phases would be merged.
for i, true_label in enumerate(phase_true_labels):
    new_phase_labels[phase_labels_all == true_label] = clust_labels[i]

clust_labels_phase_str = list(map(str, clust_labels_phase))  # clusters as string labels

# Agreement between phase and cluster assignment.
# ARI depends only on how rows are grouped, not on the label values themselves,
# so the recoding above does not affect the score as long as it is one-to-one.
ari_kmeans = adjusted_rand_score(new_phase_labels, clust_labels_phase_str)
print('ARI for clustering by phase '+str(ari_kmeans))

ARI for clustering by phase 0.030187207225155353
