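This notebook follows 04_extract_data_from_supplementary_excel_files. It combines the retina cluster markers and the cell-cycle gene clusters into a single boolean gene-by-cluster membership table.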

Inputs:
1. FINAL_MARKERS_FOR_EACH_CLUSTER.csv was prepared from 04-clean-supp_data_4
2. NIHMS687993-supplement-supp_data_2.xlsx was downloaded from [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4481139/).

Outputs:
1. mouse_gene_metadata.csv

In [2]:
import pandas as pd
import numpy as np
import itertools
from harrison_functions.formatting.list_tools import pairwise

In [3]:
# Set to True to write the final table to disk in the last cell.
to_save = False

# Get Data

In [7]:
# Retina cluster markers (input 1 in the notebook header).
cluster_markers = pd.read_csv(
    filepath_or_buffer='data/FINAL_MARKERS_FOR_EACH_CLUSTER.csv',
)

# Supplementary table with the cell-cycle gene clusters (input 2 in the header).
mmc2 = pd.read_excel(
    io='data/downloads/NIHMS687993-supplement-supp_data_2.xlsx',
)
mmc2

Unnamed: 0,cluster,human gene,cluster.1,mouse gene,Unnamed: 4,Intersection,Unnamed: 6,Unnamed: 7
0,1.0,CCNE2,1,Shmt1,,All genes,novel genes,annotation
1,1.0,CDC6,1,Zmym1,,ACTB,ACTB,
2,1.0,CLSPN,1,Meaf6,,AKIRIN2,ARHGAP11A,
3,1.0,DTL,1,Usp37,,ANLN,ARL6IP6,
4,1.0,MCM3,1,Msh6,,ANP32E,ARPC2,
...,...,...,...,...,...,...,...,...
663,,,8,Ccdc6,,,,
664,,,8,Luc7l3,,,,
665,,,8,Gm9843,,,,
666,,,8,Rsl1d1,,,,


In [9]:
# Build a long-format table of cell-cycle cluster membership: one row per
# (cluster, mouse gene) pair taken from the supplementary table.
# `.rename()` returns a new DataFrame, so the columns derived below are written
# to an independent object rather than into a slice of `mmc2`; assigning into
# `mmc2[[...]]` directly is what triggered the SettingWithCopyWarning.
mouse_genes = (
    mmc2[['cluster.1', 'mouse gene']]
    .rename(columns={'cluster.1': 'cluster_name'})
    .assign(
        # Zero-pad the cluster number to two digits (cellcycle_01, cellcycle_02, ...).
        cluster_name=lambda df: 'cellcycle_' + df['cluster_name'].astype(str).str.zfill(2),
        # Constant membership flag carried by every row of the long table.
        in_cluster=True,
    )
)

mouse_genes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,cluster_name,mouse gene,in_cluster
0,cellcycle_01,Shmt1,True
1,cellcycle_01,Zmym1,True
2,cellcycle_01,Meaf6,True
3,cellcycle_01,Usp37,True
4,cellcycle_01,Msh6,True


# Cell Cycle

In [10]:
# Reshape the long membership table to wide form: one row per mouse gene and
# one column per cell-cycle cluster. Gene/cluster pairs that are absent from
# the long table come out of the pivot as NaN and are filled with False.
cellcycle = (
    mouse_genes
    .pivot(index='mouse gene', columns='cluster_name', values='in_cluster')
    .fillna(False)
)
cellcycle.head()

cluster_name,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
mouse gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aaas,False,False,False,False,True,False,False,False
Acat2,False,False,True,False,False,False,False,False
Acot9,False,False,False,False,False,False,True,False
Actb,False,False,False,False,False,False,False,True
Adar,False,False,False,False,False,False,False,True


In [11]:
# Add the two helper columns needed for pivoting the retina markers:
#   cluster_name - 'retina_' plus the cluster number zero-padded to two digits
#   in_cluster   - constant True membership flag
cluster_markers = cluster_markers.assign(
    cluster_name='retina_' + cluster_markers['cluster_no'].astype(str).str.zfill(2),
    in_cluster=True,
)
cluster_markers.head()

Unnamed: 0,gene_symbol,myAUC,myDiff,power,cluster_no,cluster_name,in_cluster
0,CALB1,0.966,3.615047,0.466,1,retina_01,True
1,SLC4A3,0.963,3.448571,0.463,1,retina_01,True
2,TPM3,0.965,3.151521,0.465,1,retina_01,True
3,SEPT4,0.964,2.939258,0.464,1,retina_01,True
4,VIM,0.944,2.937992,0.444,1,retina_01,True


In [12]:
# Wide membership matrix for the retina markers: one row per gene symbol, one
# column per retina cluster. Gene/cluster pairs not listed in the marker table
# are NaN after the pivot and are filled with False.
mouse_gene_metadata = (
    cluster_markers
    .pivot(index='gene_symbol', columns='cluster_name', values='in_cluster')
    .fillna(False)
)
mouse_gene_metadata.head()

cluster_name,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_29,retina_30,retina_31,retina_32,retina_33,retina_34,retina_35,retina_36,retina_37,retina_38
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [13]:
# Combine the retina-marker and cell-cycle membership matrices side by side.
# Rows are aligned on the gene-symbol index; a gene present in only one of the
# two tables gets NaN in the other table's columns.
# Cell-cycle columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not append a duplicate set of columns.
# NOTE(review): the retina symbols displayed earlier are upper-case (e.g. CALB1)
# while the cell-cycle symbols are title-case (e.g. Zmym1), so the same gene may
# end up on two separate rows -- confirm whether the case should be normalised
# before combining.
mouse_gene_metadata = pd.concat(
    [
        mouse_gene_metadata.drop(columns=cellcycle.columns, errors='ignore'),
        cellcycle,
    ],
    axis=1,
)
print(mouse_gene_metadata.shape)
mouse_gene_metadata.head()

(1938, 46)


cluster_name,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_37,retina_38,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,,,,,,,,
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,,,,,,,,
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,,,,,,,,
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,True,,,,,,,,
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,,,,,,,,


In [14]:
# Genes that appear in only one of the two source tables have NaN in the other
# table's columns after the concat; treat "not listed" as "not in the cluster".
# The explicit cast pins every column to bool instead of relying on fillna to
# downcast object columns on its own (pandas 2.2 deprecates that implicit
# downcast).
mouse_gene_metadata = mouse_gene_metadata.fillna(False).astype(bool)
mouse_gene_metadata

cluster_name,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_37,retina_38,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zfp36l2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
Zmym1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
Zmynd19,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
Zrsr2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [None]:
# Write the combined table to disk only when saving was enabled via `to_save`.
if to_save:
    mouse_gene_metadata.to_csv(
        path_or_buf='data/metadata/mouse_gene_metadata.csv',
        index=True,  # the gene symbols live in the index, so keep it in the CSV
    )