This notebook loosely follows 04_extract_data_from_supplementary_excel_files.<br>It is a pure data-cleaning notebook.

Inputs:
1. NIHMS687993-supplement-supp_data_4v2.csv was prepared from 04-clean-supp_data_4
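2. NIHMS687993-supplement-supp_data_2.xlsx was downloaded from the supplementary materials of [PMC4481139](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4481139/) and is read from `data/downloads/`; only its "Cell Cycle Regulated Genes" sheet is used.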

Outputs:
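1. mouse_gene_metadata.csv (written to `data/`): one row per gene symbol and one True/False column per retina or cell-cycle cluster.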

In [1]:
import pandas as pd
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Switch: when True, the final cell writes the combined table to disk;
# set to False to run the notebook without touching the output CSV.
to_save = True

# Get Data

In [3]:
# FINAL_MARKERS_FOR_EACH_CLUSTER
# Load the per-cluster marker genes and tag every row with a zero-padded
# cluster label ('retina_01', 'retina_02', ...) plus a constant membership
# flag that the later pivot uses as its cell value.
cluster_markers_df = (
    pd.read_csv('data/NIHMS687993-supplement-supp_data_4v2.csv')
    .assign(
        cluster_name=lambda markers: 'retina_' + markers['cluster_no'].astype(str).str.zfill(2),
        in_cluster=True,
    )
)
cluster_markers_df.head()

Unnamed: 0,gene_symbol,myAUC,myDiff,power,cluster_no,cluster_name,in_cluster
0,CALB1,0.966,3.615047,0.466,1,retina_01,True
1,SLC4A3,0.963,3.448571,0.463,1,retina_01,True
2,TPM3,0.965,3.151521,0.465,1,retina_01,True
3,SEPT4,0.964,2.939258,0.464,1,retina_01,True
4,VIM,0.944,2.937992,0.444,1,retina_01,True


In [4]:
# Cell Cycle Regulated Genes
# Read the 'Cell Cycle Regulated Genes' sheet with explicit column names
# (the fifth column is deliberately given no name).
mmc2 = pd.read_excel('data/downloads/NIHMS687993-supplement-supp_data_2.xlsx',
                     sheet_name='Cell Cycle Regulated Genes',
                     names=['human_cluster', 'human_gene', 'cluster_no', 'gene_symbol',
                            None, 'all_mouse_genes', 'novel_mouse_genes', 'annotation']
                    )
# Take an independent copy of the two mouse columns. Adding columns to a bare
# column selection of `mmc2` is a chained assignment (SettingWithCopyWarning);
# the explicit copy makes the new columns land on `mouse_genes` unambiguously
# and leaves `mmc2` untouched.
mouse_genes = mmc2[['cluster_no', 'gene_symbol']].copy()
# Zero-padded cluster label ('cellcycle_01', ...) and a constant membership
# flag that the later pivot uses as its cell value.
mouse_genes['cluster_name'] = 'cellcycle_' + mouse_genes['cluster_no'].astype(str).str.zfill(2)
mouse_genes['in_cluster'] = True
mouse_genes[['gene_symbol', 'cluster_no', 'cluster_name', 'in_cluster']]

Unnamed: 0,gene_symbol,cluster_no,cluster_name,in_cluster
0,Shmt1,1,cellcycle_01,True
1,Zmym1,1,cellcycle_01,True
2,Meaf6,1,cellcycle_01,True
3,Usp37,1,cellcycle_01,True
4,Msh6,1,cellcycle_01,True
...,...,...,...,...
663,Ccdc6,8,cellcycle_08,True
664,Luc7l3,8,cellcycle_08,True
665,Gm9843,8,cellcycle_08,True
666,Rsl1d1,8,cellcycle_08,True


# Combine and Reshape

In [5]:
# Reshape the long (gene, cluster) marker list into a wide membership table:
# one row per gene symbol, one column per retina cluster.
# `pivot` leaves NaN wherever a gene is not listed for a cluster, which makes
# the columns object dtype. Fill those gaps with False and cast explicitly to
# bool instead of relying on pandas' deprecated silent object->bool downcast
# inside `fillna`.
retina_metadata = (
    cluster_markers_df
    .pivot(index='gene_symbol',
           columns='cluster_name',
           values='in_cluster')
    .fillna(False)
    .astype(bool)
)
print(retina_metadata.shape)
retina_metadata.head()

(1339, 39)


cluster_name,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_30,retina_31,retina_32,retina_33,retina_34,retina_35,retina_36,retina_37,retina_38,retina_39
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [6]:
# Reshape the long (gene, cluster) cell-cycle list into a wide membership
# table: one row per gene symbol, one column per cell-cycle cluster.
# `pivot` leaves NaN wherever a gene is not listed for a cluster, which makes
# the columns object dtype. Fill those gaps with False and cast explicitly to
# bool instead of relying on pandas' deprecated silent object->bool downcast
# inside `fillna`.
cell_cycle_metadata = (
    mouse_genes
    .pivot(index='gene_symbol',
           columns='cluster_name',
           values='in_cluster')
    .fillna(False)
    .astype(bool)
)
print(cell_cycle_metadata.shape)
cell_cycle_metadata.head()

(668, 8)


cluster_name,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aaas,False,False,False,False,True,False,False,False
Acat2,False,False,True,False,False,False,False,False
Acot9,False,False,False,False,False,False,True,False
Actb,False,False,False,False,False,False,False,True
Adar,False,False,False,False,False,False,False,True


In [7]:
# Put the retina and cell-cycle membership tables side by side, aligned on
# gene symbol (outer join on the index). A gene missing from one table comes
# back as NaN in that table's columns, so fill with False and cast to bool so
# every column is a genuine boolean column rather than object dtype.
mouse_gene_metadata = (
    pd.concat([retina_metadata, cell_cycle_metadata], axis=1)
    .fillna(False)
    .astype(bool)
)
# NOTE(review): the recorded shapes are (1339, 39) and (668, 8) -> (2007, 47),
# and 1339 + 668 == 2007, so not a single gene symbol is shared between the
# two tables. The retina symbols are upper-case (e.g. CALB1) while the
# cell-cycle symbols are title-case (e.g. Shmt1), so the same gene may appear
# as two separate rows -- confirm whether symbol case should be normalised
# before joining.
print(mouse_gene_metadata.shape)
mouse_gene_metadata.head()

(2007, 47)


cluster_name,retina_01,retina_02,retina_03,retina_04,retina_05,retina_06,retina_07,retina_08,retina_09,retina_10,...,retina_38,retina_39,cellcycle_01,cellcycle_02,cellcycle_03,cellcycle_04,cellcycle_05,cellcycle_06,cellcycle_07,cellcycle_08
1500015O10RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1500016L03RIK,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1700025G04RIK,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1810009A15RIK,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1810037I17RIK,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# Write the combined membership table to disk only when saving is enabled.
if to_save:
    mouse_gene_metadata.to_csv(
        'data/mouse_gene_metadata.csv',
        index=True,  # keep the gene symbols (the index) as the first CSV column
    )