In [2]:
# Imports are listed in alphabetical order, which is the usual convention.
# "import superlongname as abbrev" gives a long module name a short alias, so we don't have to type out the whole name each time.

# Python plotting library
import matplotlib.pyplot as plt

# Numerical python library (pronounced "num-pie")
import numpy as np

# Dataframes in Python
import pandas as pd

# HDF5 storage in pandas needs PyTables; install it with: conda install -c anaconda pytables=3.2.2
from pandas import HDFStore as hdfstore

# T-test of independent samples
from scipy.stats import ttest_ind

# Statistical plotting library we'll use
import seaborn as sns
sns.set(style='whitegrid')

# Matrix decomposition
from sklearn.decomposition import PCA, FastICA

# Manifold learning
from sklearn.manifold import MDS, TSNE

# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline

In [30]:
# ! ls /Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data
filename = '/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/GSE63472_P14Retina_merged_digital_expression.txt'
! ls /Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/

[31mGSE63472_P14Retina_merged_digital_expression.txt[m[m
[31mGSE63472_mm10_reference_metadata.tar[m[m
[30m[43mmm10[m[m
[31mstore.h5[m[m


In [31]:
# Load the tab-separated expression table. The first column (position 0,
# because Python counts from zero) supplies the row names.
mccarroll_data = pd.read_csv(filename, sep='\t', index_col=0)

In [32]:
# Save the parsed table to HDF5 under the key 'mccarroll_data'; it is read
# back from this file with pd.read_hdf further down.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
mccarroll_data.to_hdf('/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/mc_hd5.h5', key='mccarroll_data')

In [13]:
# Path of the HDF5 file that the cells below read from and write to
hdf_file = '/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/mc_hd5.h5'

In [3]:
# Reload the expression table stored under the 'mccarroll_data' key
mc2 = pd.read_hdf(hdf_file, key='mccarroll_data')

In [4]:
# Preview the first five rows of the reloaded table
mc2.head()

Unnamed: 0_level_0,r1_GGCCGCAGTCCG,r1_CTTGTGCGGGAA,r1_GCGCAACTGCTC,r1_GATTGGGAGGCA,r1_CCTCCTAGTTGG,r1_AGTCAAGCCCTC,r1_GTGCCGCCTCTC,r1_CCTGTGACACAC,r1_AATCTCGTTAAT,r1_GATTTCCTCTGA,...,p1_GAGGGGCTCTAA,p1_AGCCAAGGCTCG,p1_TGAGTCGTCTTA,p1_AACGGTCGCTTT,p1_CGAATACGTGTC,p1_TCAAAAGCCGGG,p1_ATTAAGTTCCAA,p1_CTGTCTGAGACC,p1_TAACGCGCTCCT,p1_ATTCTTGTTCTT
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KITL,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
TMTC3,3,0,0,0,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CEP290,1,3,0,2,1,18,10,3,4,3,...,0,0,0,0,0,0,0,0,0,0
4930430F08RIK,2,1,2,0,1,1,0,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1700017N19RIK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Display the frame itself (pandas abbreviates the repr of large tables)
mc2

Unnamed: 0_level_0,r1_GGCCGCAGTCCG,r1_CTTGTGCGGGAA,r1_GCGCAACTGCTC,r1_GATTGGGAGGCA,r1_CCTCCTAGTTGG,r1_AGTCAAGCCCTC,r1_GTGCCGCCTCTC,r1_CCTGTGACACAC,r1_AATCTCGTTAAT,r1_GATTTCCTCTGA,...,p1_GAGGGGCTCTAA,p1_AGCCAAGGCTCG,p1_TGAGTCGTCTTA,p1_AACGGTCGCTTT,p1_CGAATACGTGTC,p1_TCAAAAGCCGGG,p1_ATTAAGTTCCAA,p1_CTGTCTGAGACC,p1_TAACGCGCTCCT,p1_ATTCTTGTTCTT
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KITL,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
TMTC3,3,0,0,0,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CEP290,1,3,0,2,1,18,10,3,4,3,...,0,0,0,0,0,0,0,0,0,0
4930430F08RIK,2,1,2,0,1,1,0,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1700017N19RIK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MGAT4C,0,0,4,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RASSF9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LRRIQ1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADGB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SLC6A15,4,1,3,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# log2(count + 1): adding 1 first means a count of zero maps to zero
mc2_logged = np.log2(mc2.add(1))
mc2_logged.head()

Unnamed: 0_level_0,r1_GGCCGCAGTCCG,r1_CTTGTGCGGGAA,r1_GCGCAACTGCTC,r1_GATTGGGAGGCA,r1_CCTCCTAGTTGG,r1_AGTCAAGCCCTC,r1_GTGCCGCCTCTC,r1_CCTGTGACACAC,r1_AATCTCGTTAAT,r1_GATTTCCTCTGA,...,p1_GAGGGGCTCTAA,p1_AGCCAAGGCTCG,p1_TGAGTCGTCTTA,p1_AACGGTCGCTTT,p1_CGAATACGTGTC,p1_TCAAAAGCCGGG,p1_ATTAAGTTCCAA,p1_CTGTCTGAGACC,p1_TAACGCGCTCCT,p1_ATTCTTGTTCTT
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KITL,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TMTC3,2.0,0.0,0.0,0.0,1.584963,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CEP290,1.0,2.0,0.0,1.584963,1.0,4.247928,3.459432,2.0,2.321928,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4930430F08RIK,1.584963,1.0,1.584963,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1700017N19RIK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Transpose so that the row labels of `mc2` become the columns.
# Guarded so that re-running this cell does not flip the matrix back:
# transpose only while the row labels still match the row index of `mc2`.
if mc2_logged.index.equals(mc2.index):
    mc2_logged = mc2_logged.T

In [11]:
# Preview the first five rows of the transposed, log-transformed table
mc2_logged.head()

gene,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,RP23-341H6.1,GM26101,GM26851,GM15023,4930513O06RIK,VSIG1,GM16390,GM25207,1110059M19RIK,GM20861
r1_GGCCGCAGTCCG,0.0,2.0,1.0,1.584963,0.0,0.0,0.0,0.0,0.0,2.321928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CTTGTGCGGGAA,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GCGCAACTGCTC,1.0,0.0,0.0,1.584963,0.0,2.321928,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GATTGGGAGGCA,0.0,0.0,1.584963,0.0,0.0,1.0,0.0,0.0,0.0,1.584963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CCTCCTAGTTGG,0.0,1.584963,1.0,1.0,0.0,1.584963,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Boolean table: True wherever the log-transformed value is non-zero
mc2_logged_nonzero = mc2_logged.ne(0)
mc2_logged_nonzero.head(10)

gene,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,RP23-341H6.1,GM26101,GM26851,GM15023,4930513O06RIK,VSIG1,GM16390,GM25207,1110059M19RIK,GM20861
r1_GGCCGCAGTCCG,False,True,True,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
r1_CTTGTGCGGGAA,False,False,True,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
r1_GCGCAACTGCTC,True,False,False,True,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
r1_GATTGGGAGGCA,False,False,True,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
r1_CCTCCTAGTTGG,False,True,True,True,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
r1_AGTCAAGCCCTC,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
r1_GTGCCGCCTCTC,False,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
r1_CCTGTGACACAC,True,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
r1_AATCTCGTTAAT,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
r1_GATTTCCTCTGA,False,False,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Persist the boolean table next to the raw data.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
mc2_logged_nonzero.to_hdf(hdf_file, key='mc2_logged_nonzero_boolmat')

In [20]:
# A row is flagged only if it has strictly more than this many non-zero entries
MIN_GENES_DETECTED = 900

# Each True in `mc2_logged_nonzero` marks a non-zero value, so the row-wise
# sum counts the non-zero entries per row; compare that count to the threshold.
cells_gt_900 = mc2_logged_nonzero.sum(axis=1) > MIN_GENES_DETECTED
cells_gt_900.head(200)


r1_GGCCGCAGTCCG    True
r1_CTTGTGCGGGAA    True
r1_GCGCAACTGCTC    True
r1_GATTGGGAGGCA    True
r1_CCTCCTAGTTGG    True
r1_AGTCAAGCCCTC    True
r1_GTGCCGCCTCTC    True
r1_CCTGTGACACAC    True
r1_AATCTCGTTAAT    True
r1_GATTTCCTCTGA    True
r1_TGTCCATGCGTA    True
r1_GAAGGCTGGAAC    True
r1_TTCCGGCTGTAC    True
r1_TCGACTTCTTCT    True
r1_TGTGGATCTGCT    True
r1_GGGCCACGATCA    True
r1_GATTACTGAGCT    True
r1_CGAGATATAGCA    True
r1_GGGTGTCAGTGG    True
r1_TGCGAGAGCTTG    True
r1_GCTTCTTGTCCT    True
r1_GTTTATATGCGC    True
r1_TCTTCACTGGCT    True
r1_CTCCACGCCTTT    True
r1_ATCTTTCCCGTG    True
r1_TCATTTAGTCGA    True
r1_GCGCCGAGATGT    True
r1_TAGATTATTCAT    True
r1_TCAACACGCTCT    True
r1_GTCTATTCGGTT    True
                   ... 
r1_ATGACACTGGGT    True
r1_CCAACAGTCCTT    True
r1_CTTAGTGACATA    True
r1_CTCTGTCGTAAA    True
r1_GTGATACAGAGA    True
r1_GAGCGGTTAGGC    True
r1_CAGATCCGACAA    True
r1_TGATAGCTTTAA    True
r1_TTCCACGCGCAG    True
r1_ACTAAGGCGTAA    True
r1_GTGATTATGTAT 

In [24]:
# Persist the per-row boolean flags, then report how many rows were evaluated.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
cells_gt_900.to_hdf(hdf_file, key='cells_gt_900_boolmat')
cells_gt_900.shape

(49300,)

In [112]:
# Show the first 200 flags (pandas abbreviates long Series output)
cells_gt_900.head(200)

r1_GGCCGCAGTCCG    True
r1_CTTGTGCGGGAA    True
r1_GCGCAACTGCTC    True
r1_GATTGGGAGGCA    True
r1_CCTCCTAGTTGG    True
r1_AGTCAAGCCCTC    True
r1_GTGCCGCCTCTC    True
r1_CCTGTGACACAC    True
r1_AATCTCGTTAAT    True
r1_GATTTCCTCTGA    True
r1_TGTCCATGCGTA    True
r1_GAAGGCTGGAAC    True
r1_TTCCGGCTGTAC    True
r1_TCGACTTCTTCT    True
r1_TGTGGATCTGCT    True
r1_GGGCCACGATCA    True
r1_GATTACTGAGCT    True
r1_CGAGATATAGCA    True
r1_GGGTGTCAGTGG    True
r1_TGCGAGAGCTTG    True
r1_GCTTCTTGTCCT    True
r1_GTTTATATGCGC    True
r1_TCTTCACTGGCT    True
r1_CTCCACGCCTTT    True
r1_ATCTTTCCCGTG    True
r1_TCATTTAGTCGA    True
r1_GCGCCGAGATGT    True
r1_TAGATTATTCAT    True
r1_TCAACACGCTCT    True
r1_GTCTATTCGGTT    True
                   ... 
r1_ATGACACTGGGT    True
r1_CCAACAGTCCTT    True
r1_CTTAGTGACATA    True
r1_CTCTGTCGTAAA    True
r1_GTGATACAGAGA    True
r1_GAGCGGTTAGGC    True
r1_CAGATCCGACAA    True
r1_TGATAGCTTTAA    True
r1_TTCCACGCGCAG    True
r1_ACTAAGGCGTAA    True
r1_GTGATTATGTAT 

In [117]:
# Inspect the HDF5 file, then release the handle: the context manager closes
# the store on exit, so no file handle is left open after this cell.
with hdfstore(hdf_file) as store:
    print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: /Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/mc_hd5.h5
/cells_gt_900_boolmat                  series       (shape->[49300])      
/coi                                   frame        (shape->[13155,24658])
/coi_false                             frame        (shape->[13155,24658])
/coi_true                              frame        (shape->[36145,24658])
/goi                                   series       (shape->[24658])      
/mc2_logged_nonzero_boolmat            frame        (shape->[49300,24658])
/mccarroll_data                        frame        (shape->[24658,49300])

In [22]:
# store.remove('cells_gt_900')

In [36]:
# Keep only the rows flagged True in `cells_gt_900`
coi = mc2_logged.loc[cells_gt_900]
print(coi.shape)
coi.head()

(13155, 24658)


gene,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,RP23-341H6.1,GM26101,GM26851,GM15023,4930513O06RIK,VSIG1,GM16390,GM25207,1110059M19RIK,GM20861
r1_GGCCGCAGTCCG,0.0,2.0,1.0,1.584963,0.0,0.0,0.0,0.0,0.0,2.321928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CTTGTGCGGGAA,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GCGCAACTGCTC,1.0,0.0,0.0,1.584963,0.0,2.321928,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GATTGGGAGGCA,0.0,0.0,1.584963,0.0,0.0,1.0,0.0,0.0,0.0,1.584963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CCTCCTAGTTGG,0.0,1.584963,1.0,1.0,0.0,1.584963,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Persist the filtered table.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
coi.to_hdf(hdf_file, key='coi')

In [None]:
# Display the HDFStore handle bound to `store`
store

In [48]:
# Shape of the filtered table (only the last expression of a cell is displayed)
coi.shape

(13155, 24658)

In [52]:
# Rows whose flag in `cells_gt_900` is False. `~` inverts the boolean Series
# in one vectorized step instead of looping over every flag in Python.
coi_false = mc2_logged.loc[~cells_gt_900]
print(coi_false.shape)
coi_false.head()

(36145, 24658)


gene,KITL,TMTC3,CEP290,4930430F08RIK,1700017N19RIK,MGAT4C,RASSF9,LRRIQ1,ADGB,SLC6A15,...,RP23-341H6.1,GM26101,GM26851,GM15023,4930513O06RIK,VSIG1,GM16390,GM25207,1110059M19RIK,GM20861
r1_GTCGAGTCCCCC,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CAAACCCAGCCT,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_CAATGATGGTTT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_GACGCCTTCTTT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
r1_ACACACGTGTTT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Number of non-zero entries in each row of `coi_false`
false_sum = coi_false.ne(0).sum(axis=1)
false_sum.head(20)

NameError: name 'coi_false' is not defined

In [109]:
# Smallest and largest per-row count among the rows in `coi_false`
print(false_sum.min(), false_sum.max())

28 900


In [100]:
# Rows whose flag in `cells_gt_900` is True; the boolean Series is used
# directly as the row selector, with no need to copy it into a list first.
coi_true = mc2_logged.loc[cells_gt_900]
print(coi_true.shape)

(13155, 24658)


In [101]:
# Number of non-zero entries in each row of `coi_true`
true_sum = coi_true.ne(0).sum(axis=1)
true_sum.head(20)

r1_GGCCGCAGTCCG    7243
r1_CTTGTGCGGGAA    6933
r1_GCGCAACTGCTC    6397
r1_GATTGGGAGGCA    5740
r1_CCTCCTAGTTGG    5779
r1_AGTCAAGCCCTC    5221
r1_GTGCCGCCTCTC    4846
r1_CCTGTGACACAC    5118
r1_AATCTCGTTAAT    4947
r1_GATTTCCTCTGA    4898
r1_TGTCCATGCGTA    5184
r1_GAAGGCTGGAAC    5197
r1_TTCCGGCTGTAC    4775
r1_TCGACTTCTTCT    4571
r1_TGTGGATCTGCT    4863
r1_GGGCCACGATCA    4651
r1_GATTACTGAGCT    4591
r1_CGAGATATAGCA    4518
r1_GGGTGTCAGTGG    4716
r1_TGCGAGAGCTTG    4078
dtype: int64

In [108]:
# Smallest and largest per-row count among the rows in `coi_true`
print(true_sum.min(), true_sum.max())

901 8363


In [1]:
# Separate HDF5 file for the two row subsets
coi_file = '/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/coi_data.h5'
# Store each subset under the key that matches its contents. The keys were
# previously swapped, which saved the True subset as 'coi_false' and vice versa.
coi_true.to_hdf(coi_file, key='coi_true')
coi_false.to_hdf(coi_file, key='coi_false')

NameError: name 'coi_true' is not defined

In [124]:
# Open the main HDF5 store and display it.
# NOTE(review): the handle is left open here; the following cell calls store.remove() on it.
store = hdfstore(hdf_file)
store

<class 'pandas.io.pytables.HDFStore'>
File path: /Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/mc_hd5.h5
/cells_gt_900_boolmat                  series       (shape->[49300])      
/coi                                   frame        (shape->[13155,24658])
/goi                                   series       (shape->[24658])      
/mc2_logged_nonzero_boolmat            frame        (shape->[49300,24658])
/mccarroll_data                        frame        (shape->[24658,49300])

In [123]:
# Drop the two subset tables from the store. Keys that are already absent are
# skipped, so re-running the cell does not raise KeyError.
for stale_key in ('coi_false', 'coi_true'):
    if stale_key in store:
        store.remove(stale_key)

In [127]:
# Path of a second HDF5 file; the bare name on the last line echoes it
newfile = '/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/new_hd5.h5'
newfile

'/Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/new_hd5.h5'

In [128]:
# Open an HDFStore on `newfile`.
# NOTE(review): the handle is not closed in this cell — confirm it is released before other code writes to the same file.
store2 = hdfstore(newfile)

In [129]:
# Show the store, then close the handle so the to_hdf calls that write to
# `newfile` below do not run while this handle is still open.
print(store2)
store2.close()

<class 'pandas.io.pytables.HDFStore'>
File path: /Volumes/Samsung_T3/CSHL_SingleCell16/dropSeq_data/new_hd5.h5
Empty

In [130]:
# Save the log-transformed table into the new file.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
mc2_logged.to_hdf(newfile, key='mc2_logged')

In [131]:
# Save the per-row boolean flags into the new file.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
cells_gt_900.to_hdf(newfile, key='cells_gt_900_boolmat')

In [132]:
# Save the filtered table into the new file.
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
coi.to_hdf(newfile, key='coi')