Load required packages and data files for further analysis.

In [17]:
%pwd

'/Users/kangb3/github/project_spring_2020_dup'

In [18]:
from class_setup import create_package_dir
create_package_dir("bk_packages")

Starting in /Users/kangb3/github/project_spring_2020_dup
Removing old directory...
Creating /Users/kangb3/github/project_spring_2020_dup/bk_packages...
The current working directory is now /Users/kangb3/github/project_spring_2020_dup/bk_packages


In [19]:
from pathlib import Path
package_name = "bk_packages"
Path('tests').mkdir()
python_dir = Path(package_name)
python_dir.mkdir()
(python_dir / '__init__.py').touch()
Path('setup.py').touch()
Path('LICENSE').touch()
Path('README.md').touch()

In [20]:
%pwd

'/Users/kangb3/github/project_spring_2020_dup/bk_packages'

In [21]:
%%writefile setup.py
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="bk_pakages", 
    version="0.0.1",
    author="byunghyun_kang",
    author_email="danjong99@gmail.com",
    description="A small python package which downsizes input scRNAseq matrix",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/danjong99/project_spring_2020",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',

)

Overwriting setup.py


In [22]:
%%writefile README.md
# Single Cell Random Drawing

This python package will enable for users to reduce the size of input scRNAseq matrix by random selection of a given number of samples within pre-defined clusters and averaging the gene expression value of the chosen cells.

# Input
  1. Expression matrix of scRNAseq (gene by cells)
  2. Metadata (cell ids and cluster or cell type info)
  3. For visualization, pre-calculated tSNE or UMAP coordinates can be tossed along with meatadata

# Returns
  1. shrunken matrix
  2. shrunken metadata
  3. cell.ids of randomly selected cells
  4. tSNE plot, if the coordinates were given.

# The benefit of downsizing cells with averaging the exprssion levels of gene?
  1. downsizing itself reduce the calculation burden --> reducing calculation time
  2. Increase the resolution of differential gene expressin.

(e.g. assume that you have 400 cells in cluster 1 of your scRNAseq data, and if you set the number of cells to be chosen as 20, then 20 cells from cluster 1 of your data will be randomly and repeatedly selected with no replacement. This process will downsize the cluster 1 from 400 cells to 20 cells with better gene expression resolution.)

Overwriting README.md


In [23]:
%%writefile LICENSE
Copyright (c) 2018 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Overwriting LICENSE


In [24]:
%pwd

'/Users/kangb3/github/project_spring_2020_dup/bk_packages'

Defined Functions Needed For the Following Analysis

In [25]:
%%writefile bk_packages/basic_methods.py

def clusTocell_dic_generator(meta):
    """Return {cluster:cell.ids} dictionary """
    temp = [meta[i][2] for i in range(0,len(meta))] # extract cell cluster info
    unique_cluster = unique(temp[1:])         # take unique cluster info
    unique_cluster = sorted(unique_cluster)
    
    meta_by_cluster = []
    meta_dic = {}
    
    ## group cell.ids based on their cluster
    for cluster in unique_cluster:
        index = [i for i, n in enumerate(temp) if n == cluster] # extract the indices of rows in a given cluster
        a = [meta[i][1:3] for i in index] # using the indices, extract cell.id and assigned cluster info.
        meta_by_cluster.append(a)
    
    # convert the list acquired above to dictionary with the cluster as keys
    for j in range(0, len(unique_cluster)):
        meta_dic[ meta_by_cluster[j][0][1] ] = [meta_by_cluster[j][i][0] for i in range(0, len(meta_by_cluster[j]))]
    
    return(meta_dic)

def cellTogene_dic_generator(x):
    """Apply T-transformation and convert the list of lists to Dictionary"""
    """with cell.id as keys and gene expression as value"""
    mtx_t = t(x)
    mtx_dic = {}
    
    for line in mtx_t:
        mtx_dic[line[0]] = line[1:]
    
    return(mtx_dic)

def random_selector(meta_dic, num_to_select = 20):
    
    """Random selection of cell.ids from each cluster"""
    
    import random
    import math
        
    dic_random_items = {}
    num_to_select = num_to_select  # this will be determined by user.
    
    for key in meta_dic.keys():
        
        clust_cell_ids = meta_dic[key].copy()
        
        # determine the number of cycle to run in the following loop.
        if len(meta_dic[key]) < num_to_select:
            print("Number of items to select exceeds the number of items in the list.")
            print("Remember this is random selection without replacement.")
            cycle_to_run = math.floor(len(meta_dic[key])/num_to_select)
        elif len(meta_dic[key])%num_to_select == 0:
            cycle_to_run = int(len(meta_dic[key])/num_to_select)
        else:
            cycle_to_run = math.ceil(len(meta_dic[key])/num_to_select)
    
        for i in range(0, cycle_to_run): # random sampling from the given list without replacement
            if len(clust_cell_ids) > num_to_select:
                a = random.sample(clust_cell_ids, num_to_select)
                dic_random_items[ a[0] ] = a  # takes the first cell.id among selected ones as a key.
                [clust_cell_ids.remove(a[i]) for i in range(0,len(a))] # selected cell.ids are removed from the dic.
            elif len(clust_cell_ids) < num_to_select: # the remaining cells are taken 
                a = clust_cell_ids
                dic_random_items[ a[0] ] = a
    
    return(dic_random_items)

def geneAverage(mtx_dic, dic_random_items):
    mtx_small = {}
    a = mtx_dic['gene'].copy()
    mtx_small['gene'] = a
    
    for key, values in dic_random_items.items():
        temp = []
        a = []
        for value in values:
            a = mtx_dic[ value ].copy()
            temp.append(convert_to_float(a))
        mtx_small[ key ] = rowMean(temp)
    
    [v.insert(0,k) for k, v in mtx_small.items()]
    temp_2 = [x for x in mtx_small.values()]
    
    final_mtx = []
    temp = t(temp_2)
    final_mtx = temp.copy()
    
    return(final_mtx)

def t(mtx):
    """T-transform the Input Matrix"""
    try:
        if type(mtx) is str: #if the input is str, raise exception.
            raise Exception('Error: the input should be a list of lists.')
        
        for i in range(0, len(mtx)):
            assert len(mtx[i]) == len(mtx[i-1]) # check whether each list has the same length.
        
        ## Execution code
        mtx_t = []
        for j in range(0, len(mtx[0])):
            mtx_t.append( [mtx[i][j] for i in range(0,len(mtx))] )
        return(mtx_t)
    
    except TypeError as detail: # Control TypeEorror.
        return ('Error: the input should be a list of lists.')
    except AssertionError as detail:
        return ('Error: length of the lists should be the same')

def convert_to_float(input_list):
    """Convert list of str to list of float"""
    try:
        list_float = [float(i) for i in input_list]
        return(list_float)
    except ValueError as detail:
        return ("input is not convertible to float.")

def unique(input_list):
    """Return unique value of the input list"""  
    try:
        # intilize a null list 
        unique_list = [] 
        # traverse for all elements 
        for x in input_list: 
            # check if exists in unique_list or not 
            if x not in unique_list: 
                unique_list.append(x)
        return(unique_list)
    except TypeError as detail:
        return ("int object is not iterable")

def rowSum(mtx):
    """Return all row-sums as a list"""
    try:
        for i in range(0, len(mtx)):
            assert len(mtx[i]) == len(mtx[i-1]) # check whether each list has the same length.
        
        res = list()
        for j in range(0, len(mtx[0])): 
            tmp = 0
            for i in range(0, len(mtx)): 
                tmp = tmp + mtx[i][j]
            res.append(tmp)
        return(res)
    
    except AssertionError as detail:
        return ('Length of lists is irregular or input format is wrong.')
    except TypeError as detail:
        return ('Undefined operand type')

def rowMean(mtx):
    """Return all row-sums as a list"""
    try:
        for i in range(0, len(mtx)):
            assert len(mtx[i]) == len(mtx[i-1]) # check whether each list has the same length.
        
        res = list()
        for j in range(0, len(mtx[0])): 
            tmp = 0
            for i in range(0, len(mtx)): 
                tmp = tmp + mtx[i][j]
            res.append(tmp/len(mtx))
        return(res)
    
    except AssertionError as detail:
        return ('Length of lists is irregular or input format is wrong.')
    except TypeError as detail:
        return ('Undefined operand type')

Writing bk_packages/basic_methods.py


In [39]:
%%writefile tests/test_basics.py

from bk_packages.basic_methods import *

def test_t():
    input_list = [[1,2,3],[4,5,6],[7,8,9],[10,11,12]]
    obs = t(input_list)
    exp = [[1, 4, 7, 10], [2, 5, 8, 11], [3, 6, 9, 12]]
    assert obs == exp

def test_convertTofloat():
    input_list = ['1','2','3','4','5','6','7','8','9','10']
    obs = convert_to_float(input_list)
    exp = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    assert obs == exp

def test_unique():
    input_list = [1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,8,9,9,9,9,9,9,10]
    obs = unique(input_list)
    exp = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    assert obs == exp

def test_rowSum():
    input_list = [[1,2,3],[4,5,6],[7,8,9],[10,11,12]]
    obs = rowSum(input_list)
    exp = [22, 26, 30]
    assert obs == exp

def test_rowMean():
    input_list = [[1,2,3],[4,5,6],[7,8,9],[10,11,12]]
    obs = rowMean(input_list)
    exp = [5.5, 6.5, 7.5]
    assert obs == exp

Overwriting tests/test_basics.py


In [37]:
%pwd
%cd ..

/Users/kangb3/github/project_spring_2020_dup/bk_packages


In [38]:
%%writefile bk_packages/main.py

def main():

    import os
    import pandas as pd
    from matplotlib import pyplot as plt
    import seaborn as sns
    from bk_packages.basic_methods import *
    
    DATA_FOLDER="/Users/kangb3/github/project_spring_2020_dup/data/import/"
    EXP_FNAME = os.path.join(DATA_FOLDER, "exprs_mtx.txt")
    META_FNAME = os.path.join(DATA_FOLDER, "metatable.txt")
    
    #### Read Meta Files - read as list of lists
    meta = []
    with open(META_FNAME) as f_obj:
        for line in f_obj:
            res = line.replace('"','').strip('\n').split('\t')
            meta.append(res)
    
    ### Insert one element in the 1th list of the meta data
    meta[0].insert(0,'rowname')
    
    ### Read matrix file - read as list of lists
    mtx = []
    with open(EXP_FNAME) as f_obj:
        for line in f_obj:
            res = line.replace('"','').strip('\n').split('\t')
            mtx.append(res)
    
    ### Insert one element in the 1th list of the mtx
    mtx[0].insert(0,'gene')

if __name__ == '__main__':
    main()

Overwriting bk_packages/main.py


In [2]:
%pwd
%cd ./bk_packages/

/Users/kangb3/github/project_spring_2020_dup/bk_packages


In [3]:
!pip install -e .

Obtaining file:///Users/kangb3/github/project_spring_2020_dup/bk_packages
Installing collected packages: bk-pakages
  Attempting uninstall: bk-pakages
    Found existing installation: bk-pakages 0.0.1
    Uninstalling bk-pakages-0.0.1:
      Successfully uninstalled bk-pakages-0.0.1
  Running setup.py develop for bk-pakages
Successfully installed bk-pakages


In [4]:
%pwd
%cd ./tests/

/Users/kangb3/github/project_spring_2020_dup/bk_packages/tests


In [5]:
!pytest

platform darwin -- Python 3.7.1, pytest-4.1.0, py-1.7.0, pluggy-0.8.0
rootdir: /Users/kangb3/github/project_spring_2020_dup/bk_packages, inifile:
plugins: remotedata-0.3.1, openfiles-0.3.1, doctestplus-0.2.0, dependency-0.4.0, arraydiff-0.2
[1mcollecting ... [0m[1mcollected 5 items                                                              [0m

test_basics.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[36m                                                     [100%][0m



In [26]:
%cd ..

/Users/kangb3/github/project_spring_2020_dup/bk_packages


In [6]:
import bk_packages

In [7]:
meta

NameError: name 'meta' is not defined

In [8]:
meta_dic = clusTocell_dic_generator(meta)
mtx_dic = cellTogene_dic_generator(mtx)
dic_random_items = random_selector(meta_dic=meta_dic, num_to_select=15)
final_mtx = geneAverage(dic_random_items=dic_random_items, mtx_dic=mtx_dic)

NameError: name 'clusTocell_dic_generator' is not defined

In [58]:
final_mtx[2]

['0610007P14Rik',
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.13279030834039798,
 0.0,
 0.0,
 0.0,
 0.0,
 0.39762926258357467,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.17995470510475267,
 0.0,
 0.0,
 0.0,
 0.3218082351227,
 0.0,
 0.21593752305254535,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.21850198123733133,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.17595572972154935,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2036064313124027,
 0.0,
 0.19401566009303867,
 0.0,
 0.13289973754951265,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.191971737677118,
 0.0,
 0.0,
 0.0,
 0.21785081230046,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1713175229662053,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1591371344276727,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.11417374086

In [None]:
    ## Export result matrix as txt file
    with open("/Users/kangb3/github/project_spring_2020_dup/data/export/mtx_small.txt", "w") as file:
    for lines in final_mtx:
        for line in lines:
            file.write(str(line) + "\t")
        file.write("\n")
    
    ## Export the list of randomly selected cells and representative cells.
    import csv
    
    with open('/Users/kangb3/github/project_spring_2020_dup/data/export/randomcells.csv', 'w') as f:
        for key in dic_random_items.keys():
            f.write("%s, %s\n" % (key, dic_random_items[key]))
    
    for k, v in dic_random_items.items():
        print( "Represeting Cell ID: " + k + " - Assigned Cluster: " + str(meta_by_clus_dic[k]) + " - Sample Number: " + str(len(v)) )


In [54]:
meta_by_clus_temp = []
for i in range(0, len(meta_by_cluster)):
    for j in range(0, len(meta_by_cluster[i])):
        meta_by_clus_temp.append(meta_by_cluster[i][j])

meta_by_clus_dic = {}
for i in range(0, len(meta_by_clus_temp)):
    meta_by_clus_dic[ meta_by_clus_temp[i][0] ] = meta_by_clus_temp[i][1]

for k, v in dic_random_items.items():
    print( "Represeting Cell ID: " + k + " - Assigned Cluster: " + str(meta_by_clus_dic[k]) + " - Sample Number: " + str(len(v)) )

NameError: name 'meta_by_clus_dic' is not defined