In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install scrapy --quiet
! pip install scrapydo --quiet
! pip install scrapy-user-agents --quiet
#! pip install selenium

## $\color{Yellow}{\text{"scRNA-seq"[All Fields] AND "lung"[All Fields] AND "Homo sapiens"[Organism] }}$ :
https://www.ncbi.nlm.nih.gov/gds/?term=%22scRNA-seq%22%5BAll+Fields%5D+AND+%22lung%22%5BAll+Fields%5D+AND+%22Homo+sapiens%22%5BOrganism%5D

### Gene Symbol/ID: 
### Alias:
### External ID: 
### Ensembl:    
### Entrez: 
### Gene Type: 
### Description:

### Synonyms:
### Description:
### Chromosome:
### Database Reference (with hyperlinks):
### See related (with hyperlinks):
### Dataset (with hyperlinks):

In [None]:
import scrapy
import scrapydo
import logging
#from selenium import webdriver

scrapydo.setup()

#scRNA-seq databases
class scRNAItem(scrapy.Item):
    """Container for one record scraped from an NCBI ``/gds`` result page.

    Every key assigned by ``scRNASpider`` has to be declared here as a
    ``Field``; assigning an undeclared key to a ``scrapy.Item`` raises
    ``KeyError``.
    """
    Accession_id = scrapy.Field() # gene ID/ symbol (set by parse_display)
    title = scrapy.Field() # title
    alias = scrapy.Field() # gene alias
    ensembl_id = scrapy.Field() # ensembl id
    gene_type = scrapy.Field() #gene type
    description = scrapy.Field() #gene description
    authors = scrapy.Field() # researchers
    reference = scrapy.Field() # database reference
    accession_id = scrapy.Field() # accession text of a search hit (set by parse_keywrd)
    summary = scrapy.Field() # stripped text of a search hit's "supp" block (set by parse_keywrd)
    
class scRNASpider(scrapy.Spider):
    """Crawl the result pages of one NCBI ``/gds`` keyword search.

    ``start_requests`` issues the search and ``parse_keywrd`` turns every
    result row into an ``scRNAItem``. Up to ``max_pages`` pages are followed
    through the link whose element id is ``amore``. ``parse_display`` is an
    alternative row parser that is not wired to any request in this class.
    """
    name = 'scRNA-seq'
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        # The setting Scrapy reads is ROBOTSTXT_OBEY; the former 'ROBOTSTXT'
        # key was not a recognised setting name.
        'ROBOTSTXT_OBEY': False,
        'COOKIES_ENABLED': False,
        # Request pacing is left to Scrapy. The former time.sleep() calls in
        # the callbacks used modules (time, random) the cell never imported
        # and would have blocked the crawl while sleeping.
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
        },
        
        'FEEDS': {
                '%(name)s.csv': {
                    'format': 'csv',
                }
        }
    }
    page_index = 1
    # Highest page number that will be requested (previously a literal 5).
    max_pages = 5
    # One node per search hit on a result page.
    _RESULT_XPATH = '//div[@id="maincontent"]//div[@class="content"]//div[@class="rprt"]'
    
    def start_requests(self):
        """Yield the single request that opens the keyword search."""
        url = 'https://www.ncbi.nlm.nih.gov/gds/?term=%22single-cell+RNA-sequencing%22%5BAll+Fields%5D+OR+%22scRNA-seq%22%5BAll+Fields%5D+AND+%22Homo+sapiens%22%5BOrganism%5D+AND+%22lung%22%5BAll+Fields%5D'

        yield scrapy.Request(url=url,callback=self.parse_keywrd,dont_filter=True)
    
    def parse_keywrd(self,response):
        """Yield one ``scRNAItem`` per search hit, then the next-page request.

        Each hit fills ``title``, ``accession_id``, ``summary`` (lists of
        strings) and ``reference`` (first ``href`` in the hit, or ``None``).
        """
        print('Processing: '+response.url)
        rows = response.xpath(self._RESULT_XPATH)
        self.logger.debug('%d result rows on %s', len(rows), response.url)
        for whole in rows:
            # A fresh item per row: re-using one instance would hand the same
            # mutable object to the exporter for every row.
            wholeitem = scRNAItem()
            wholeitem['title'] = whole.xpath('.//p[@class="title"]/text()').extract() #title of each db article
            wholeitem['accession_id'] = whole.xpath('.//div[@class="supp"]//dd[@class="lng_ln"]/text()').extract() #accession_id for downstream scraping
            wholeitem['summary'] = [text.strip() for text in whole.xpath('.//div[@class="supp"]/text()').extract()] #summary
            wholeitem['reference'] = whole.xpath('.//a/@href').extract_first()
            yield wholeitem

        yield from self._next_page_requests(response, self.parse_keywrd)
            
    def parse_display(self,response):
        """Yield one ``scRNAItem`` per row using the "display" page layout.

        Fills ``title``, ``Accession_id``, ``authors`` (first author link text
        or ``None``), ``description`` and ``reference``. ``alias``,
        ``ensembl_id`` and ``gene_type`` are left unset because nothing on the
        page is extracted for them.
        """
        print('Processing: '+response.url)
        rows = response.xpath(self._RESULT_XPATH)
        self.logger.debug('%d result rows on %s', len(rows), response.url)
        for whole in rows:
            wholeitem = scRNAItem()
            wholeitem['title'] = whole.xpath('.//p[@class="title"]/text()').extract()
            wholeitem['Accession_id'] = whole.xpath('.//div[@class="supp"]//dd[@class="lng_ln"]/text()').extract()
            # extract_first() returns None instead of raising when a row has no author link.
            wholeitem['authors'] = whole.xpath('.//p[@class="source"]//a/text()').extract_first()
            wholeitem['description'] = [text.strip() for text in whole.xpath('.//div[@class="contson"]/text()').extract()]
            wholeitem['reference'] = whole.xpath('.//a/@href').extract_first()
            yield wholeitem

        yield from self._next_page_requests(response, self.parse_display)

    def _next_page_requests(self, response, callback):
        """Yield the request for the following result page, if any.

        Stops once ``page_index`` reaches ``max_pages`` or when the page has
        no ``amore`` link. The follow-up page is parsed by *callback*, i.e.
        the same method that parsed the current page.
        """
        if self.page_index >= self.max_pages:
            return
        next_page = response.xpath('//*[@id="amore"]/@href').extract_first()
        if not next_page:
            self.logger.info('No next-page link found on %s', response.url)
            return
        # Resolve relative hrefs against the current page.
        next_page = response.urljoin(next_page)
        print("Next Page Url is: "+next_page)
        self.page_index += 1
        yield scrapy.Request(url=next_page,callback=callback,dont_filter=True)
            
class SeleniumMiddleware(object):
    """Own a Firefox WebDriver instance for the lifetime of the object.

    NOTE(review): ``FirefoxProfile``, ``webdriver`` and ``WebDriverWait`` are
    not imported in the visible cells (the selenium import is commented out),
    so ``__init__`` raises ``NameError`` until those imports are restored.
    None of the spiders' ``DOWNLOADER_MIDDLEWARES`` in view reference this
    class.
    """

    def __init__(self,timeout=25):
        """Start Firefox and prepare an explicit-wait helper.

        Args:
            timeout: Seconds used for both the page-load timeout and the
                ``WebDriverWait`` created here.
        """
        profile = FirefoxProfile()
        # presumably 2 disables image loading — TODO confirm against Firefox prefs
        profile.set_preference('permissions.default.image', 2)
        self.browser = webdriver.Firefox(profile)
        self.timeout = timeout
        self.browser.maximize_window()
        # # self.browser.implicitly_wait(20)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)
        
    def __del__(self):
        """Close the browser if one was started.

        ``__del__`` also runs when ``__init__`` failed before ``self.browser``
        was assigned, so the attribute is looked up defensively.
        """
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.close()
    
    
        
# Run the keyword-search spider; its FEEDS setting targets '%(name)s.csv' in csv format.
scrapydo.run_spider(scRNASpider)

# $\color{Yellow}{\text{"scRNA-seq" [All Fields] OR "single-cell RNA-seq" [All Fields] OR "single-cell RNA sequencing" [All Fields] AND "lung"[All Fields] AND "Homo sapiens"[Organism]  }}$

https://www.ncbi.nlm.nih.gov/gds/?term=%22scRNA-seq%22+%5BAll+Fields%5D+OR+%22single-cell+RNA-seq%22+%5BAll+Fields%5D++OR+%22single-cell+RNA+sequencing%22+%5BAll+Fields%5D+AND+%22lung%22%5BAll+Fields%5D+AND+%22Homo+sapiens%22%5BOrganism%5D
# $\color{Yellow}{\text{"single-cell RNA sequencing" [All Fields] AND "lung"[All Fields] AND "Homo sapiens"[Organism] }}$
https://www.ncbi.nlm.nih.gov/gds/?term=%22single-cell+RNA+sequencing%22+%5BAll+Fields%5D+AND+%22lung%22%5BAll+Fields%5D+AND+%22Homo+sapiens%22%5BOrganism%5D


# $\color{Yellow}{\text{"single-cell RNA-seq" [All Fields] AND "cardiovascular"[All Fields] AND "Homo sapiens"[Organism]  }}$


### Title: 
### Organisms:
### Protocol:
### Published/Updated Year: 
### Summary: 
### Contributors:    
### Download_URL: 
### GSM_Samples: 
### Reference Publication: 
Note: the protocol is not yet extracted as a separate field, but it is included in the Summary.

In [None]:
import scrapy
import scrapydo
import logging
#from selenium import webdriver

input_gds = '../input/gds-lung-three-naming/'
# Presumably one search term (gds identifier) per line — TODO confirm file layout.
with open(input_gds+'gds_result_three_scRNA.txt','r') as file:
    # Strip every line: the trailing newline kept by readlines() would
    # otherwise be formatted into each request URL.
    gds = [line.strip() for line in file]
# Skip the first 901 lines (same positional slice as before), then drop
# blank lines so no request is issued for an empty term.
gds = [term for term in gds[901:] if term]

#print(gds)

scrapydo.setup()

#scRNA-seq databases
class gseItem(scrapy.Item):
    """Fields collected by ``scRNAlungSpider`` for one GSE page."""
    accession = scrapy.Field() # text after '=' in the GSE link found on the gds result page
    date = scrapy.Field() # published/updated date (cells following the "Status" label)
    title = scrapy.Field() # title
    organism = scrapy.Field() # organism: should be Homo sapiens
    summary = scrapy.Field() # summary of each GSE
    contributors = scrapy.Field() # contributors of each GSE
    download_url = scrapy.Field() # download link(s): "(ftp)" hrefs, else "(http)" hrefs
    download = scrapy.Field() # download filename(s): cell text containing '<accession>_'
    #gsm = scrapy.Field() #gsm download url
    citation = scrapy.Field() # PubMed URLs built from the page's pubmed_id spans
    overall_design = scrapy.Field() # cells following the "Overall design" label
    
class scRNAlungSpider(scrapy.Spider):
    """Resolve every term in the module-level ``gds`` list to its GSE page.

    Flow: ``start_requests`` searches ``/gds`` for each term, ``parse_gds``
    follows the first title link of the result page, and ``parse_GSE`` fills a
    ``gseItem`` from the GSE page.
    """
    name = 'scRNA-gds-lung-0411'
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        # The setting Scrapy reads is ROBOTSTXT_OBEY; the former 'ROBOTSTXT'
        # key was not a recognised setting name.
        'ROBOTSTXT_OBEY': False,
        'COOKIES_ENABLED': False,
        # Request pacing is left to Scrapy. The former time.sleep() in
        # parse_GSE used modules (time, random) the cell never imported and
        # would have blocked the crawl while sleeping.
        'DOWNLOAD_DELAY': 1.5,
        'DOWNLOADER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
        },

        'FEEDS': {
                '%(name)s.csv': {
                    'format': 'csv',
                }
        }
    }
    page_index = 1

    def start_requests(self):
        """Yield one ``/gds`` search request per non-blank entry of ``gds``."""
        for raw_term in gds:
            # Entries read with readlines() still carry their newline.
            term = raw_term.strip()
            if not term:
                continue
            url = 'https://www.ncbi.nlm.nih.gov/gds/?term={}'.format(term)
            yield scrapy.Request(url=url,callback=self.parse_gds,dont_filter=True)

    def parse_gds(self,response):
        """Follow the first title link of a search result to its GSE page.

        The accession is the part of the link after the first ``=``. Result
        pages without such a link are logged and skipped instead of raising.
        """
        ref = response.xpath('//p[@class="title"]//a/@href').extract_first()
        if not ref or '=' not in ref:
            self.logger.warning('No GSE link found on %s', response.url)
            return
        item = gseItem()
        item['accession'] = ref[ref.index('=')+1:]
        # lstrip avoids a double slash when the href already starts with '/'.
        gse_url = 'https://www.ncbi.nlm.nih.gov/'+ref.lstrip('/')
        yield scrapy.Request(url=gse_url,meta={'item':item},callback=self.parse_GSE,dont_filter=True) #grab the GSE_ID and GSE page

    @staticmethod
    def _row_values(response, label, tail='/text()'):
        """Return the strings selected by *tail* from the nodes that follow the ``<td>`` whose text contains *label*."""
        return response.xpath('//td[contains(text(),"'+label+'")]/following-sibling::node()'+tail).extract()

    def parse_GSE(self,response):
        """Fill the ``gseItem`` passed in ``response.meta`` and yield it.

        ``download_url`` holds the hrefs of the "(ftp)" links; when the page
        has none, the "(http)" hrefs are used with the NCBI host prepended.
        ``citation`` is only set when the page lists PubMed ids.
        """
        item = response.meta['item']

        #backup: //td[contains(text(),"Citation(s)")/following-sibling::node()]
        ref = response.xpath('//span[@class="pubmed_id"]/a/text()').extract()
        accession_file = item['accession']+'_'
        download = response.xpath('//td[contains(text(),"'+accession_file+'")]/text()').extract()

        download_url = response.xpath('//a[contains(text(),"(ftp)")]/@href').extract()
        if not download_url:
            http_url = response.xpath('//a[contains(text(),"(http)")]/@href').extract()
            download_url = ['https://www.ncbi.nlm.nih.gov' + h for h in http_url]

        item['download_url'] = download_url
        item['download'] = download
        item['title'] = self._row_values(response, "Title")
        item['date'] = self._row_values(response, "Status")
        item['overall_design'] = self._row_values(response, "Overall design")
        item['organism'] = self._row_values(response, "Organism", '/a/text()')
        item['summary'] = self._row_values(response, "Summary")
        item['contributors'] = self._row_values(response, "Contributor(s)", '/a/text()')
        if ref:
            item['citation'] = ['https://pubmed.ncbi.nlm.nih.gov/' + r for r in ref]
        yield item
        


# Run the GSE spider; its FEEDS setting targets '%(name)s.csv' in csv format.
scrapydo.run_spider(scRNAlungSpider)
    

        

In [None]:
import scrapy
import scrapydo
import logging
#from selenium import webdriver

input_gds = '../input/gds-different-sets/'
# Presumably one search term (gds identifier) per line — TODO confirm file layout.
with open(input_gds+'gds_lung.txt','r') as file:
    # Strip every line (readlines() keeps the trailing newline, which would
    # end up inside each request URL) and drop blank lines.
    gds = [line.strip() for line in file if line.strip()]
# To resume a partial run, slice here, e.g. gds = gds[1300:].
# The dangling `gds = ` statement that followed was a syntax error and is removed.
#print(gds)

scrapydo.setup()

#scRNA-seq databases
class gseItem(scrapy.Item):
    """Fields collected by ``scRNAlungSpider`` for one GSE page."""
    accession = scrapy.Field() # text after '=' in the GSE link found on the gds result page
    date = scrapy.Field() # published/updated date (cells following the "Status" label)
    title = scrapy.Field() # title
    organism = scrapy.Field() # organism: should be Homo sapiens
    summary = scrapy.Field() # summary of each GSE
    contributors = scrapy.Field() # contributors of each GSE
    download_url = scrapy.Field() # download link(s)
    download = scrapy.Field() # download filename(s): cell text containing '<accession>_'
    #gsm = scrapy.Field() #gsm download url
    citation = scrapy.Field() # PubMed URLs built from the page's pubmed_id spans
    
    
class scRNAlungSpider(scrapy.Spider):
    """Resolve every term in the module-level ``gds`` list to its GSE page.

    Flow: ``start_requests`` searches ``/gds`` for each term, ``parse_gds``
    follows the first title link of the result page, and ``parse_GSE`` fills a
    ``gseItem`` from the GSE page.
    """
    name = 'scRNA-gds-lung-3'
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',
        # The setting Scrapy reads is ROBOTSTXT_OBEY; the former 'ROBOTSTXT'
        # key was not a recognised setting name.
        'ROBOTSTXT_OBEY': False,
        'COOKIES_ENABLED': False,
        # Request pacing is left to Scrapy. The former time.sleep() in
        # parse_GSE used modules (time, random) the cell never imported and
        # would have blocked the crawl while sleeping.
        'DOWNLOAD_DELAY': 3,
        'DOWNLOADER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
        },

        'FEEDS': {
                '%(name)s.csv': {
                    'format': 'csv',
                }
        }
    }
    page_index = 1

    def start_requests(self):
        """Yield one ``/gds`` search request per non-blank entry of ``gds``."""
        for raw_term in gds:
            # Entries read with readlines() still carry their newline.
            term = raw_term.strip()
            if not term:
                continue
            url = 'https://www.ncbi.nlm.nih.gov/gds/?term={}'.format(term)
            yield scrapy.Request(url=url,callback=self.parse_gds,dont_filter=True)

    def parse_gds(self,response):
        """Follow the first title link of a search result to its GSE page.

        The accession is the part of the link after the first ``=``. Result
        pages without such a link are logged and skipped instead of raising.
        """
        ref = response.xpath('//p[@class="title"]//a/@href').extract_first()
        if not ref or '=' not in ref:
            self.logger.warning('No GSE link found on %s', response.url)
            return
        item = gseItem()
        item['accession'] = ref[ref.index('=')+1:]
        # lstrip avoids a double slash when the href already starts with '/'.
        gse_url = 'https://www.ncbi.nlm.nih.gov/'+ref.lstrip('/')
        yield scrapy.Request(url=gse_url,meta={'item':item},callback=self.parse_GSE,dont_filter=True) #grab the GSE_ID and GSE page

    @staticmethod
    def _row_values(response, label, tail='/text()'):
        """Return the strings selected by *tail* from the nodes that follow the ``<td>`` whose text contains *label*."""
        return response.xpath('//td[contains(text(),"'+label+'")]/following-sibling::node()'+tail).extract()

    def parse_GSE(self,response):
        """Fill the ``gseItem`` passed in ``response.meta`` and yield it.

        ``download_url`` holds the hrefs of the "(ftp)" links unchanged; when
        the page has none, the "(http)" hrefs are used with the NCBI host
        prepended. Previously the http fallback sat in an ``except`` branch
        that ``.extract()`` never triggers, and the host was then prepended to
        the "(ftp)" hrefs as well.
        ``citation`` is only set when the page lists PubMed ids.
        """
        item = response.meta['item']

        #backup: //td[contains(text(),"Citation(s)")/following-sibling::node()]
        ref = response.xpath('//span[@class="pubmed_id"]/a/text()').extract()
        accession_file = item['accession']+'_'
        download = response.xpath('//td[contains(text(),"'+accession_file+'")]/text()').extract()

        download_url = response.xpath('//a[contains(text(),"(ftp)")]/@href').extract()
        if not download_url:
            http_url = response.xpath('//a[contains(text(),"(http)")]/@href').extract()
            download_url = ['https://www.ncbi.nlm.nih.gov/' + h.lstrip('/') for h in http_url]

        item['download'] = download
        item['download_url'] = download_url
        item['title'] = self._row_values(response, "Title")
        item['date'] = self._row_values(response, "Status")
        item['organism'] = self._row_values(response, "Organism", '/a/text()')
        item['summary'] = self._row_values(response, "Summary")
        item['contributors'] = self._row_values(response, "Contributor(s)", '/a/text()')
        if ref:
            item['citation'] = ['https://pubmed.ncbi.nlm.nih.gov/' + r for r in ref]
        yield item
        


# Run the GSE spider; its FEEDS setting targets '%(name)s.csv' in csv format.
scrapydo.run_spider(scRNAlungSpider)
    

        

### Next Step: Read in the CSV file as a pandas DataFrame and group by download format
### Pie Chart with specific scRNA-seq collections, Figure 1. ref: https://academic.oup.com/nar/advance-article/doi/10.1093/nar/gkac199/6553688 

In [None]:
# Placeholder cell: prints the constant 1.
print(1)

In [None]:
import pandas as pd
import re

# Load the two scraped GSE exports and tag each row with a download group.
df = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design.csv', sep=',')
df1 = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design_1.csv', sep=',')

#whole non-filtered df
# ignore_index gives every row a unique label; without it both files
# contribute labels 0..n and label-based alignment below becomes ambiguous.
f_df = pd.concat([df,df1], ignore_index=True)

#extract overall design information
od = f_df.dropna(subset = ['overall_design'])

#extract the download file extension and group them by the extensions
# Raw string so '\.' reaches the regex engine as written; non-capturing group
# because str.contains only needs a yes/no answer.
pat = r'^.*\.(?:gz|tar|xlsx|h5|txt|loom)$'
od = od.dropna(subset = ['download'])
print(od)
print("Not null entry counts: ",od['download'].notnull().sum())

# Drop the last row (as before) and copy, so the new column is written to an
# independent frame rather than to a slice of f_df.
od = od.iloc[:-1].copy()
#p_df contains all the downloadable url
p_df = od[od['download'].str.contains(pat, na=False)]

# Second-to-last dot-separated piece of each matching download value.
search = [values.split('.')[-2] for values in p_df['download']]
#filter with proper group
# `search` has one entry per row of p_df, not of od: align on p_df's labels so
# rows whose download value did not match `pat` get NaN instead of the
# assignment failing with a length mismatch.
od['download_group'] = pd.Series(search, index=p_df.index, dtype=object)
#od.to_csv('./organized_GSE_lung.csv')
#d_df = f_df.groupby('download_group')

#o = pd.DataFrame(d_df)
#o = d_df.sum().reset_index()
#print(o)
#o.to_csv('./organized_GSE_lung.csv')
#print(df['download'].astype(str).str.extract(r'^.*\.(gz|tar|)$'))



## Keywords To Search For In Cell, Treatment, Tissue, Disease

In [None]:

import pandas as pd
import re


def process_words(test):
    """Cut *test* into keyword-terminated chunks, grouped by keyword.

    The text is split on whitespace. Whenever a token that is in the
    module-level ``keywords`` is met at any position but the first, the tokens
    since the previous cut (keyword included) are joined with single spaces
    and appended to that keyword's list.

    Returns:
        dict mapping keyword -> list of chunk strings, in order of appearance.
    """
    tokens = test.split()
    grouped = {}
    chunk_start = 0
    for position, token in enumerate(tokens):
        # A keyword in the very first position never closes a chunk.
        if token not in keywords or position == 0:
            continue
        chunk_end = position + 1
        grouped.setdefault(token, []).append(" ".join(tokens[chunk_start:chunk_end]))
        chunk_start = chunk_end
    return grouped

# Load the two scraped GSE exports, tag each row with a download group, then
# reduce each overall_design text to lemmas and cut it into keyword chunks.
df = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design.csv', sep=',')
df1 = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design_1.csv', sep=',')

#whole non-filtered df
# ignore_index gives every row a unique label; without it both files
# contribute labels 0..n and label-based alignment below becomes ambiguous.
f_df = pd.concat([df,df1], ignore_index=True)

#extract overall design information
od = f_df.dropna(subset = ['overall_design'])

#extract the download file extension and group them by the extensions
# Raw string so '\.' reaches the regex engine as written; non-capturing group
# because str.contains only needs a yes/no answer.
pat = r'^.*\.(?:gz|tar|xlsx|h5|txt|loom)$'
od = od.dropna(subset = ['download'])

#remove last element; copy so new columns are written to an independent frame
od = od.iloc[:-1].copy()
#p_df contains all the downloadable url
p_df = od[od['download'].str.contains(pat, na=False)]


# Second-to-last dot-separated piece of each matching download value.
search = [values.split('.')[-2] for values in p_df['download']]

#filter with proper group
# `search` has one entry per row of p_df, not of od: align on p_df's labels so
# non-matching rows get NaN instead of the assignment failing on length.
od['download_group'] = pd.Series(search, index=p_df.index, dtype=object)

import spacy

# Load spacy model
nlp = spacy.load('en_core_web_sm')  

#keywords to consider in NLP
# NOTE(review): process_words compares whitespace-split tokens, so the
# two-word entry 'scRNA seq' cannot match any single token — confirm intent.
keywords = [
    'cell','tissue','disease','treatment','hour','protocol',
    'biopsy','patient','scRNA seq','transcriptomic'
]

# Parts of speech kept when reducing a design text to its lemmas.
_KEPT_POS = ["NOUN","PROPN","NUM","ADV"]


def _design_chunk(text):
    """Return the space-joined lemmas of the non-stop-word tokens of *text* whose POS is in ``_KEPT_POS``."""
    lemmas = " ".join(token.lemma_ for token in nlp(text)
                      if not token.is_stop and token.pos_ in _KEPT_POS)
    # Collapse runs of spaces. The former .replace('/ +/g', ' ') looked for
    # that literal text (a JavaScript-style regex) and never changed anything.
    return re.sub(r' +', ' ', lemmas)


# Convert each row into a spacy document and keep the lemma of every token that
# is not a stop word; finally join the lemmas into a single string.
od['design_chunks'] = od.overall_design.apply(_design_chunk)
# Positional access for the sample prints: label 6 / label 2 may have been
# removed by the dropna calls above.
print(od['design_chunks'].iloc[6])
od['design_groups'] = od.design_chunks.apply(process_words)

#od.to_csv('./grouped_GSE_lung.csv', sep=',')

df = pd.json_normalize(od['design_groups'])
print(df)
#print(od['design_groups'][2])
#print('Before: ', od['overall_design'][0], '\n', 'After: ',od['design_chunks'][0])



print('Before: ', od['design_chunks'].iloc[2], '\n', 'After: ',process_words(od['design_chunks'].iloc[2]))

#print(od['design_chunks'][0].split(keywords))
'''
size = len(test_list)
idx_list = [idx + 1 for idx, val in
            enumerate(test_list) if val in keywords]
  
res = [test_list[i: j] for i, j in
        zip([0] + idx_list, idx_list + 
        ([size] if idx_list[-1] != size else []))]
print(res)
'''

#Add a Matcher to design_groups to better filter but need a more precise model with lemma, pos
'''  
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

#terms = ['cell','tissue','disease','treatment','hour','protocol']
# Only run nlp.make_doc to speed things up
#patterns = [nlp.make_doc(text) for text in terms]

pat = [ [{"POS": "NUM", "OP": "?"},{"POS":  {"IN": ["NOUN", "PROPN"]}}],
         [{"POS": "ADV", "OP": "?"},{"POS":  {"IN": ["NOUN", "PROPN"]}},{"POS":  {"IN": ["NOUN", "PROPN"]}, "OP": "?"}],
          [{"POS":  {"IN": ["NOUN", "PROPN"]}},{"POS": "VERB", "OP": "+"}],
       [{"POS": "VERB"},{"POS":  {"IN": ["NOUN", "PROPN"]}}]
           
      ]
    
matcher.add("C",pat)
doc = nlp(od['design_chunks'][1])
matches = matcher(doc)
tmp = [(doc[start:end].text) for match_id, start, end in matches]
print(tmp)
'''    




In [None]:
import pandas as pd
hca = [
    {"url":"https://data.humancellatlas.org/explore/projects/10201832-7c73-4033-9b65-3ef13d81656a",
       "title": "The Tabula Sapiens: a single cell transcriptomic atlas of multiple organs from individual human donors",
       "summary": "In recent years there has been tremendous progress towards deep molecular characterization of cell types using single cell transcriptome sequencing. Here we report a single cell transcriptomic atlas comprising nearly 500,000 cells from 24 different human tissues and organs. In several instances multiple organs were analyzed from the same donor. Analyzing organs from the same individual controls for genetic background, age, environment, and epigenetic effects, and enables a detailed comparison of cell types that are shared between tissues. This resource provides a rich molecular characterization of more than 400 cell types, their distribution across tissues, and detailed information about tissue specific variation in gene expression. We have used the fact that multiple tissues came from the same donor to study the clonal distribution of T cells between tissues, to understand the tissue specific mutation rate in B cells, and to analyze the cell cycle state and proliferative potential of shared cell types across tissues. Finally, we have also used this data to characterize cell type specific RNA splicing and how such splicing varies across tissues within an individual.", 
       "contributors": "Stephen R Quake",
       "fastq": "TabulaSapiens.h5ad.zip", "if_fastq": "N", "accession": ''},


{"url": "https://data.humancellatlas.org/explore/projects/0792db34-8047-4e62-802c-9177c9cd8e28",
       "title":"Joint profiling of chromatin accessibility and gene expression in thousands of single cells",
       "summary":"Here we describe sci-CAR, a combinatorial indexing strategy to jointly profile chromatin accessibility and mRNA in each of thousands of single cells. As a proof-of-concept, we apply sci-CAR to 4,825 cells comprising a time-series of dexamethasone treatment, as well as to 11,233 cells from the mouse kidney.",
       "overall_design":"Single cell RNA-seq and ATAC-seq co-profiling for HEK293T cells, NIH/3T3 cells, A549 cells across three treatment conditions (DEX 0 hour, 1 hour and 3 hour treatment), and wild type mouse kidney.",
       "contributors":"Junyue Cao", 
       "fastq":"GSE117089_RAW.tar", "if_fastq":"Y", "accession":"GSE117089"},


{"url":"https://data.humancellatlas.org/explore/projects/2a64db43-1b55-4639-aabb-8dba0145689d",
       "title":"Direct exposure to SARS-CoV-2 and cigarette smoke increases infection severity and alters the stem cell-derived airway repair response",
       "summary":"To study the effect of cigarette smoke exposure on Sars-Cov2 infection, we directly exposed mucociliary air-liquid interface (ALI) cultures derived from primary human nonsmoker airway basal stem cells (ABSCs) to short term cigarette smoke and infected them with live SARS-CoV-2. We set out to examine the underlying mechanisms governing the increased susceptibility of cigarette smoke exposed ALI cultures to SARS-CoV-2 infection by usingle cell profiling of the cultures, which showed that interferon response genes were induced in SARS-CoV-2 infected airway epithelial cells in ALI cultures but smoking exposure together with SARS-CoV-2 infection reduced the interferon response.",
       "overall_design":"Human bronchial airway cells were cultured in air liquid interface for approximately one week. Cells were exposed or not to cigarette smoke for 4 days, then exposed or not to SARS-Covid2 virus for 72 hour. Cultures were then harvested for single cell RNA sequencing.",
       "contributors":"Kathrin Plath",
       "fastq":"SARSCov2-human-lung-10XV3.loom,GSE161089_RAW (1).tar",
       "if_fastq":"Y", "accession":"GSE161089"},


{"url":"https://data.humancellatlas.org/explore/projects/31887183-a72c-4308-9eac-c6140313f39c", 
       "title":"Single-nucleus cross-tissue molecular reference maps to decipher disease gene function.",
       "summary":"A single-nucleus cross-tissue molecular reference map was generated from frozen samples deriving from multiple organ types obtained from the GTEx Project.",
       "contributors":"Orit Rozenblatt-Rosen1 Philip A Branton 2 Gokcen Eraslan1 Julia Waldman1 Shankara Anand1 Ayshwarya Subramanian1 Aviv Regev1 Evgenij Fiskin1 Michael S Cuoco1 Thet Su Win3 Ayellet V Segrè4 Michal Slyper1 Orr Ashenberg1 Jamie L Marshall1 François Aguet1 Olena Kuksenko1 Jiali Wang4 Gad Getz1 Anna Greka1 Kristin G Ardlie1 Danielle Dionne1 Eugene Drokhlyansky1 Nicholas Van Wittenberghe1 John M Rouhana4",
       "fastq":"GTEx_8_tissues_snRNAseq_immune_atlas_071421.public_obs.h5ad, GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad",
       "if_fastq":"N", "accession":""},


{"url": "https://data.humancellatlas.org/explore/projects/c4077b3c-5c98-4d26-a614-246d12c2e5d7",
       "title":"Ischaemic sensitivity of human tissue by single cell RNA seq",
       "summary":"Assessment of the effect of cold ischaemic time on single cell RNAseq data from human tissues using 10x Genomics 3' single cell RNA sequencing. This project contains data for spleen, oesophagus epithelium and lung parenchyma, three tissues that had previously been reported to have differential sensitivity to ischaemia. Samples were collected into Hypothermasol FRS hypothermic preservation media and dissociated fresh (as soon as possible) or at 12h, 24h, 72h post onset of cold ischaemia in the donor. Single cell and bulk RNA sequencing data was generated at each time point and whole genome sequencing was carried out for each donor." ,
       "contributors":"Krzysztof Polanski (Computational Scientist)1 Ricardo J Miragaia (Experimental Scientist)1 Tracey Andrew (Administrator)1 Liam Bolt (Experimental Scientist)1 John R Ferdinand (Experimental Scientist)2 Anna Wilbrey-Clark (Experimental Scientist)1 Emily Relton (Experimental Scientist)1 Karol Nowicki-Osuch (Experimental Scientist)3 Nikitas Georgakopoulos (Clinician)2 Oliver Stegle (Principal Investigator)4 Anthi Tsingene (Experimental Scientist)1 Sarah A Teichmann (Co-investigator)1 Rebecca Fitzgerald (Co-investigator)3 Krishnaa Mahbubani (Clinician)2 Kevin Loudon (Experimental Scientist)2 Phillipa Harding (Experimental Scientist)1 Ni Huang (Computational Scientist)1 Elo Madissoon (Computational Scientist)4 Michael JT Stubbington (Co-investigator)1 Kourosh Saeb-Parsy (Clinician)2 Kerstin B Meyer (Principal Investigator)1",
       "fastq":"lung.cellxgene.h5ad, tissue-stability-human-lung-10XV2.loom",
       "if_fastq":"N", "accession":"ERP114453,PRJEB31843"},


{"url":"https://data.humancellatlas.org/explore/projects/2f676143-80c2-4bc6-b7b4-2613fe0fadf0",
       "title":"Integrative analysis of cell state changes in lung fibrosis with peripheral protein biomarkers",
       "summary":"The correspondence of cell state changes in diseased organs to peripheral protein signatures is currently unknown. Here, we generated and integrated single‐cell transcriptomic and proteomic data from multiple large pulmonary fibrosis patient cohorts. Integration of 233,638 single‐cell transcriptomes (n = 61) across three independent cohorts enabled us to derive shifts in cell type proportions and a robust core set of genes altered in lung fibrosis for 45 cell types. Mass spectrometry analysis of lung lavage fluid (n = 124) and plasma (n = 141) proteomes identified distinct protein signatures correlated with diagnosis, lung function, and injury status. A novel SSTR2+ pericyte state correlated with disease severity and was reflected in lavage fluid by increased levels of the complement regulatory factor CFHR1. We further discovered CRTAC1 as a biomarker of alveolar type‐2 epithelial cell health status in lavage fluid and plasma. Using cross‐modal analysis and machine learning, we identified the cellular source of biomarkers and demonstrated that information transfer between modalities correctly predicts disease status, suggesting feasibility of clinical cell state monitoring through longitudinal sampling of body fluid proteomes.",
       "contributors":"Ilias Angelidis1 Maximilian Strunz1 Matthias Mann2 Fabian Theis3 Heiko Adler4 Meshal Ansari1 Lukas Simon3 Philipp Geyer2 Jürgen Behr5 Herbert Schiller1 Christoph Mayr1 Stephan Böhm6 Nikolaus Kneidinger5 Anne Hilgendorff7 Michael Lindner8 Antje Prasse9 Frank Reichenberger8 Pawandeep Singh1 Edith Silbernagel8 Oliver Eickelberg9 Gabriela Leuschner1",
       "fastq":"munich_cohort_human_dataset.h5ad", "if_fastq":"N", "accession":""},


{"url":"https://data.humancellatlas.org/explore/projects/65858543-530d-48a6-a670-f972b34dfe10",
       "title":"Single cell RNA-sequencing on healthy and IPF lung mesenchymal cells.",
       "summary":"Single cell lung suspensions of explanted healthy and IPF donor lung tissue were generated. scRNA-seq was performed on EPCAM negative live cells sorted by FACS. Puried mesenchymal cells were clustered to evaluate the mesenchymal cell sub-clusters.",
       "overall_design": "To investigate pulmonary fibroblast lineage and the subtypes in human healthy and IPF lungs, mesenchymal cells (EPCAM/CD31/CD45-) were collected from five lungs from healthy donors and five lungs from IPF patients, single cell RNA-sequencing was performed on the purified mesenchymal cells.",
       "contributors": "Xue Liu (Experimental Scientist)1 Simon C Rowan (Experimental Scientist)1 Paul W Noble (Principal Investigator)1 Dianhua Jiang (Principal Investigator)1",
       "fastq":" GSM4763855_IPF2_Expr_norm.csv.gz,GSM4763857_IPF4_Expr_norm.csv.gz,GSM4763860_Normal2_Expr_norm.csv.gz,GSM4763861_Normal4_Expr_norm.csv.gz,GSM4763863_Normal6_Expr_norm.csv.gz,GSM4763856_IPF3_Expr_raw.csv.gz,GSM4763854_IPF1_Expr_norm.csv.gz,GSM4763862_Normal5_Expr_raw.csv.gz,GSM4763858_IPF5_Expr_norm.csv.gz,GSM4763859_Normal1_Expr_norm.csv.gz",
       "if_fastq":"Y", "accession":"GSE157376"},


{"url":"https://data.humancellatlas.org/explore/projects/58028aa8-0ed2-49ca-b60f-15e2ed5989d5",
       "title":"SARS‐CoV‐2 receptor ACE2 and TMPRSS2 are primarily expressed in bronchial transient secretory cells",
       "summary":"The SARS-CoV-2 pandemic affecting the human respiratory system severely challenges public health and urgently demands for increasing our understanding of COVID-19 pathogenesis, especially host factors facilitating virus infection and replication. SARS-CoV-2 was reported to enter cells via binding to ACE2, followed by its priming by TMPRSS2. Here, we investigate ACE2 and TMPRSS2 expression levels and their distribution across cell types in lung tissue (twelve donors, 39,778 cells) and in cells derived from subsegmental bronchial branches (four donors, 17,521 cells) by single nuclei and single cell RNA sequencing, respectively. While TMPRSS2 is expressed in both tissues, in the subsegmental bronchial branches ACE2 is predominantly expressed in a transient secretory cell type. Interestingly, these transiently differentiating cells show an enrichment for pathways related to RHO GTPase function and viral processes suggesting increased vulnerability for SARS-CoV-2 infection. Our data provide a rich resource for future investigations of COVID-19 infection and pathogenesis.",
       "contributors":"Marc Schneider1 Nicolas Kahn1 Timo Trefzer2 Michael Meister1 Robert Lorenz Chua2 Roland Eils (Principal Investigator)2 Hauke Winter1 Agnes W Boots3 Michael Kreuter (Principal Investigator)1 Carmen Veith4 Soeren Lukassen2 Christian Conrad (Principal Investigator)2 Thomas Muley1 Bianca P Hennig2",
       "fastq":"Counts_lung_cells.csv,Metadata_lung_cells.csv",
       "if_fastq":"", "accession":"EGAS00001004419"},


{"url":"https://data.humancellatlas.org/explore/projects/5eafb94b-02d8-423e-81b8-3673da319ca0",
       "title":"Differentiation of Human Intestinal Organoids with Endogenous Vascular Endothelial Cells",
       "summary":"Human pluripotent stem cell (hPSC)-derived intestinal organoids (HIOs) lack some cellular populations found in the native organ, including vasculature. Using single-cell RNA sequencing (scRNA-seq), we have identified a population of endothelial cells (ECs) present early in HIO differentiation that declines over time in culture. Here, we developed a method to expand and maintain this endogenous population of ECs within HIOs (vHIOs). Given that ECs possess organ-specific gene expression, morphology, and function, we used bulk RNA-seq and scRNA-seq to interrogate the developing human intestine, lung, and kidney in order to identify organ-enriched EC gene signatures. By comparing these gene signatures and validated markers to HIO ECs, we find that HIO ECs grown in vitro share the highest similarity with native intestinal ECs relative to kidney and lung. Together, these data demonstrate that HIOs can co-differentiate a native EC population that is properly patterned with an intestine-specific EC transcriptional signature in vitro.",
       "contributors":"Ian A. Glass1 Yu-Hwai Tsai2 Emily M. Holloway2 Sha Huang2 Jason Spence2 Michael Czerwinski2 Meghan M. Capeling2 Angeline Wu2 Caden W. Sweet2 Joshua H. Wu2 Amy E. Stoddard2",
       "fastq":"curl --location --fail 'https://service.azul.data.humancellatlas.org/manifest/files?catalog=dcp13&format=curl&filters=%7B%22fileFormat%22%3A+%7B%22is%22%3A+%5B%22fastq%22%5D%7D%2C+%22projectId%22%3A+%7B%22is%22%3A+%5B%225eafb94b-02d8-423e-81b8-3673da319ca0%22%5D%7D%2C+%22genusSpecies%22%3A+%7B%22is%22%3A+%5B%22Homo+sapiens%22%5D%7D%7D&objectKey=manifests%2F05693854-4e06-5b62-8c8c-bf4704a5960c.a4fcc60d-77b6-5c89-9ae6-284e84fec9e4.curlrc' | curl --config -",
       "if_fastq":"Y", "accession":""},


{"url":"https://data.humancellatlas.org/explore/projects/a9301beb-e9fa-42fe-b75c-84e8a460c733",
       "title":"A human cell atlas of fetal gene expression.",
       "summary":"The gene expression program underlying the specification of human cell types is of fundamental interest. We generated human cell atlases of gene expression and chromatin accessibility in fetal tissues. For gene expression, we applied three-level combinatorial indexing to >110 samples representing 15 organs, ultimately profiling ~4 million single cells. We leveraged the literature and other atlases to identify and annotate hundreds of cell types and subtypes, both within and across tissues. Our analyses focused on organ-specific specializations of broadly distributed cell types (such as blood, endothelial, and epithelial), sites of fetal erythropoiesis (which notably included the adrenal gland), and integration with mouse developmental atlases (such as conserved specification of blood cells). These data represent a rich resource for the exploration of in vivo human gene expression in diverse tissues and cell types." ,
       "overall_design": "single cell RNA-seq profiling of human fetal tissues",
       "contributors":"Fan Zhang1 Malte Spielmann2 James Palis3 Kimberly A Aldinger4 Paul D Kingsley3 Dan Doherty4 Ian A Glass4 Cole Trapnell (Principal Investigator)4 Jay Shendure (Principal Investigator)4 Diana R O'Day4 Junyue Cao4 Mei Deng4 Michael A Zager5 Ronnie Blecher-Gonen4 Frank J Steemers1 Riza M Daza4 Hannah A Pliner5",
       "fastq":"Lung_gene_count.RDS,df_cell.RDS,df_gene.RDS",
       "if_fastq":"N", "accession":" GSE156793"},

{"url":"https://data.humancellatlas.org/explore/projects/ad04c8e7-9b7d-4cce-b8e9-01e31da10b94",
       "title":"Single-cell Transcriptome Analysis Reveals an Anomalous Epithelial Variation and Ectopic Inflammatory Response in Chronic Obstructive Pulmonary Disease",
       "summary":"Chronic obstructive pulmonary disease (COPD) is a progressive disease that obstructs the airflow from the lungs, and tobacco smoking is the major cause of COPD. Here, we applied single-cell RNA sequencing to analyze COPD pathogenesis in COPD patients, non-COPD smokers and never-smokers and investigated the disease progression at single-cell resolution. By single-cell transcriptome analysis, COPD was characterized by shifts in the stromal, immune system and epithelial cell compositions. While epithelial components in never-smokers were relatively uniform, the smoker groups presented with extensive heterogeneity in epithelial cells, particularly in the alveolar type II (AT2) lineages. We identified a subpopulation of AT2 epithelial cells that emerged in smokers, such as COPD patients, and specifically expressed a series of chemokines and PD-L1. A trajectory analysis revealed that the inflammatory AT2 cell subpopulation followed a unique differentiation path, and a prediction model of cell-to-cell interactions inferred increased intercellular networks of inflammatory AT2 cells with immune and stromal cell populations. Thus, our analysis reveals a unique cellular differentiation pathway and function underlying the biological and clinical characteristics of COPD pathogenesis.",
       "overall_design": "single-cell transcriptome profiling from human lung tissues of 6 COPD, 3 non-COPD smoker, and 3 never-smoker samples.",
       "contributors":"Yusuke Yamamoto (Experimental Scientist)1",
       "fastq":"GSE173896_COPD.rds.gz", "if_fastq":"Y", "accession":" GSE173896"},


{"url":"https://data.humancellatlas.org/explore/projects/b32a9915-c81b-4cbc-af53-3a66b5da3c9a",
       "title":"In Vitro and In Vivo Development of the Human Airway at Single-Cell Resolution",
       "summary":"Bud tip progenitor cells give rise to all murine lung epithelial lineages and have been described in the developing human lung; however, the mechanisms controlling human bud tip differentiation into specific lineages are unclear. Here, we used homogeneous human bud tip organoid cultures and identified SMAD signaling as a key regulator of the bud tip-to-airway transition. SMAD induction led to the differentiation of airway-like organoids possessing functional basal cells capable of clonal expansion and multilineage differentiation. To benchmark in vitro-derived organoids, we developed a single-cell mRNA sequencing atlas of the human lung from 11.5 to 21 weeks of development, which revealed high degrees of similarity between the in vitro-derived and in vivo airway. Together, this work sheds light on human airway differentiation in vitro and provides a single-cell atlas of the developing human lung.",
       "contributors":"Alyssa J. Miller1 Barbara Treutlein (Principal Investigator)2 Gray Camp3 Renee F. Conway1 Jason Spence1 Qianhui Yu3 Emily M. Holloway1 Ian A. Glass4 Angeline Wu1 Michael Czerwinski1 Taylor Walker1 Yu-Hwai Tsai1",
       "fastq":"curl --location --fail 'https://service.azul.data.humancellatlas.org/manifest/files?catalog=dcp13&format=curl&filters=%7B%22fileFormat%22%3A+%7B%22is%22%3A+%5B%22fastq%22%5D%7D%2C+%22projectId%22%3A+%7B%22is%22%3A+%5B%22b32a9915-c81b-4cbc-af53-3a66b5da3c9a%22%5D%7D%2C+%22genusSpecies%22%3A+%7B%22is%22%3A+%5B%22Homo+sapiens%22%5D%7D%7D&objectKey=manifests%2Fd9d13d2e-ed6e-5366-b56f-6677f8f99512.a4fcc60d-77b6-5c89-9ae6-284e84fec9e4.curlrc' | curl --config -",
       "if_fastq":"Y", "accession":"635.35 GB File Size 186 Files" },


{"url":"https://data.humancellatlas.org/explore/projects/c1a9a93d-d9de-4e65-9619-a9cec1052eaa",
       "title":"Single-cell RNA-sequencing reveals profibrotic roles of distinct epithelial and mesenchymal lineages in pulmonary fibrosis",
       "summary":"Pulmonary fibrosis (PF) is a form of chronic lung disease characterized by progressive destruction of normal alveolar gas-exchange surfaces and accumulation of extracellular matrix (ECM). In order to comprehensively define the cell types, mechanisms and mediators driving ECM deposition and fibrotic remodeling in lungs with pulmonary fibrosis, we performed single-cell RNA-sequencing (scRNA-seq) of single-cell suspensions generated from non-fibrotic control and PF lungs. Analysis of over 114,000 cells from 20 PF and 10 control lungs identified 31 distinct cell types. We identified multiple distinct lineages directly contribute to ECM expansion, including a novel HAS1hi fibroblast subtype and a previously undescribed KRT5-/KRT17+, collagen and ECM-producing epithelial cell population that was highly enriched in PF lungs. Together these data provide high-resolution insights into the basic mechanisms of pulmonary fibrosis, and indicate a direct profibrotic role of the lung epithelium in PF pathogenesis.",
       "overall_design": "We performed single-cell RNA-sequencing (scRNA-seq) of single-cell suspensions generated from non-fibrotic control and pulmonary fibrosis (PF) lungs",
       "contributors":"Jennifer MS Sucre1 Christopher Jetter1 James E Loyd1 Lori Wood2 Wyatt J McDonnell3 Linh T Bui4 Stephanie L Yahn4 Lorraine B Ware1 Matthew J Bacchetta1 Ross Bremner2 Rajat Walia2 Carla L Calvi1 Lance Peter4 Bradley W Richmond1 Chase J Taylor1 Simon B Mallal3 Ana P Serezani1 Mei-I Chung4 Nicholas E Banovich (Principal Investigator)4 Austin J Gutierrez4 Arun C Habermann1 Ciara M Shaver1 Jamie Roberson1 Guixiao Ding1 Jonathan A Kropski (Principal Investigator)1 Timothy S Blackwell1 Nichelle I Winters1 Latha Raju1",
       "fastq":" pulmonary-fibrosis-human-lung-10XV2.loom,GSE135893_barcodes.tsv.gz,GSE135893_IPF_metadata.csv.gz,GSE135893_matrix.mtx.gz,GSE135893_genes.tsv.gz",
       "if_fastq":"Y", "accession":"GSE135893"},


{"url":"https://data.humancellatlas.org/explore/projects/e526d91d-cf3a-44cb-80c5-fd7676b55a1d",
       "title":"Transcriptional analysis of cystic fibrosis airways at single-cell resolution reveals altered epithelial cell states and composition",
       "summary":"Single cell sequencing of human airway epithelium from normal and cystic fibrosis lungs.",
       "overall_design": "Single cell RNA-seq using dropSeq (UCLA) or 10x chromium system (CFF, CSMC), on freshly isolated (UCLA, CFF), FACS isolated (DAPI-,CD45-,CD31-,CD326+) (CSMC), or freshly isolated, expanded, and cultured on ALI for 28 days (CFF) epithelial cells from control and CF lung explant tissue. The barcoded sequencing libraries were quantified by qPCR using the KAPA Library Quantification Kit (KAPA Biosystems). Sequencing libraries were loaded on a NovaSeq 6000 (Illumina) (CSMC), NextSeq 500 (Illumina) (CFF) or HiSeq4000 (Illumina) (UCLA).",
       "contributors":"Langerman Justin (Experimental Scientist)1 Gomperts N Brigitte (Principal Investigator)1",
       "fastq":"GSE150674_Seurat_Object.rds.gz,GSE150674_Seurat_Object_ALI.rds.gz",
       "download_url":"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE150674&format=file&file=GSE150674%5FSeurat%5FObject%2Erds%2Egz, https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE150674&format=file&file=GSE150674%5FSeurat%5FObject%5FALI%2Erds%2Egz",
       "if_fastq":"Y", "accession":" GSE150674 587.21 GB File Size 606"},


{"url":"https://data.humancellatlas.org/explore/projects/daf9d982-7ce6-43f6-ab51-272577290606",
       "title":"Single-Cell Transcriptomic Analysis of Human Lung Provides Insights into the Pathobiology of Pulmonary Fibrosis.",
       "summary":"Rationale: The contributions of diverse cell populations in the human lung to pulmonary fibrosis pathogenesis are poorly understood. Single-cell RNA sequencing can reveal changes within individual cell populations during pulmonary fibrosis that are important for disease pathogenesis. Objectives: To determine whether single-cell RNA sequencing can reveal disease-related heterogeneity within alveolar macrophages, epithelial cells, or other cell types in lung tissue from subjects with pulmonary fibrosis compared with control subjects. Methods: We performed single-cell RNA sequencing on lung tissue obtained from eight transplant donors and eight recipients with pulmonary fibrosis and on one bronchoscopic cryobiospy sample from a patient with idiopathic pulmonary fibrosis. We validated these data using in situ RNA hybridization, immunohistochemistry, and bulk RNA-sequencing on flow-sorted cells from 22 additional subjects. Measurements and Main Results: We identified a distinct, novel population of profibrotic alveolar macrophages exclusively in patients with fibrosis. Within epithelial cells, the expression of genes involved in Wnt secretion and response was restricted to nonoverlapping cells. We identified rare cell populations including airway stem cells and senescent cells emerging during pulmonary fibrosis. We developed a web-based tool to explore these data. Conclusions: We generated a single-cell atlas of pulmonary fibrosis. Using this atlas, we demonstrated heterogeneity within alveolar macrophages and epithelial cells from subjects with pulmonary fibrosis. These results support the feasibility of discovery-based approaches using next-generation sequencing technologies to identify signaling pathways for targeting in the development of personalized therapies for patients with pulmonary fibrosis.",
       "overall_design":"1. Single-cell suspension from two 4 mo old C57Bl/6 male mice was prepared and single cell RNA-seq libraries were generated using 3' V2 chemistry kit on Chromium Single cell controller (10x Genomics). 2.Single-cell suspensions from eight donor lungs and nine lungs from patients with various forms of pulmonary fibrosis were prepared and single cell RNA-seq libraries were generated using 3' V2 chemistry kit on Chromium Single cell controller (10x Genomics). This GEO submission contains only processed data (raw counts tables in HDF5 format), raw data (FASTQ files) available from dbGaP/SRA (phs001750.v1.p1).",
       "contributors":"James M Walter1 SeungHye Han1 Robert B Hamanaka2 Benjamin D Singer1 Harris Perlman1 Francisco J Gonzalez-Gonzalez1 Karen M Ridge1 Catherine A Bonham2 Cara J Gottardi1 Alexander V Misharin1 Monica Chi1 Jane Dematte1 Kiwon Nam1 Jacob I Sznajder1 Satoshi Watanabe1 Paul A Reyfman1 Saul Soberanes1 Annette S Flozak1 Anjana V Yeldandi1 Nikita Joshi1 Mahzad Akbarpour1 Vince K Morgan1 Deborah R Winter1 Cara L Hrusch2 Trevor T Nicholson1 Colin T Gillespie1 Robert D Guzy2 Rohan Verma1 Stephen Chiu1 Stacy A Marshall1 Remzi Bag2 Alexandra C McQuattie-Pimentel1 Manu Jain1 Ching-I Chen1 Anne I Sperling2 Ankit Bharat1 Gökhan M Mutlu2 Sangeeta M Bhorade1 Ramiro Fernandez1 A Christine Argento1 Ziyou Ren1 Monique Hinchcliff1 Ali Shilatifard1 Kishore R Anekalla1 Anna P Lam1 G R Scott Budinger1 Hiam Abdala-Valencia1 Luis A N Amaral1 Kinola J N Williams1",
       "fastq":"GSM3489185_Donor_02_filtered_gene_bc_matrices_h5.h5,GSM3489184_IPF_02_filtered_gene_bc_matrices_h5.h5,GSM3489194_SSc-ILD_01_filtered_gene_bc_matrices_h5.h5,GSM3489192_HP_01_filtered_gene_bc_matrices_h5.h5,GSM3489193_Donor_06_filtered_gene_bc_matrices_h5.h5, GSM3489190_IPF_04_filtered_gene_bc_matrices_h5.h5,GSM3489186_Cryobiopsy_01_filtered_gene_bc_matrices_h5.h5,GSM3489182_Donor_01_filtered_gene_bc_matrices_h5.h5,GSM3489188_IPF_03_filtered_gene_bc_matrices_h5.h5,GSM3489198_SSc-ILD_02_filtered_gene_bc_matrices_h5.h5,GSM3489197_Donor_08_filtered_gene_bc_matrices_h5.h5,GSM3489187_Donor_03_filtered_gene_bc_matrices_h5.h5,GSM3489195_Donor_07_filtered_gene_bc_matrices_h5.h5,GSM3489196_Myositis-ILD_01_filtered_gene_bc_matrices_h5.h5,GSM3489191_Donor_05_filtered_gene_bc_matrices_h5.h5,GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5,GSM3489189_Donor_04_filtered_gene_bc_matrices_h5.h5",
       "if_fastq":"Y", "accession":" GSE121611,GSE122960"},


{"url":"https://data.humancellatlas.org/explore/projects/d7b7beae-652b-4fc0-9bf2-bcda7c7115af",
       "title":"Single Cell RNAseq of primary pulmonary endothelial cells.",
       "summary":"Single Cell RNAseq of primary pulmonary endothelial cells.",
       "overall_design":"Exploratory analysis of healthy lung cell populations and of primary pulmonary endothelial cells",
       "contributors":"Jonathan A Kropski1 Kerstin B Meyer2 Martijn C Nawijn3 Micha Sam Brickman Raredon4 Edward P Manning5 Nir Neumark5 Giuseppe DeIuliis5 Sergio Poli6 Taylor S Adams5 Carlos Cosme5 Farida Ahangari5 Yifan Yuan4 Maurizio Chioccioli5 Richard W Pierce5 Linh T Bui7 Xiting Yan5 Nicholas E Banovich7 Norihito Omote5 Kadi-Ann Rose5 Maor Sauler5 Robert J Homer5 Laura E Niklason4 Ivan O Rosas6 Jonas C Schupp5 Robert Lafyatis8 Naftali Kaminski5 Sarah A Teichmann2 Dana Pe'er9 Austin J Gutierrez7 Arun C Habermann1",
       "fastq":"GSE164829_WholeLungeDissociates.raw.counts.processed.mtx.gz,GSE164829_WholeLungeDissociates.processed.genes.tsv.gz,GSE164829_WholeLungeDissociates.processed.barcodes.tsv.gz,GSE164829_PrimaryPulmonaryEndothelialCells.raw.counts.processed.mtx.gz,GSE164829_PrimaryPulmonaryEndothelialCells.processed.genes.tsv.gz,GSE164829_PrimaryPulmonaryEndothelialCells.processed.barcodes.tsv.gz",
       "if_fastq":"Y", "accession":"GSE164829"},


{"url":"https://data.humancellatlas.org/explore/projects/c31fa434-c9ed-4263-a9b6-d9ffb9d44005",
     "title":"A single-cell atlas of chromatin accessibility in the human genome",
     "summary":"Current catalogs of regulatory sequences in the human genome are still incomplete and lack cell type resolution. To profile the activity of gene regulatory elements in diverse cell types and tissues in the human body, we applied single-cell chromatin accessibility assays to 30 adult human tissue types from multiple donors. We integrated these datasets with single-cell chromatin accessibility data from 15 fetal tissue types to reveal the status of open chromatin for approximately 1.2 million candidate cis-regulatory elements (cCREs) in 222 distinct cell types comprised of >1.3 million nuclei. We used these chromatin accessibility maps to delineate cell type-specificity of fetal and adult human cCREs and to systematically interpret the noncoding variants associated with complex human traits and diseases. This rich resource provides a foundation for the analysis of gene regulatory programs in human cell types across tissues, life stages, and organ systems.",
     "overall_design": "Applying single cell chromatin accessibility assays to diverse human tissue types from four donors.",
     "contributors":"Bing Ren (Experimental Scientist)1",
     "fastq":"GSM5589376_lung_SM-A62E9_rep1_fragments.bed.gz,GSM5589377_lung_SM-A8WNH_rep1_fragments.bed.gz,GSM5589378_lung_SM-ACCPU_rep1_fragments.bed.gz,GSM5589379_lung_SM-JF1NZ_rep1_fragments.bed.gz",
     "if_fastq":"Y", "accession":"GSE184462"},
]

df2 = pd.DataFrame(hca)
df2.rename(columns = {'fastq':'download'}, inplace = True)

import pandas as pd
import re


def process_words(test, patterns=None):
    """Split a whitespace-separated string into chunks that end on a keyword.

    The text is scanned token by token. Whenever a token (other than the very
    first one) matches a keyword pattern, the tokens since the previous cut,
    up to and including the matching token, are joined into one chunk and
    filed under that token.

    Args:
        test: The text to split, e.g. a string of lemmas.
        patterns: Regular expressions a whole token must match to count as a
            keyword. Defaults to the module-level ``keywords`` list. Plain
            words behave exactly like a literal comparison.

    Returns:
        A dict mapping each matching token to the list of chunks that ended
        on it, in order of appearance. Empty when nothing matched.
    """
    if patterns is None:
        patterns = keywords
    # The keyword list holds regex strings such as '.*cell'; a literal
    # `token in keywords` test never matched those, so every call returned {}.
    matchers = [re.compile(pattern) for pattern in patterns]

    tokens = test.split()
    groups = {}
    chunk_start = 0
    for position, token in enumerate(tokens):
        # A keyword in first position has nothing before it to close off.
        if position == 0:
            continue
        if any(matcher.fullmatch(token) for matcher in matchers):
            chunk = " ".join(tokens[chunk_start:position + 1])
            groups.setdefault(token, []).append(chunk)
            chunk_start = position + 1

    return groups

# Load the two GEO search exports that carry an overall_design column.
df = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design.csv', sep=',')
df1 = pd.read_csv('../input/GSE-with-design/scRNA-gds-lung_design_1.csv', sep=',')

# Whole non-filtered table: both exports plus the hand-entered df2 records.
f_df = pd.concat([df, df1, df2])
f_df.to_csv('./organized_GSE_lung.csv', sep=',')

# Extract the download file type and group rows by the extensions.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
# The pattern only accepts a download string that ENDS in one of these
# extensions.
pat = r'^.*\.(xlsx|h5|txt|loom|rds|mtx|bed|tsv|h5ad|csv|RDS|tar)$'
gp = ['gz', 'tar', 'xlsx', 'h5', 'txt', 'loom', 'rds']
od = f_df.dropna(subset=['download'])

#remove last element
#od = od.iloc[:-1]
# p_df contains all the rows with a recognised downloadable file that also
# have a design text. The explicit copy makes p_df independent of od, so the
# column assignments below do not trigger SettingWithCopyWarning.
p_df = od[od['download'].str.contains(pat)]
p_df = p_df.dropna(subset=['download', 'overall_design']).copy()

# For each row, the entries of gp that occur anywhere in its download string
# (substring test, so 'h5' also hits 'h5ad').
search = [[ext for ext in gp if ext in values] for values in p_df['download']]

# Filter with proper group.
p_df['download_group'] = search
#print(p_df)
import spacy

# Load spacy model
nlp = spacy.load('en_core_web_sm')

# Regex expressions used to find keywords in design_chunks.
keywords = [
    '.*cell','.*tissue','.*disease',
    '.*treatment','.*protocol','.*patient',
]

# Convert each row into a spaCy document and keep the lemma of every token
# that is not a stop word and is tagged NOUN, PROPN, NUM or ADV. Finally join
# the lemmas into one string.
p_df['design_chunks'] = p_df.overall_design.apply(
    lambda text: " ".join(
        token.lemma_
        for token in nlp(text)
        if not token.is_stop
        and token.pos_ in ["NOUN", "PROPN", "NUM", "ADV"]
    )
)

p_df['design_groups'] = p_df.design_chunks.apply(process_words)
#print(p_df['design_groups'])
#p_df.to_csv('./grouped_GSE_lung.csv', sep=',')

# Flatten the per-row dicts into a table: one column per dict key.
df = pd.json_normalize(p_df['design_groups'])
# NOTE(review): this raises KeyError unless process_words filed at least one
# chunk under the key 'cell' -- confirm it matches the regex-style keywords.
print(df['cell'])




