In [1]:
import pandas as pd
import os
import sys
import re
import numpy as np

In [2]:
# GEO sample accession: the literal "GSM" followed by exactly seven digits.
sample_regex = re.compile(r"GSM[0-9]{7}")

def get_sample_id(filename):
    """Extract the GSM sample accession from a file name or path.

    Parameters
    ----------
    filename : str
        File name or path that contains an accession such as ``GSM3478792``.

    Returns
    -------
    str
        The first substring of ``filename`` that matches ``sample_regex``.

    Raises
    ------
    ValueError
        If ``filename`` contains no GSM accession. Without this check the
        failure would surface as an AttributeError on ``None.group()``.
    """
    match = sample_regex.search(filename)
    if match is None:
        raise ValueError(f"No GSM sample id found in {filename!r}")
    return match.group()

In [19]:
def get_genome(sample_id, metadata):
    """Return the genome recorded for a sample in the metadata table.

    Parameters
    ----------
    sample_id : str
        Sample accession to look up in the ``'SAMPLE'`` column.
    metadata : pandas.DataFrame
        Table with at least the columns ``'SAMPLE'`` and ``'GENOME'``.

    Returns
    -------
    The ``'GENOME'`` value of the first row whose ``'SAMPLE'`` equals
    ``sample_id``.

    Raises
    ------
    IndexError
        If no row matches ``sample_id``. The exception type is the same one the
        bare ``.values[0]`` lookup produces, but the message names the sample.
    """
    # Single .loc call with row mask and column label avoids chained indexing.
    genomes = metadata.loc[metadata['SAMPLE'] == sample_id, 'GENOME']
    if genomes.empty:
        raise IndexError(f"Sample {sample_id!r} not found in the metadata 'SAMPLE' column")
    return genomes.values[0]

# Izbaceni su prefiksi za odredjene kolone (npr. hg19_ za kolonu hg19) da bi se vrednosti lakse poredile sa fajlovima koji vec imaju imena gena umesto ENSG-ova

In [3]:
# Load the ENSG-to-gene-name lookup table (path is relative to the working directory).
common_human_list = pd.read_csv("no_prefix_human_common_list.csv")
common_human_list.head()

Unnamed: 0,ENSG_ID,hg19,hg37,hg38,Ensembl_GRCh38.p12_rel94,GSM3717979
0,ENSG00000181638,ZFP41,ZFP41,ZFP41,ZFP41,ZFP41
1,ENSG00000111875,ASF1A,ASF1A,ASF1A,ASF1A,ASF1A
2,ENSG00000176142,TMEM39A,TMEM39A,TMEM39A,TMEM39A,TMEM39A
3,ENSG00000177186,OR2M7,OR2M7,OR2M7,OR2M7,OR2M7
4,ENSG00000135624,CCT7,CCT7,CCT7,CCT7,CCT7


In [9]:
# One-off preparation step, kept for reference only (disabled): reduce the full
# metadata sheet to the GENOME and SAMPLE columns and save the result as metadata.csv.
# metadata = pd.read_csv("reference_data\\SCT-10x-Metadata_readylist_merged-PBMC-tasks-short-Bgd.csv")
# metadata = metadata[['GENOME', 'SAMPLE']]
# metadata.to_csv("metadata.csv", index=False)

# iz SCT-10x-Metadata_readylist_merged-PBMC-tasks-short-Bgd zadrzane samo kolone GENOME i SAMPLE; to je zapisano u metadata.csv

In [10]:
# Load the reduced sample metadata; the preview shows the GENOME and SAMPLE columns.
metadata = pd.read_csv("metadata.csv")
metadata.head()

Unnamed: 0,GENOME,SAMPLE
0,GRCh38,GSM3020393
1,GRCh38,GSM3020394
2,GRCh38,GSM3020395
3,GRCh38,GSM3020396
4,GRCh38,GSM3020397


# U nastavku je opisan postupak sredjivanja gena

In [12]:
# Load the matrix of one sample. In the preview the 'Index' column holds ENSG ids;
# the other columns look like cell barcodes (presumably one column per cell — confirm).
df = pd.read_csv("data/GSM3478792_P5_matrix.csv")
df.head()

Unnamed: 0,Index,AAACCTGAGCCACCTG-1,AAACCTGAGTTTCCTT-1,AAACCTGCAAACAACA-1,AAACCTGCAAGAGGCT-1,AAACCTGCAAGGTTTC-1,AAACCTGGTCGTGGCT-1,AAACCTGGTGCTTCTC-1,AAACCTGGTGTGACGA-1,AAACCTGTCCGTTGTC-1,...,TTTGGTTTCCTAAGTG-1,TTTGGTTTCTACCAGA-1,TTTGTCAAGAAACCAT-1,TTTGTCAAGAACTGTA-1,TTTGTCAAGGCTAGGT-1,TTTGTCAAGGCTCTTA-1,TTTGTCACAAGCCGTC-1,TTTGTCACAGCGTTCG-1,TTTGTCACAGGATCGA-1,TTTGTCATCTGCGACG-1
0,ENSG00000243485,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000237613,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ENSG00000186092,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000238009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000239945,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Pull the GSM accession out of the matrix file path.
sample_id = get_sample_id(filename="data/GSM3478792_P5_matrix.csv")
sample_id

'GSM3478792'

# Izvlacimo sample_id iz putanje fajla i trazimo njemu odgovarajuci genome

In [23]:
# Look up the genome that the metadata table records for this sample.
genome = get_genome(sample_id=sample_id, metadata=metadata)

In [21]:
def check_grch38(sample_genome, common_human_list_columns,
                 fallback_column='Ensembl_GRCh38.p12_rel94'):
    """Choose the gene-list column to use for a sample's genome.

    Parameters
    ----------
    sample_genome : str
        Genome label of the sample.
    common_human_list_columns : container of str
        Column names of the common gene list (anything supporting ``in``,
        e.g. a ``pandas.Index`` or a list).
    fallback_column : str, optional
        Column returned when ``sample_genome`` is not one of the columns.
        Defaults to ``'Ensembl_GRCh38.p12_rel94'``.

    Returns
    -------
    str
        ``sample_genome`` when it is itself a column name, otherwise
        ``fallback_column``. Every label that is not a column takes the
        fallback, not only GRCh38 spellings.
    """
    if sample_genome in common_human_list_columns:
        return sample_genome
    return fallback_column

# Imamo neke fajlove ciji je genome GRCh38, ali te kolone nema u common_human_list, pa za njih koristimo kolonu Ensembl_GRCh38.p12_rel94
# Takodje postoji jedan fajl ciji je genome oblika "GrCH38       60"; verovatno cu ignorisati taj sufiks, tj. pretpostaviti da je u pitanju samo GRCh38

In [25]:
# Decide which column of the gene list corresponds to this sample's genome.
lookup_column = check_grch38(
    sample_genome=genome,
    common_human_list_columns=common_human_list.columns,
)
lookup_column

'Ensembl_GRCh38.p12_rel94'

# Kada nadjemo genome, u common_human_list zadrzavamo samo njemu odgovarajucu kolonu (lookup_column) i ENSG_ID

In [30]:
# Keep only the ENSG ids and the gene-name column selected for this genome.
filtered_common_human_list = common_human_list.loc[:, ["ENSG_ID", lookup_column]]
filtered_common_human_list.head()

Unnamed: 0,ENSG_ID,Ensembl_GRCh38.p12_rel94
0,ENSG00000181638,ZFP41
1,ENSG00000111875,ASF1A
2,ENSG00000176142,TMEM39A
3,ENSG00000177186,OR2M7
4,ENSG00000135624,CCT7


In [31]:
# Size of the two-column lookup table.
filtered_common_human_list.shape

(30710, 2)

# ENSG-ovi su jedinstveni, ali ostale kolone nisu, pa izbacujem duplikate po koloni genoma (zadrzava se prvo pojavljivanje)

In [32]:
# Keep the first row for every value of the lookup column and discard later repeats.
filtered_unique_common_human_list = filtered_common_human_list[
    ~filtered_common_human_list.duplicated(subset=lookup_column, keep='first')
]

In [33]:
# Size of the de-duplicated lookup table.
filtered_unique_common_human_list.shape

(30694, 2)

# Ovde se razlikuju fajlovi koji imaju kolonu Index i oni koji imaju kolonu 'gene'
# One koji imaju Index spajamo sa filtriranim common_human_list po koloni ENSG_ID
# One koji imaju kolonu 'gene' (ENSG-ovi su vec zamenjeni imenima gena) spajamo po koloni genoma (lookup_column)

In [34]:
# The processing branches into two cases here:
# files that already have a 'gene' column are joined on gene names,
# files that have an 'Index' column are joined on ENSG_ID.
# This cell performs the 'Index' case: an inner join of the lookup table's
# ENSG_ID column against the matrix indexed by 'Index'.
joined_data = filtered_unique_common_human_list.join(other=df.set_index('Index'),
                                                    on='ENSG_ID',
                                                    how='inner')
joined_data.head()

Unnamed: 0,ENSG_ID,Ensembl_GRCh38.p12_rel94,AAACCTGAGCCACCTG-1,AAACCTGAGTTTCCTT-1,AAACCTGCAAACAACA-1,AAACCTGCAAGAGGCT-1,AAACCTGCAAGGTTTC-1,AAACCTGGTCGTGGCT-1,AAACCTGGTGCTTCTC-1,AAACCTGGTGTGACGA-1,...,TTTGGTTTCCTAAGTG-1,TTTGGTTTCTACCAGA-1,TTTGTCAAGAAACCAT-1,TTTGTCAAGAACTGTA-1,TTTGTCAAGGCTAGGT-1,TTTGTCAAGGCTCTTA-1,TTTGTCACAAGCCGTC-1,TTTGTCACAGCGTTCG-1,TTTGTCACAGGATCGA-1,TTTGTCATCTGCGACG-1
0,ENSG00000181638,ZFP41,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000111875,ASF1A,0,0,1,1,0,1,0,0,...,0,0,1,1,2,1,0,0,0,1
2,ENSG00000176142,TMEM39A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000177186,OR2M7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000135624,CCT7,3,0,0,1,0,1,1,0,...,3,0,1,0,0,0,0,1,0,0


In [35]:
# Size of the joined table (the two lookup columns plus the matrix columns).
joined_data.shape

(30694, 4488)

In [38]:
# Build a composite row identifier of the form <ENSG_ID>_<genome>_<gene name>.
# The middle part is the sample's `genome` label, not `lookup_column`.
# NOTE(review): `genome` is inserted verbatim — confirm that labels with stray
# suffixes are cleaned before this point.
joined_data['gene_id'] = joined_data['ENSG_ID'] + "_" + genome + "_" + joined_data[lookup_column]
joined_data.head()

Unnamed: 0,ENSG_ID,Ensembl_GRCh38.p12_rel94,AAACCTGAGCCACCTG-1,AAACCTGAGTTTCCTT-1,AAACCTGCAAACAACA-1,AAACCTGCAAGAGGCT-1,AAACCTGCAAGGTTTC-1,AAACCTGGTCGTGGCT-1,AAACCTGGTGCTTCTC-1,AAACCTGGTGTGACGA-1,...,TTTGGTTTCTACCAGA-1,TTTGTCAAGAAACCAT-1,TTTGTCAAGAACTGTA-1,TTTGTCAAGGCTAGGT-1,TTTGTCAAGGCTCTTA-1,TTTGTCACAAGCCGTC-1,TTTGTCACAGCGTTCG-1,TTTGTCACAGGATCGA-1,TTTGTCATCTGCGACG-1,gene_id
0,ENSG00000181638,ZFP41,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000181638_GRCh38_ZFP41
1,ENSG00000111875,ASF1A,0,0,1,1,0,1,0,0,...,0,1,1,2,1,0,0,0,1,ENSG00000111875_GRCh38_ASF1A
2,ENSG00000176142,TMEM39A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000176142_GRCh38_TMEM39A
3,ENSG00000177186,OR2M7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000177186_GRCh38_OR2M7
4,ENSG00000135624,CCT7,3,0,0,1,0,1,1,0,...,0,1,0,0,0,0,1,0,0,ENSG00000135624_GRCh38_CCT7


In [40]:
# Make 'gene_id' the first column and remove the helper columns it was built from.
# The result is assigned to a new frame instead of dropping in place, and columns
# that are already gone are ignored, so running this cell a second time does not
# raise a KeyError for 'ENSG_ID'.
key_column = joined_data['gene_id']
joined_data = joined_data.drop(columns=['ENSG_ID', lookup_column, 'gene_id'], errors='ignore')
joined_data.insert(0, 'gene_id', key_column)
joined_data.head()

Unnamed: 0,gene_id,AAACCTGAGCCACCTG-1,AAACCTGAGTTTCCTT-1,AAACCTGCAAACAACA-1,AAACCTGCAAGAGGCT-1,AAACCTGCAAGGTTTC-1,AAACCTGGTCGTGGCT-1,AAACCTGGTGCTTCTC-1,AAACCTGGTGTGACGA-1,AAACCTGTCCGTTGTC-1,...,TTTGGTTTCCTAAGTG-1,TTTGGTTTCTACCAGA-1,TTTGTCAAGAAACCAT-1,TTTGTCAAGAACTGTA-1,TTTGTCAAGGCTAGGT-1,TTTGTCAAGGCTCTTA-1,TTTGTCACAAGCCGTC-1,TTTGTCACAGCGTTCG-1,TTTGTCACAGGATCGA-1,TTTGTCATCTGCGACG-1
0,ENSG00000181638_GRCh38_ZFP41,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ENSG00000111875_GRCh38_ASF1A,0,0,1,1,0,1,0,0,1,...,0,0,1,1,2,1,0,0,0,1
2,ENSG00000176142_GRCh38_TMEM39A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ENSG00000177186_GRCh38_OR2M7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ENSG00000135624_GRCh38_CCT7,3,0,0,1,0,1,1,0,0,...,3,0,1,0,0,0,0,1,0,0
