In [1]:
import os
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import csv
import gzip
import anndata as ad
from pathlib import Path
import glob
from sklearn.preprocessing import StandardScaler


# Scanpy log level -- 0: errors, 1: warnings, 2: info, 3: hints.
sc.settings.verbosity = 3
# Print the versions of the packages in this session (see the table below).
sc.logging.print_versions()
# Figure defaults: dpi, dpi used when saving, and font size.
sc.set_figure_params(dpi=150, dpi_save=600, fontsize=10)

-----
anndata     0.8.0
scanpy      1.9.1
-----
MulticoreTSNE               NA
PIL                         9.1.0
appnope                     0.1.2
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
cffi                        1.15.0
cloudpickle                 2.1.0
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.9.0
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
executing                   0.8.3
fsspec                      2022.8.2
h5py                        3.7.0
hypergeom_ufunc             NA
igraph                      0.10.1
ipykernel                   6.9.1
ipython_genutils            0.2.0
ipywidgets                  7.6.5
jedi                        0.18.1
jinja2                      3.0.3
joblib   

In [2]:
# Show the notebook's working directory. os.getcwd() replaces the bare `pwd`
# automagic so the cell is valid Python and does not depend on IPython's
# automagic setting; the displayed value is the same path string.
os.getcwd()

'/Users/lidiayung/github/2022PhD/notebooks'

In [8]:
# Read the regulon table from a semicolon-delimited CSV file.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
regulons = pd.read_csv(
    '/Users/lidiayung/project/resource/regulons.csv',
    delimiter=';',
)

In [17]:
# Gather every string-valued cell of `regulons`, walking the table column by
# column and top to bottom within each column. Non-string cells are skipped.
# Values are kept verbatim (no case or whitespace normalisation), so spelling
# variants of the same name remain distinct entries.
string_list = [
    entry
    for col_name in regulons.columns
    for entry in regulons[col_name]
    if isinstance(entry, str)
]


In [18]:
# Display the collected names for inspection.
string_list

['SOX10',
 'SREBF1',
 'FOS',
 'LEF1',
 'SMAD6',
 'ERG',
 'BCL11B',
 'IRF4',
 'ETS1',
 'ELF1',
 'REL',
 'TFEC',
 'IRF8',
 'MAF',
 'TLX2',
 'MLXIPI',
 'NFIC',
 'EGR1',
 'E2F2',
 'FOXM1',
 'E2F7',
 'SOX8',
 'SOX11',
 'SOX5',
 'SOX2',
 'RBBP5',
 'NR112',
 'CEBPG',
 'BARX2',
 'CTCF',
 'MYCN',
 'KLF11',
 'RUNX1',
 'CREM',
 'RUNX3',
 'ELF4',
 'IKZF1',
 'NFATC2',
 'PRDM1',
 'JUN',
 'TCF7L1',
 'SoX9',
 'PRRX1',
 'PRRX1',
 'HES4',
 'SREBF2',
 'MAZ',
 'LHX5',
 'TCF4',
 'ASCL1',
 'SOX11',
 'SOX4',
 'E2F2',
 'ZEB1',
 'HES1',
 'GATA6',
 'NR1D1',
 'FOXC1',
 'CREB5',
 'FOX01',
 'SHOX2',
 'AR',
 'E2F7',
 'MYBL1',
 'E2F8',
 'MYBL2',
 'BRCA1',
 'E2F2',
 'PITX2',
 'RFX4',
 'SREBF1',
 'DDIT3',
 'STAT3',
 'EPAS1',
 'SOX2',
 'SOX5',
 'SOX10',
 'SOX4',
 'ELF2',
 'PBX3',
 'ZEB1',
 'TWIST1',
 'HMGA2',
 'ERG',
 'FOXP1',
 'JDP2',
 'SPI1',
 'MEF2C',
 'TFEC',
 'THAP1',
 'DUXA',
 'TBX20',
 'HNF4A',
 'NFIC',
 'TCF7L2',
 'TCF7L1',
 'NFATC4',
 'TEAD',
 'TEAD',
 'BCL6',
 'ESRRG',
 'BHLHE22',
 'SOX5',
 'ZMAT4',
 'CUX2',


In [20]:
from collections import Counter

# Tally how often each name occurs in `string_list`. Counter records keys in
# first-seen order, and dict() keeps `string_count` a plain dictionary.
string_count = dict(Counter(string_list))

# Keep only the names that occur more than once.
repeated_strings = {name: n for name, n in string_count.items() if n > 1}

In [22]:
import operator

# Rank the repeated names from most to least frequent. The sort is stable, so
# names with equal counts keep their existing relative order.
sorted_items = list(repeated_strings.items())
sorted_items.sort(key=operator.itemgetter(1), reverse=True)

# Emit a tab-separated table: header line first, then one row per name.
print("Item\tFrequency")
for name, freq in sorted_items:
    print(name, freq, sep="\t")


Item	Frequency
IRF8	5
SOX5	5
SOX4	5
SPI1	5
ELF1	4
E2F2	4
SOX11	4
ZEB1	4
CREB5	4
CUX2	4
SOX10	3
SREBF1	3
REL	3
E2F7	3
SOX2	3
TCF7L1	3
PRRX1	3
STAT3	3
ZMAT4	3
FOS	2
LEF1	2
ERG	2
ETS1	2
TFEC	2
NFIC	2
RBBP5	2
CEBPG	2
MYCN	2
IKZF1	2
JUN	2
SREBF2	2
MAZ	2
TCF4	2
FOXC1	2
FOX01	2
AR	2
MYBL1	2
MYBL2	2
BRCA1	2
RFX4	2
EPAS1	2
PBX3	2
JDP2	2
MEF2C	2
TEAD	2
ARNT2	2
IRF2	2
ETS2	2
FLI1	2
OLIG1	2
E2F1	2
MITF	2
TBX15	2
SOX6	2
E2F3	2
CEBPB	2
FOSL2	2
LHX1	2


In [25]:
# Read the gene table from a semicolon-delimited CSV file.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
genes = pd.read_csv(
    '/Users/lidiayung/project/resource/genes.csv',
    delimiter=';',
)

In [26]:
# Display the gene table to inspect what was loaded.
genes

Unnamed: 0,SF2777,SF2990,SF3076,SF9358,SF11916,SF11082,SF9798,SF3391
0,THSD7A,ADAMTS18,KAZN,NXPH1,THEMIS,TOX,GABBR2,STXBP6
1,PID1,NLGN1,NDRG1,RBFOX1,CCL5,ALK,CELF4,KANK4
2,LRRTM4,DLG2,ANGPTL4,ERBB4,SKAP1,C10orf90,PTPR,LINC00844
3,KCNIP4,UGT8,VEGFA,GALNTL6,PLXDC2,GPR39,ADARB2,RP4-630C24.3
4,KCNB2,AC093590.1,DNAH11,KAZN,EPB41L3,TRPM3,SEMA3E,SLC5A11
5,RYR2,LINC00463,JUN,SLIT2,ADAM28,CD44,CADPS,LINC01378
6,FSTL4,VWE,GLIS3,CLSTN2,VSIG4,SDK1,POSTN,ITGA8
7,ADARB,ADGRF5,RNF219-AS1,MYTIL,TNFRSF1B,PRUNE2,UNC5D,TLL1
8,MYT1L,ADAMTS2,ADGRV1,KCNMA1,C10orf11,TNIK,FAM83D,ARRB1
9,NEBL,PDGFRB,KCP,IQCJ-SCHIP1,PEX5L,3020.1,KIF18B,LRRK1


In [27]:
# Gather every string-valued cell of `genes`, walking the table column by
# column and top to bottom within each column. Non-string cells are skipped
# (presumably this includes the 'Unnamed: 0' column -- verify its dtype).
# NOTE: this rebinds `string_list`, replacing the names collected from
# `regulons` earlier in the notebook.
string_list = [
    entry
    for col_name in genes.columns
    for entry in genes[col_name]
    if isinstance(entry, str)
]


In [28]:
from collections import Counter

# Tally how often each name occurs in `string_list`. Counter records keys in
# first-seen order, and dict() keeps `string_count` a plain dictionary.
string_count = dict(Counter(string_list))

# Keep only the names that occur more than once.
repeated_strings = {name: n for name, n in string_count.items() if n > 1}

In [29]:
import operator

# Rank the repeated names from most to least frequent. The sort is stable, so
# names with equal counts keep their existing relative order.
sorted_items = list(repeated_strings.items())
sorted_items.sort(key=operator.itemgetter(1), reverse=True)

# Emit a tab-separated table: header line first, then one row per name.
print("Item\tFrequency")
for name, freq in sorted_items:
    print(name, freq, sep="\t")


Item	Frequency
SKAP1	5
RYR2	3
PDGFRB	3
ADARB2	3
VEGFA	3
KCNIP4	2
FSTL4	2
CADPS	2
RNF220	2
VWE	2
COL1A2	2
SLFN121	2
TRPM3	2
KIF18B	2
TOP2A	2
KAZN	2
ANGPTL4	2
ST18	2
EEPD1	2
PLXDC2	2
CD44	2
CENPF	2
RBFOX1	2
GALNTL6	2
MYTIL	2
IQCJ-SCHIP1	2
DTL	2
LHFPL3	2
SRGN	2
ADGRL4	2
FCGBP	2
