In [11]:
from prepare_original_data import read_cd8_data_for_mageck, data_to_file_for_mageck
import pandas as pd

# Data Processing with MAGeCK

"The genome-wide CRISPRa (**Calabrese** A, catalog no. 92379 and Calabrese B, catalog no. 92380) and CRISPRi libraries (**Dolcetto** A, catalog no. 92385 and Dolcetto B, catalog no. 92386) (22) were obtained from Addgene. Forty nanograms of each library were transformed into Endura ElectroCompetent Cells (Lucigen, catalog no. 60242-2) following the manufacturer’s instructions. After transformation, Endura cells were grown in a shaking incubator for 16 hours at 30°C in the presence of ampicillin. Library plasmid has been isolated using the Plasmid Plus MaxiKit (Qiagen, catalog no. 12963) and sequenced for sgRNA representation as described under the section titled “Genome-wide CRISPRa and CRISPRi screens.”"

## Data source:

GSE174255: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE174255

"Reads were aligned to the appropriate reference library using MAGeCK version 0.5.9.2 (45) using the `--trim-5 22,23,24,25,26,28,29,30` argument to remove the staggered 5′ adapter. Next, raw read counts across both library sets were normalized to the total read count in each sample, and each of the matching samples across two sets were merged to generate a single normalized read count table. Normalized read counts in high versus low bins were compared using mageck test with `--norm-method none`, `--paired`, and `--control-sgrna` options, pairing samples by donor and using nontargeting sgRNAs as controls, respectively. Gene hits were classified as having a median absolute log2-fold change >0.5 and a false discovery rate (FDR) <0.05. For supplemental CD4+ screens (fig. S9), reads were aligned to the full Calabrese A and B library in a single reference file. For the supplemental CD4+ IFN-γ screen, which was sorted and sequenced as two technical replicates, normalized counts were averaged across technical replicates before analysis with mageck test."

## Data CRISPRa


CD8+ cells?

In [5]:
# Labels identifying this section's screen; later cells pass them to the export helper.
crispr_type, cell_type = 'CRISPRa', 'CD8'

# Load the count table through the project helper, selecting the screen by CRISPR type.
raw_data = read_cd8_data_for_mageck(crispr_type=crispr_type)

In [6]:
# Display the loaded table as the cell output (one row per sgRNA, one column per sample).
raw_data

Unnamed: 0,sgRNA,Gene,Plasmid,Donor1_IFNG_low,Donor1_IFNG_high,Donor1_IL2_low,Donor1_IL2_high,Donor1_IFNG_unsorted,Donor1_IL2_unsorted,Donor2_IFNG_low,Donor2_IFNG_high,Donor2_IL2_low,Donor2_IL2_high,Donor2_IFNG_unsorted,Donor2_IL2_unsorted
0,A1BG_CAAGACAGGGAAGATGAAGC,A1BG,98,64,97,91,75,76,66,88,56,94,73,79,65
1,A1BG_CACACCCCAGGCCACACCCC,A1BG,162,109,194,134,191,182,141,141,207,167,147,139,120
2,A1BG_CTGCGCCCAGGAACAAGACA,A1BG,35,62,63,27,58,50,46,47,62,61,52,34,23
3,A1CF_CTGCATGGATGCAAGAGACA,A1CF,88,74,52,44,97,67,78,51,90,57,78,65,43
4,A1CF_TGACCCAATTATCTGGTCAA,A1CF,133,94,82,91,86,107,85,125,78,97,97,72,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56471,ZZEF1_CTGAGACAGACTCGTCCTCT,ZZEF1,124,62,107,40,88,91,89,65,83,55,173,106,91
56472,ZZEF1_TCTACCAACTCACGACAGCT,ZZEF1,101,93,78,84,71,90,90,70,88,68,124,112,84
56473,ZZZ3_TCCTTTCTCCTCGTTGGGAC,ZZZ3,36,22,17,11,6,19,16,8,16,13,17,10,33
56474,ZZZ3_TTCTCCTCGTTGGGACTGGC,ZZZ3,82,50,61,46,54,58,62,61,45,39,77,83,51


In [8]:

# Hand the loaded table and the cell-type / CRISPR-type labels to the project helper.
# presumably writes the MAGeCK input files for this screen — confirm in prepare_original_data
data_to_file_for_mageck(raw_data, cell_type, crispr_type)


## Data CRISPRi
CD8 cells?

In [9]:
# Switch this section to the CRISPRi screen; the cell-type label stays 'CD8'.
# NOTE(review): the recorded output of this cell lists 'CRISPRa.DolcettoSet*.count'
# although crispr_type is 'CRISPRi' — confirm the helper reads the intended files.
crispr_type, cell_type = 'CRISPRi', 'CD8'

# Load the count table through the project helper, selecting the screen by CRISPR type.
raw_data = read_cd8_data_for_mageck(crispr_type=crispr_type)

CRISPRa.DolcettoSetA.count
CRISPRa.DolcettoSetB.count


In [10]:
# Hand the loaded table and the cell-type / CRISPR-type labels to the project helper.
# presumably writes the MAGeCK input files for this screen — confirm in prepare_original_data
data_to_file_for_mageck(raw_data, cell_type, crispr_type)

## Data Source
GSE190846: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE190846

CD4+ cells

In [27]:
# Labels for the CD4 section; the save cells below embed them in the output file names.
crispr_type, cell_type = 'CRISPRa', 'CD4'

In [25]:
# Read the tab-separated read-count table downloaded from GSE190846.
df_all = pd.read_csv(
    '../data/GSE190846/GSE190846_supp_CD4_CRISPR_screens_read_counts.tsv',
    sep='\t',
)

# The first two columns are relabelled as the sgRNA / gene identifiers.
base_cols = ['sgRNA', 'Gene']
# For every remaining (sample) column, drop the first underscore-separated token
# and keep at most the next four, e.g. 'Donor15_IFNG_high_Rep1'.
sample_cols = ['_'.join(col.split('_')[1:5]) for col in df_all.columns[2:]]
df_all.columns = base_cols + sample_cols
df_all

Unnamed: 0,sgRNA,Gene,Donor15_IFNG_high_Rep1,Donor15_IFNG_high_Rep2,Donor15_IFNG_low_Rep1,Donor15_IFNG_low_Rep2,Donor15_IL2_high_S6,Donor15_IL2_low_S5,Donor15_TNFa_high_S10,Donor15_TNFa_low_S9,Donor16_IFNG_high_Rep1,Donor16_IFNG_high_Rep2,Donor16_IFNG_low_Rep1,Donor16_IFNG_low_Rep2,Donor16_IL2_high_S8,Donor16_IL2_low_S7,Donor16_TNFa_high_S12,Donor16_TNFa_low_S11
0,A1BG_CAAGACAGGGAAGATGAAGC,A1BG,114,148,78,144,160,148,135,120,53,76,75,156,42,80,78,126
1,A1BG_CACACCCCAGGCCACACCCC,A1BG,360,546,165,253,219,163,311,249,431,449,217,291,239,208,272,176
2,A1BG_CACGCCCCAGGCCACACCCC,A1BG,668,1153,443,670,536,451,577,428,547,839,453,648,694,411,586,407
3,A1BG_CGGAGGAGCCCTCTGCGCCC,A1BG,293,447,267,317,191,292,284,207,193,291,171,323,123,225,213,224
4,A1BG_CTGCGCCCAGGAACAAGACA,A1BG,47,144,79,65,104,75,50,43,60,104,50,74,70,34,31,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113233,ZZZ3_TCCTTTCTCCTCGTTGGGAC,ZZZ3,38,88,85,86,99,78,42,42,70,69,98,155,59,127,78,69
113234,ZZZ3_TCGTTGGGACTGGCGGGCGG,ZZZ3,68,100,103,137,162,104,75,106,125,155,155,143,105,122,90,95
113235,ZZZ3_TTCTCCTCGTTGGGACTGGC,ZZZ3,94,227,186,270,182,234,126,181,248,293,184,344,247,166,214,180
113236,ZZZ3_TTGGGACTGGCGGGCGGCGG,ZZZ3,182,297,194,421,240,152,284,333,214,288,354,478,278,283,335,309


### Average replicates:
"For the supplemental CD4+ IFN-γ screen, which was sorted and sequenced as two technical replicates, normalized counts were averaged across technical replicates before analysis with mageck test."

In [26]:
# Collapse the two technical replicates (Rep1 / Rep2) of every IFNG sample into a
# single column holding their arithmetic mean, then drop the per-replicate columns.
# NOTE(review): the quoted methods average *normalized* counts; this cell averages
# whatever values df_all currently holds — confirm whether normalization should
# happen before this step.
donors = [15, 16]
hls = ['high', 'low']
ifng_means = {}
for d in donors:
    for hl in hls:
        sample = f'Donor{d}_IFNG_{hl}'
        if sample in df_all.columns:
            # Already averaged by a previous run of this cell (the Rep columns are
            # gone by then); skipping keeps the cell safe to re-run instead of
            # raising KeyError on the missing replicate columns.
            continue
        print(d, hl)
        ifng_means[sample] = (df_all[f'{sample}_Rep1'] + df_all[f'{sample}_Rep2']) / 2
# assign() returns a new frame rather than mutating the one displayed above; the
# averaged columns are appended at the end in loop order.
df_all = df_all.assign(**ifng_means)
# Keep every column whose name does not contain 'Rep' (drops the replicate columns).
df_all = df_all[[c for c in df_all.columns if 'Rep' not in c]]
# Trim sample names to their first three underscore-separated tokens, which removes
# trailing suffixes such as '_S6'; names that already have three tokens are unchanged.
df_all.columns = base_cols + ['_'.join(c.split('_')[0:3]) for c in df_all.columns[2:]]
df_all

15 high
15 low
16 high
16 low


Unnamed: 0,sgRNA,Gene,Donor15_IL2_high,Donor15_IL2_low,Donor15_TNFa_high,Donor15_TNFa_low,Donor16_IL2_high,Donor16_IL2_low,Donor16_TNFa_high,Donor16_TNFa_low,Donor15_IFNG_high,Donor15_IFNG_low,Donor16_IFNG_high,Donor16_IFNG_low
0,A1BG_CAAGACAGGGAAGATGAAGC,A1BG,160,148,135,120,42,80,78,126,131.0,111.0,64.5,115.5
1,A1BG_CACACCCCAGGCCACACCCC,A1BG,219,163,311,249,239,208,272,176,453.0,209.0,440.0,254.0
2,A1BG_CACGCCCCAGGCCACACCCC,A1BG,536,451,577,428,694,411,586,407,910.5,556.5,693.0,550.5
3,A1BG_CGGAGGAGCCCTCTGCGCCC,A1BG,191,292,284,207,123,225,213,224,370.0,292.0,242.0,247.0
4,A1BG_CTGCGCCCAGGAACAAGACA,A1BG,104,75,50,43,70,34,31,44,95.5,72.0,82.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113233,ZZZ3_TCCTTTCTCCTCGTTGGGAC,ZZZ3,99,78,42,42,59,127,78,69,63.0,85.5,69.5,126.5
113234,ZZZ3_TCGTTGGGACTGGCGGGCGG,ZZZ3,162,104,75,106,105,122,90,95,84.0,120.0,140.0,149.0
113235,ZZZ3_TTCTCCTCGTTGGGACTGGC,ZZZ3,182,234,126,181,247,166,214,180,160.5,228.0,270.5,264.0
113236,ZZZ3_TTGGGACTGGCGGGCGGCGG,ZZZ3,240,152,284,333,278,283,335,309,239.5,307.5,251.0,416.0


In [29]:
# Save the IDs of the NO-TARGET sgRNAs, one per line, with no header and no index.
# presumably used as the control list for `mageck test --control-sgrna` — confirm
input_folder = 'input_data'
nt_filename = f'{input_folder}/{crispr_type}.{cell_type}_NO-TARGET.supplemental.txt'
# Vectorized comparison with a single .loc lookup instead of a per-row lambda
# followed by chained indexing; selects the same rows and the 'sgRNA' column.
nt_df = df_all.loc[df_all['Gene'] == 'NO-TARGET', 'sgRNA']
nt_df.to_csv(nt_filename, index=False, header=False)
nt_df

63054    NO-TARGET_AAAAAGCTTCCGCCTGATGG
63055    NO-TARGET_AAAACAGGACGATGTGCGGC
63056    NO-TARGET_AAAACATCGACCGAAAGCGT
63057    NO-TARGET_AAAATAGCAGTAAACTCAAC
63058    NO-TARGET_AAAATCGATGGGCTGAATCT
                      ...              
64041    NO-TARGET_TTTGGTCAACGCATAGCTTG
64042    NO-TARGET_TTTTACCTTGTTCACATGGA
64043    NO-TARGET_TTTTGACTCTAATCACCGGT
64044    NO-TARGET_TTTTTAATACAAGGTAATCT
64045    NO-TARGET_TTTTTCTCACCCGATGAATC
Name: sgRNA, Length: 992, dtype: object

In [30]:
# Save the sgRNA count table as a tab-separated file without the index column.
filename = f'{input_folder}/{crispr_type}.{cell_type}_count.supplemental.txt'
df_all.to_csv(path_or_buf=filename, index=False, sep='\t')
