In [1]:
import pandas as pd
import csv
import random
import numpy as np
from typing import List

### Load the sample/source

In [9]:
''' Change these vars '''
sample_filename: str = "./data/mini/sampled_sgRNA_design.txt";
source_filename: str = "./data/source_sgRNA_enAsCas12a_ko_en.txt";

df = pd.read_csv(sample_filename, sep='\t'); # change to source for real deal
df_len = len(df.index)
%time
df

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs


Unnamed: 0,Input,Quota,Target Taxon,Target Gene ID,Target Gene Symbol,Target Transcript,Target Alias,CRISPR Mechanism,Target Domain,Reference Sequence,...,On-Target Ruleset,On-Target Efficacy Score,On-Target Rank,Off-Target Rank,On-Target Rank Weight,Off-Target Rank Weight,Combined Rank,Pick Order,Picking Round,Picking Notes
0,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.8569,27,143,1.0,1.0,32,23.0,2.0,Previously skipped due to: Spacing Violation: ...
1,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.4900,294,77,1.0,1.0,142,329.0,4.0,Previously skipped due to: Outside Target Wind...
2,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.4440,331,45,1.0,1.0,148,92.0,2.0,Previously skipped due to: Spacing Violation: ...
3,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.9154,11,410,1.0,1.0,184,111.0,2.0,Previously skipped due to: Spacing Violation: ...
4,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.5483,240,370,1.0,1.0,365,,,Spacing Violation: Too close to earlier pick a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135384,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.5379,257,376,1.0,1.0,335,179.0,2.0,Previously skipped due to: Spacing Violation: ...
135385,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.4902,343,424,1.0,1.0,447,500.0,4.0,Previously skipped due to: Outside Target Wind...
135386,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.3797,502,399,1.0,1.0,527,276.0,2.0,Previously skipped due to: Spacing Violation: ...
135387,ENSMUSG00000118640,MAX,10090,ENSMUSG00000118640,AC167036.2,ENSMUST00000239185.1,,CRISPRko,CDS,NC_000067.6,...,enPAM+GB,0.3833,282,38,1.0,1.0,141,198.0,3.0,Previously skipped due to: Outside Target Wind...


## Utility Functions

In [23]:
# String to String; Returns the sequence read back to front
def util_reverse_seq(seq: str):
    """Return `seq` with its characters in reverse order (last one first)."""
    backwards = slice(None, None, -1)  # same as the [::-1] slice
    return seq[backwards]

# String to String; Returns the base-by-base complement of the sequence
def util_complement(seq: str):
    """Return the complement of a DNA sequence, base by base.

    A<->T and G<->C are swapped. Case is preserved, so lowercase bases are
    complemented to lowercase instead of being passed through untouched.
    Any other character (e.g. 'N') is returned unchanged, and the order of
    the bases is NOT reversed.
    """
    # Named `table` so the builtin `map` is not shadowed inside the function.
    table = str.maketrans("ATGCatgc", "TACGtacg")

    return seq.translate(table)
    

## Exploration

In [11]:
# Distinct string lengths found in the context sequence column, ascending
context_seq_lengths = df['sgRNA Context Sequence'].str.len()
tmp = context_seq_lengths.sort_values().unique()
print(f"Length(s) of seq in this col: {tmp}")

Length(s) of seq in this col: [34]


In [12]:
# Count rows whose context sequence reads 'GG' in the slice [25:27]
# (0-based string positions 25 and 26)
context_seqs = df['sgRNA Context Sequence']
has_gg = context_seqs.str[25:27] == 'GG'
tmp = np.count_nonzero(has_gg)
print(f"Count of GG's in bp25-26: {tmp}")
# Baseline: 'GG' is one of 4 * 4 = 16 possible two-letter combinations
print(f"Count expected if GG's are random: {df_len / 16}")

Count of GG's in bp25-26: 9259
Count expected if GG's are random: 8461.8125


## Filtering 

In [13]:
# Keep only the rows whose Tier I / Bin I off-target match column is not the
# literal string 'MAX'
is_not_max = df['# Off-Target Tier I Match Bin I Matches'] != 'MAX'
df_filtered = df[is_not_max]
df_filtered

Unnamed: 0,Input,Quota,Target Taxon,Target Gene ID,Target Gene Symbol,Target Transcript,Target Alias,CRISPR Mechanism,Target Domain,Reference Sequence,...,On-Target Ruleset,On-Target Efficacy Score,On-Target Rank,Off-Target Rank,On-Target Rank Weight,Off-Target Rank Weight,Combined Rank,Pick Order,Picking Round,Picking Notes
0,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.8569,27,143,1.0,1.0,32,23.0,2.0,Previously skipped due to: Spacing Violation: ...
1,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.4900,294,77,1.0,1.0,142,329.0,4.0,Previously skipped due to: Outside Target Wind...
2,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.4440,331,45,1.0,1.0,148,92.0,2.0,Previously skipped due to: Spacing Violation: ...
3,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.9154,11,410,1.0,1.0,184,111.0,2.0,Previously skipped due to: Spacing Violation: ...
4,ENSMUSG00000000001,MAX,10090,ENSMUSG00000000001,Gnai3,ENSMUST00000000001.4,,CRISPRko,CDS,NC_000069.6,...,enPAM+GB,0.5483,240,370,1.0,1.0,365,,,Spacing Violation: Too close to earlier pick a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135384,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.5379,257,376,1.0,1.0,335,179.0,2.0,Previously skipped due to: Spacing Violation: ...
135385,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.4902,343,424,1.0,1.0,447,500.0,4.0,Previously skipped due to: Outside Target Wind...
135386,ENSMUSG00000118638,MAX,10090,ENSMUSG00000118638,AL805980.1,ENSMUST00000239206.1,,CRISPRko,CDS,NC_000086.7,...,enPAM+GB,0.3797,502,399,1.0,1.0,527,276.0,2.0,Previously skipped due to: Spacing Violation: ...
135387,ENSMUSG00000118640,MAX,10090,ENSMUSG00000118640,AC167036.2,ENSMUST00000239185.1,,CRISPRko,CDS,NC_000067.6,...,enPAM+GB,0.3833,282,38,1.0,1.0,141,198.0,3.0,Previously skipped due to: Outside Target Wind...


## Generate Michlits-compatible input tsv

Generate an empty dataframe with the correct columns

In [14]:
# Column names, in order, for the empty Michlits input table.
# NOTE(review): 'ntseq' is listed twice, so the frame gets two columns with
# the same label -- confirm against the expected input format before changing.
mich_cols: List[str] = [
    'sgRNA_ID',  # optional
    'ntseq', 'phylo21', 'phast21',
    'Pfam_domain', 'Pfam_family',
    'distance', 'mod3', 'exlen',
    'SD_distance', 'SA_distance',
    'ntseq',
    'AAcons7',  # optional?
    'AA_before_cut', 'AA_at_cut', 'AA_after_cut',
    'Framshift_ratio_inDelphi',  # leave blank (not listed as optional)
    'Doench2016',  # leave blank
    'tracrv2',  # leave blank
]

# Zero rows; only the header is defined at this point.
michlits = pd.DataFrame(data=None, columns=mich_cols)
michlits

Unnamed: 0,sgRNA_ID,ntseq,phylo21,phast21,Pfam_domain,Pfam_family,distance,mod3,exlen,SD_distance,SA_distance,ntseq.1,AAcons7,AA_before_cut,AA_at_cut,AA_after_cut,Framshift_ratio_inDelphi,Doench2016,tracrv2


In [20]:
# Try to get the CDS sequence for one gene: ENSMUSG00000000001
import requests, sys  # `sys` is no longer used in this cell; the import is kept in case other cells rely on it

server = "https://rest.ensembl.org"
ext = "/sequence/id/ENSMUSG00000000001?type=cds"

# Without a timeout, requests waits indefinitely on an unresponsive server
# and the kernel hangs; 30 s is a generous upper bound for one sequence.
r = requests.get(server+ext, headers={ "Content-Type" : "text/plain"}, timeout=30)

# Raises requests.HTTPError on any 4xx/5xx response and does nothing
# otherwise. This is exactly the case the old `if not r.ok` guard covered;
# the `sys.exit()` after it could never be reached.
r.raise_for_status()

print(r.text)
print(len(r.text))

ATGGGCTGCACGTTGAGCGCCGAGGACAAGGCGGCGGTGGAGCGGAGCAAGATGATCGACCGCAACTTGCGGGAGGACGGGGAGAAAGCGGCCAAAGAAGTGAAGCTGCTGCTGCTCGGCGCTGGAGAATCTGGTAAAAGTACCATCGTGAAACAGATGAAAATCATTCATGAGGACGGCTATTCAGAGGACGAATGTAAACAGTATAAAGTAGTTGTCTACAGCAATACTATTCAGTCCATCATTGCAATCATACGAGCCATGGGACGGTTGAAGATTGATTTTGGGGAATCTGCCAGAGCAGATGATGCCCGACAGTTATTTGTTTTAGCTGGGAGTGCTGAAGAAGGAGTCATGACTTCAGAACTAGCAGGCGTGATTAAACGTTTATGGCGAGATGGCGGGGTACAGGCATGCTTTAGCAGGTCCAGGGAATATCAGCTCAATGATTCTGCTTCATACTACCTAAATGATTTGGATAGAATATCCCAGACCAACTACATTCCAACTCAGCAAGATGTTCTTCGGACAAGAGTGAAGACTACAGGCATTGTGGAGACCCACTTCACCTTCAAGGAACTCTACTTCAAAATGTTTGATGTAGGTGGCCAAAGATCCGAACGAAAAAAGTGGATTCACTGTTTTGAGGGAGTGACAGCAATTATCTTTTGTGTGGCTCTCAGTGATTACGACCTTGTTCTGGCTGAGGATGAGGAAATGAACCGAATGCATGAAAGCATGAAATTGTTTGACAGCATTTGTAACAACAAATGGTTTACAGACACTTCAATCATTCTCTTTCTTAATAAGAAAGACCTTTTTGAGGAAAAAATAAAGAGGAGTCCATTAACAATCTGTTATCCAGAATACACAGGTTCCAATACATACGAAGAGGCAGCTGCTTACATTCAGTGCCAGTTTGAAGATCTGAACCGAAGAAAAGATACCAAGGAGGTCTACACTCACTTCACCTGTGCCACAGACACCAAAAATGTGCAGT