In [69]:
import pandas as pd
from src.utils import *
import seaborn as sns
import matplotlib.pyplot as plt
import random
from scipy.stats import truncnorm

In [47]:
real = pd.read_csv('./data/new/cleaned_celegans_crispr_data.csv')
real.head()

Unnamed: 0,sgRNA-1,sgRNA-2,target_name,sgRNA_cut,num_worms,num_lines,sgRNA_type,promoter_used,cas9_type,screening_method,repair_mechanism
0,GATTCTCGCGATAACCACGGTGG,,rde-12,Yes,160,95.0,DNA,K09B11.12 U6 promoter,circular plasmid DNA,Positive selectable markers,NHEJ
1,GGACATTGACACTAAAATCAAGG,,dpy-13,Yes,40,1.0,DNA,K09B11.12 U6 promoter,circular plasmid DNA,PCR and/or DNA sequencing,NHEJ
2,GTTATGAAGCTGACGGCGGACGG,,dpy-13,Yes,35,3.0,DNA,K09B11.12 U6 promoter,circular plasmid DNA,phenotypic analysis,NHEJ
3,GGCAATGCTGAGTGACACGGTGG,,dpy-7,Yes,300,240.0,DNA,K09B11.12 U6 promoter,circular plasmid DNA,phenotypic analysis,NHEJ
4,GCTACCATAGGCACCACGAGCGG,,dpy-10,Yes,250,246.0,DNA,K09B11.12 U6 promoter,circular plasmid DNA,phenotypic analysis,NHEJ


In [48]:
dumy = pd.read_csv('./data/new/sgRNA_dummy_candidates.csv')
dumy.head()

Unnamed: 0,sgRNA,target_name,true_PAM
0,GCTCGAAATTTCCCTGCTCTCGAGGCAAGG,Y57G11C.33a,1
1,GTCTGGGAGAACACAATTGTCAGGATAAAT,Y57G11C.33a,1
2,GGACGATGGCTCAAATGCGAAATGGCACAA,Y57G11C.33a,1
3,GTCAGTGAAATTGTGAAGGGAGGACTAGCG,C52A11.4i,1
4,GTCGGGAGAAAGAATGAACCGGGTGTATAT,C52A11.4i,1


In [49]:
real.corr()

Unnamed: 0,num_worms,num_lines
num_worms,1.0,0.879828
num_lines,0.879828,1.0


## Further Filtering dummy candidates by similarity between real data

In [50]:
real_melt_temps = list(map(melt_temp, list(real['sgRNA-1'])))
real_gc_counts = list(map(calc_gc, list(real['sgRNA-1'])))

In [51]:
dumy['melting_temp'] = pd.Series(map(melt_temp, list(dumy['sgRNA'])))
dumy['gc_counts'] = pd.Series(map(calc_gc, list(dumy['sgRNA'])))

In [52]:
dumy = dumy[((dumy['melting_temp'] > min(real_gc_counts)) & 
             (dumy['melting_temp'] < max(real_gc_counts))) & 
             ((dumy['gc_counts'] > min(real_melt_temps)) & 
             (dumy['gc_counts'] < max(real_melt_temps))) ].reset_index(drop=True)

In [53]:
dumy.shape

(92718, 5)

## Check the Variables and Look for Patterns

---


## sgRNA_cut

In [54]:
real['sgRNA_cut'] = real['sgRNA_cut'].map({'Yes': 1, 'No': 0})

In [59]:
pos_ratio = real[real['sgRNA_cut'] == 1].shape[0] / len(real['sgRNA_cut'])
neg_ratio = real[real['sgRNA_cut'] == 0].shape[0] / len(real['sgRNA_cut'])
print(pos_ratio, neg_ratio)

0.863013698630137 0.136986301369863


In [65]:
# Create artificial data points by keeping the real rations
pos_lst = []
neg_lst = []
pos_lst = [1] * int(np.ceil(pos_ratio*dumy.shape[0]))
neg_lst = [0] * int(np.floor(neg_ratio*dumy.shape[0]))
cut_lst = pos_lst + neg_lst
random.shuffle(cut_lst)

## num_worms

In [84]:
# Getting min and max of real data 
a, b = min(real['num_worms']), max(real['num_worms'])
# Getting mean and standard deviation of real data
mu, sigma = np.mean(real['num_worms']), np.std(real['num_worms'])
# Create the distribution object
dist = truncnorm((a - mu) / sigma, (b - mu) / sigma, loc=mu, scale=sigma)
# Creat data size number of values
values = dist.rvs(dumy.shape[0])
num_worms_lst = [int(i) for i in values]

## num_lines

In [85]:
# Getting min and max of real data 
a, b = min(real['num_lines']), max(real['num_lines'])
# Getting mean and standard deviation of real data
mu, sigma = np.mean(real['num_lines']), np.std(real['num_lines'])
# Create the distribution object
dist = truncnorm((a - mu) / sigma, (b - mu) / sigma, loc=mu, scale=sigma)
# Creat data size number of values
values = dist.rvs(dumy.shape[0])
num_lines_lst = [int(i) for i in values]

In [88]:
# Adding the simulated lists into dataframe
dumy['sgRNA_cut'] = pd.Series(cut_lst)
dumy['num_worms'] = pd.Series(num_worms_lst)
dumy['num_lines'] = pd.Series(num_lines_lst)

# Reorganize the columns
dumy = dumy[['sgRNA', 'target_name', 'sgRNA_cut', 'num_worms', 'num_lines',
                             'melting_temp', 'gc_counts', 'true_PAM']]

# Drop the unused columns 
dumy = dumy.drop(['melting_temp', 'gc_counts', 'true_PAM'], axis=1)

In [90]:
dumy.head()

Unnamed: 0,sgRNA,target_name,sgRNA_cut,num_worms,num_lines
0,GCTCGAAATTTCCCTGCTCTCGAGGCAAGG,Y57G11C.33a,1,115,97
1,GGACGATGGCTCAAATGCGAAATGGCACAA,Y57G11C.33a,1,73,4
2,GTCAGTGAAATTGTGAAGGGAGGACTAGCG,C52A11.4i,1,55,16
3,GTCGGGAGAAAGAATGAACCGGGTGTATAT,C52A11.4i,1,62,79
4,GTGAGACTCAACAAGAACCAGCTGGATTGT,C52A11.4i,1,142,24


In [91]:
dumy.to_csv('./data/new/artificial_sgRNA_c_elegans_data.csv', index=None)