# Enables running the process to make FASTA files for gRNA outside a script

### import needed modules

In [1]:
import csv
import os
import pandas as pd
import sys
import subprocess

import cauldron_sdk as api
from build_fastas.get_gene_mrna import get_gene_mrna
from build_fastas.make_gene_crRNA_fastas import make_FASTA

%load_ext autoreload
%autoreload 2

### define functions

In [2]:
def get_cauldron_guides(guide_list):
    """
    Query Cauldron for each guide REC ID and collect the hits in a dataframe.

    Parameters
    ----------
    guide_list : iterable
        One entry per guide. Each entry is passed unchanged as the
        ``rec_ids`` argument of ``api.guides.find``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``rec_id`` and ``sequence`` (``sequence`` holds the
        record's ``target_sequence`` field), one row per guide that was found.
        The frame is empty (but keeps its columns) when nothing was found.

    Notes
    -----
    A guide whose lookup fails (no record returned, missing field, API error)
    is reported with ``print`` and skipped; it does not abort the loop.
    Only ``Exception`` subclasses are caught, so a KeyboardInterrupt can still
    stop this slow loop.
    """
    columns = ['gene', 'rec_id', 'sequence']
    rows = []
    for guide in guide_list:
        try:
            # Only the first record returned for this guide is used.
            hit = list(api.guides.find(rec_ids=guide))[0]
            rows.append([hit['gene'], hit['rec_id'], hit['target_sequence']])
        except Exception as err:
            # Include the error so a missing guide can be told apart from
            # a connection or schema problem.
            print('Did not find ', guide, '-', repr(err))
    return pd.DataFrame(rows, columns=columns)

def get_file_guides(guide_list, master_list):
    '''
    Select the rows of a master guide dataframe that match a list of guides.

    Parameters
    ----------
    guide_list : iterable
        Guide identifiers to keep. Entries may be plain values or list/tuple
        rows (e.g. rows produced by csv.reader); rows are flattened so every
        cell is treated as a guide identifier.
    master_list : pandas.DataFrame
        Must contain the columns 'guide', 'gene' and 'sequence', in any order.

    Returns
    -------
    pandas.DataFrame
        The matching rows with columns ordered 'gene', 'guide', 'sequence'.

    Raises
    ------
    KeyError
        If master_list lacks one of the required columns.
    '''
    # NOTE(review): get_cauldron_guides names its identifier column 'rec_id'
    # while this frame keeps 'guide' -- confirm which one downstream steps expect.
    wanted = []
    for entry in guide_list:
        if isinstance(entry, (list, tuple)):
            wanted.extend(entry)
        else:
            wanted.append(entry)
    out_df = master_list[master_list['guide'].isin(wanted)]
    # A list of labels selects several columns; a bare tuple would be looked
    # up as a single (non-existent) column name.
    return out_df[['gene', 'guide', 'sequence']]

### set and check some useful file locations

In [3]:
# Relative locations of the helper scripts and of the scratch directory
get_genes = 'build_fastas/get_gene_mrna.py'
make_fasta = 'build_fastas/make_gene_crRNA_fastas.py'
temp_dir = 'build_fastas/temp'

# Scratch files that live inside temp_dir
genes_out, df_out, collected_genes = (
    os.path.join(temp_dir, file_name)
    for file_name in ('gene_list.csv', 'temp.csv', 'genome_mRNA.fasta')
)

# Report any helper script that is missing (message only, execution continues)
for script_path in (get_genes, make_fasta):
    if not os.path.isfile(script_path):
        print(script_path, 'not found')

# Create the scratch directory if it does not exist yet
if not os.path.isdir(temp_dir):
    os.mkdir(temp_dir)

### User input: set the source of the gRNA list

In [4]:
# Path to the CSV listing the guides to process (user-specific absolute path;
# edit this before running on another machine).
gRNA_file = '/Users/chris.johnson/Documents/scripts/python/CRISPR_genotyping/20201207_platform_exp_1/Platform genotype exp 1 Guides.csv'

In [5]:
# Directory containing the gRNA CSV; passed to make_FASTA in the final cell.
working_path = os.path.dirname(gRNA_file)

In [6]:
# Read every row of the gRNA CSV. Each row is kept as a list of its cells,
# so guides[0] is the header row and guides[1:] are the guide entries.
# newline='' is the mode the csv module documents for files given to csv.reader.
with open(gRNA_file, 'r', newline='') as infile:
    readfile = csv.reader(infile, delimiter=',')
    guides = list(readfile)

In [7]:
# Show the parsed rows; the first row is the CSV header.
guides

[['Guide'],
 ['REC-GRNA-0028168'],
 ['REC-GRNA-0028166'],
 ['REC-GRNA-0028169'],
 ['REC-GRNA-0028173'],
 ['REC-GRNA-0027594'],
 ['REC-GRNA-0027596'],
 ['REC-GRNA-0027598'],
 ['REC-GRNA-0027600'],
 ['REC-GRNA-0027883'],
 ['REC-GRNA-0027876'],
 ['REC-GRNA-0027879'],
 ['REC-GRNA-0027880'],
 ['REC-GRNA-0010883'],
 ['REC-GRNA-0010884'],
 ['REC-GRNA-0010885'],
 ['REC-GRNA-0027306'],
 ['REC-GRNA-0018851'],
 ['REC-GRNA-0018852'],
 ['REC-GRNA-0018853'],
 ['REC-GRNA-0018856'],
 ['REC-GRNA-0027934'],
 ['REC-GRNA-0027936'],
 ['REC-GRNA-0027941'],
 ['REC-GRNA-0027942'],
 ['REC-GRNA-0027456'],
 ['REC-GRNA-0027454'],
 ['REC-GRNA-0027458'],
 ['REC-GRNA-0027460'],
 ['REC-GRNA-0027521'],
 ['REC-GRNA-0027514'],
 ['REC-GRNA-0027519'],
 ['REC-GRNA-0027522'],
 ['REC-GRNA-0000324'],
 ['REC-GRNA-0000323'],
 ['REC-GRNA-0028389'],
 ['REC-GRNA-0028394'],
 ['REC-GRNA-0025574'],
 ['REC-GRNA-0025575'],
 ['REC-GRNA-0025577'],
 ['REC-GRNA-0025578'],
 ['REC-GRNA-0028219'],
 ['REC-GRNA-0028217'],
 ['REC-GRNA-0028221'],

### Use this if you are able to connect to Cauldron to recover guide info

In [8]:
# Skip the header row (guides[0]) and look up each remaining row in Cauldron.
GnC_df = get_cauldron_guides(guides[1:]) # this takes a while

### Use this instead if you are using a master file containing guide information (set `master_file` to its path first)

# Path to a master guide CSV (needs 'guide', 'gene' and 'sequence' columns).
# Leave it empty to keep the GnC_df obtained from Cauldron.
master_file = ''

# Only take the file-based route once a path has been supplied; reading the
# empty placeholder would raise and stop a Run All.
if master_file:
    master_list = pd.read_csv(master_file)
    print(master_list.head())
    GnC_df = get_file_guides(guides[1:], master_list)

### Save the guide information to a file, where it will be used later
This is awkward and could be removed by improving the overall pipeline so that subsequent steps can have a dataframe passed to them instead of a file location.

In [9]:
# Write the guide table to the temp CSV (df_out) without the index column.
GnC_df.to_csv(df_out, index = False)

### Run the scripts that will make the FASTA files

In [10]:
# get_gene_mrna is imported from the python script build_fastas/get_gene_mrna.py
# and is given the path of the guide CSV (df_out).
# It will take a couple of minutes and emits pandas "set on a copy of a slice"
# warnings as it executes.
get_gene_mrna(df_out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.genes['gene'] = self.genes['attributes'].apply(gene_name_parse)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.genes['geneID'] = self.genes['attributes'].apply(gene_id_parse)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.genes['biotype'] = self.genes['attributes'].apply(biotype_parse)


Loaded GFF


In [11]:
# make_FASTA is imported from the python script build_fastas/make_gene_crRNA_fastas.py.
# Arguments: the collected mRNA FASTA path, the guide CSV path, and the directory
# of the gRNA file (presumably where the FASTA files are written -- TODO confirm).
make_FASTA(collected_genes, df_out, working_path)