In [None]:
# Objective: Read the csv and obtain enzyme objects from them. 

In [14]:
import pandas as pd
from Bio.Seq import Seq

In [20]:
### restriction enzyme class designed by Lunkyadi Sucipto
class RestrictionEnzyme(Seq):
    def __init__(self,
                 enzyme_name:str,
                 enzyme_seq:str,
                 head_after_digestion:str,
                 tail_after_digestion:str
                 ):
        super().__init__(enzyme_seq)
        self.enz_name = enzyme_name
        self.head_add = - len(tail_after_digestion)
        self.tail_add = len(head_after_digestion)
        self.length = len(enzyme_seq)

In [21]:
df = pd.read_csv("enzymes.csv", sep=';')
df

Unnamed: 0,Name,Sequence,Overhang,Extreme,Temperature,length_recognition,Literature
0,AccII,CG^vCG,Blunt,Blunt,37.0,4bp,
1,AciI,CvCG^C,CG,5,37.0,4bp,
2,AfaI,GT^vAC,Blunt,Blunt,37.0,4bp,
3,AfiI,CCNN^NNNvNNGG,NNN,3,,4bp,
4,AluBI,AG^vCT,Blunt,Blunt,60.0,4bp,
...,...,...,...,...,...,...,...
438,XmaJI,CvCTAG^G,CTAG,5,,6bp,
439,XmnI,GAANN^vNNTTC,Blunt,Blunt,,6bp,
440,ZraI,GAC^vGTC,Blunt,Blunt,,6bp,
441,ZrmI,AGT^vACT,Blunt,Blunt,,6bp,


In [22]:
print(df.columns.tolist()) 

['Name', 'Sequence', 'Overhang', 'Extreme', 'Temperature', 'length_recognition', 'Literature']


In [23]:
### First case with enzymes that do not have spaces between the recognized sequence: 
# 1. first remove the Enzymes that recognize trough spaces / remove substring N  
# filter the rows that contain the substring
substring = 'N'
filter = df['Sequence'].str.contains(substring)
filtered_df = df[~filter]
filtered_df

Unnamed: 0,Name,Sequence,Overhang,Extreme,Temperature,length_recognition,Literature
0,AccII,CG^vCG,Blunt,Blunt,37.0,4bp,
1,AciI,CvCG^C,CG,5,37.0,4bp,
2,AfaI,GT^vAC,Blunt,Blunt,37.0,4bp,
4,AluBI,AG^vCT,Blunt,Blunt,60.0,4bp,
5,AluI,AG^vCT,Blunt,Blunt,,4bp,
...,...,...,...,...,...,...,...
437,XmaI,CvCCGG^G,CCGG,5,,6bp,
438,XmaJI,CvCTAG^G,CTAG,5,,6bp,
440,ZraI,GAC^vGTC,Blunt,Blunt,,6bp,
441,ZrmI,AGT^vACT,Blunt,Blunt,,6bp,


In [26]:
### We need now to create 2 columns for the enzumes, the sequence without ( ^ v) and the pieces that are kept, the tail and head. 
### for the tail and head we need to take into account only the ^ because thats the one that indicates where the cut is done in the same strand
### that is recognized by the enzyme. 
# 1. remove the v in another column 
filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence'].str.replace('v', '')
#print(filtered_df)
# 2. we divide the sequence then by the ^ Use the str.split method to split the data based on ^
filtered_df[['pt1', 'pt2']] = filtered_df['Sequence_wo_cuts'].str.split('^', expand=True) #part 1 and part 2. 
#print(filtered_df)
# 3. Finish the removal of non ATCG characters 
filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence_wo_cuts'].str.replace('^', '')
print(filtered_df)



      Name  Sequence Overhang Extreme  Temperature length_recognition  \
0    AccII    CG^vCG    Blunt   Blunt         37.0                4bp   
1     AciI    CvCG^C       CG       5         37.0                4bp   
2     AfaI    GT^vAC    Blunt   Blunt         37.0                4bp   
4    AluBI    AG^vCT    Blunt   Blunt         60.0                4bp   
5     AluI    AG^vCT    Blunt   Blunt          NaN                4bp   
..     ...       ...      ...     ...          ...                ...   
437   XmaI  CvCCGG^G     CCGG       5          NaN                6bp   
438  XmaJI  CvCTAG^G     CTAG       5          NaN                6bp   
440   ZraI  GAC^vGTC    Blunt   Blunt          NaN                6bp   
441   ZrmI  AGT^vACT    Blunt   Blunt          NaN                6bp   
442  Zsp2I  A^TGCAvT     TGCA       3          NaN                6bp   

    Literature Sequence_wo_cuts    pt1    pt2  
0          NaN             CGCG     CG     CG  
1          NaN             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence'].str.replace('v', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[['pt1', 'pt2']] = filtered_df['Sequence_wo_cuts'].str.split('^', expand=True) #part 1 and part 2.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Seq

In [35]:
# 4. Now depending on the Extreme from which the enzyme reads: 5' or 3' we define pt1 or pt2 as head or tail for the enzyme. The case for blunt end 
#    is the same as 5'. For now, to have a first working example we will use only the 5' and blunt. 
filtered_df = filtered_df[~filtered_df["Extreme"].str.contains("3", na=False)]
print(filtered_df)

      Name  Sequence Overhang Extreme  Temperature length_recognition  \
0    AccII    CG^vCG    Blunt   Blunt         37.0                4bp   
1     AciI    CvCG^C       CG       5         37.0                4bp   
2     AfaI    GT^vAC    Blunt   Blunt         37.0                4bp   
4    AluBI    AG^vCT    Blunt   Blunt         60.0                4bp   
5     AluI    AG^vCT    Blunt   Blunt          NaN                4bp   
..     ...       ...      ...     ...          ...                ...   
436   XhoI  CvTCGA^G     TCGA       5          NaN                6bp   
437   XmaI  CvCCGG^G     CCGG       5          NaN                6bp   
438  XmaJI  CvCTAG^G     CTAG       5          NaN                6bp   
440   ZraI  GAC^vGTC    Blunt   Blunt          NaN                6bp   
441   ZrmI  AGT^vACT    Blunt   Blunt          NaN                6bp   

    Literature Sequence_wo_cuts    pt1  pt2  
0          NaN             CGCG     CG   CG  
1          NaN             CCGC

In [None]:
filtered_df = filtered_df.reset_index()  # make sure indexes pair with number of rows


In [42]:
### restriction enzyme class designed by Lunkyadi Sucipto, the class includes an enzyme name, sequence, 
### head that is pt1 and tails that is pt2. 

### list of enzymes: 
enzyme_list = []
#
for index, row in filtered_df.iterrows():
    # a. create temporal 
    tmp = RestrictionEnzyme(enzyme_name=row['Name'],
                            enzyme_seq=row['Sequence_wo_cuts'],
                            head_after_digestion=row['pt1'],
                            tail_after_digestion=row['pt2'])
    # b. look at what is added 
    #print(row['Name'], row['Sequence_wo_cuts'],row['pt1'], row['pt2'])
    # c. add to list 
    enzyme_list.append(tmp)

# result: list of enzymes 
#print(len(enzyme_list))
#print(enzyme_list[1].enz_name,enzyme_list[1].head_add,enzyme_list[1].tail_add,enzyme_list[1].length)    

254
AciI -1 3 4


In [52]:
def csv_to_enzyme_list(        
        csv_file: str,
):
    import pandas as pd
    from Bio.Seq import Seq
    
    ### read csv 
    df = pd.read_csv(csv_file, sep=';')
    df
    ### First case with enzymes that do not have spaces between the recognized sequence: 
    # 1. first remove the Enzymes that recognize trough spaces / remove substring N  
    # filter the rows that contain the substring
    substring = 'N'
    filter = df['Sequence'].str.contains(substring)
    filtered_df = df[~filter]
    filtered_df
    ### We need now to create 2 columns for the enzumes, the sequence without ( ^ v) and the pieces that are kept, the tail and head. 
    ### for the tail and head we need to take into account only the ^ because thats the one that indicates where the cut is done in the same strand
    ### that is recognized by the enzyme. 
        # 1. remove the v in another column 
    filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence'].str.replace('v', '')
        #print(filtered_df)
        # 2. we divide the sequence then by the ^ Use the str.split method to split the data based on ^
    filtered_df[['pt1', 'pt2']] = filtered_df['Sequence_wo_cuts'].str.split('^', expand=True) #part 1 and part 2. 
        #print(filtered_df)
        # 3. Finish the removal of non ATCG characters 
    filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence_wo_cuts'].str.replace('^', '')
    #print(filtered_df)
        # 4. Now depending on the Extreme from which the enzyme reads: 5' or 3' we define pt1 or pt2 as head or tail for the enzyme. The case for blunt end 
        #    is the same as 5'. For now, to have a first working example we will use only the 5' and blunt. 
    filtered_df = filtered_df[~filtered_df["Extreme"].str.contains("3", na=False)]
    #print(filtered_df)
        #
    filtered_df = filtered_df.reset_index()  # make sure indexes pair with number of rows
    ######## 
    ### restriction enzyme class designed by Lunkyadi Sucipto, the class includes an enzyme name, sequence, 
    ### head that is pt1 and tails that is pt2. 

    ### list of enzymes: 
    enzyme_list = []
    #
    for index, row in filtered_df.iterrows():
        # a. create temporal 
        tmp = RestrictionEnzyme(enzyme_name=row['Name'],
                                enzyme_seq=row['Sequence_wo_cuts'],
                                head_after_digestion=row['pt1'],
                                tail_after_digestion=row['pt2'])
        # b. look at what is added 
        #print(row['Name'], row['Sequence_wo_cuts'],row['pt1'], row['pt2'])
        # c. add to list 
        enzyme_list.append(tmp)
    
    #print(len(enzyme_list))
    #print(enzyme_list[1].enz_name,enzyme_list[1].head_add,enzyme_list[1].tail_add,enzyme_list[1].length)    
    # result: list of enzymes 
    return enzyme_list


In [56]:
test = csv_to_enzyme_list(csv_file = "enzymes.csv")
print(len(test))
print(test[1].enz_name)

254
AciI


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Sequence_wo_cuts'] = filtered_df['Sequence'].str.replace('v', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[['pt1', 'pt2']] = filtered_df['Sequence_wo_cuts'].str.split('^', expand=True) #part 1 and part 2.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[['pt