In [1]:
import pandas as pd

import ablang
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import matplotlib
import matplotlib.pyplot as plt

#rojan's class definition

class OASDBDesc:
    """Load paired antibody sequence tables and derive sequence encodings.

    The methods cover four steps: reading gzipped paired-sequence CSV exports,
    AbLang sequence embeddings, a one-hot encoding of a column's values, and
    residue-count / physicochemical descriptors.
    """

    def __init__(self):
        pass

    def read_data(self, rawdata_dir):
        """Gather the ``.gz`` files in ``rawdata_dir`` and stack the columns of interest.

        Parameters
        ----------
        rawdata_dir : str
            Directory that holds gzip-compressed, comma-separated files. The
            first line of every file is skipped before the header is parsed.

        Returns
        -------
        pandas.DataFrame
            The selected columns of every file, concatenated with a fresh
            0..n-1 index. Empty (but with the expected columns) when the
            directory contains no ``.gz`` file.

        Raises
        ------
        KeyError
            If a file lacks one of the expected columns.
        """
        t_cols = ['v_call_heavy', 'd_call_heavy', 'j_call_heavy', 'sequence_alignment_aa_light',
                  'sequence_alignment_aa_heavy', 'ANARCI_status_light', 'ANARCI_status_heavy']

        # os.listdir returns entries in arbitrary order; sort so that the row
        # order of the result is reproducible between runs.
        paired_files = sorted(
            os.path.join(rawdata_dir, f)
            for f in os.listdir(rawdata_dir)
            if f.endswith(".gz")
        )

        # Collect the per-file frames first and concatenate once: growing a
        # DataFrame with pd.concat inside the loop copies all rows every time.
        frames = [
            pd.read_csv(paired_file, compression='gzip', sep=',', skiprows=1)[t_cols]
            for paired_file in paired_files
        ]
        if not frames:
            return pd.DataFrame(columns=t_cols)
        return pd.concat(frames, ignore_index=True)

    def _ablang_seqcoding(self, chain, seqs):
        """Return the 'seqcoding' output of the frozen pretrained AbLang ``chain`` model."""
        model = ablang.pretrained(chain)
        model.freeze()
        # Hand the model a plain list of strings rather than a pandas Series.
        return model(list(seqs), mode='seqcoding')

    def encode_seq(self, df, column, rows=slice(1, 30)):
        """Compute AbLang seq-codings for the heavy and light chain sequences.

        Of the main families of protein encodings (binary, physicochemical,
        evolution-based, structure-based, machine-learning), this method
        produces the machine-learning one using AbLang.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``sequence_alignment_aa_heavy`` and
            ``sequence_alignment_aa_light``.
        column : str
            Currently unused; both chain columns are always encoded. Kept so
            existing calls keep working.
        rows : label-based row indexer, default ``slice(1, 30)``
            Passed to ``df.loc``. The default reproduces the previous
            hard-coded ``df.loc[1:30]``, i.e. labels 1 through 30 inclusive.
            NOTE(review): this skips label 0 - confirm that is intended;
            pass ``slice(None)`` to encode every row.

        Returns
        -------
        tuple
            ``(seqcodings_heavy, seqcodings_light)``. The shape and the values
            are also printed, as before.
        """
        codings = []
        for chain in ("heavy", "light"):
            seqs = df.loc[rows, 'sequence_alignment_aa_' + chain]
            seqcodings = self._ablang_seqcoding(chain, seqs)

            print("-"*100)
            print("The output shape of the " + chain + " seq-codings:", seqcodings.shape)
            print("-"*100)

            print(seqcodings)
            codings.append(seqcodings)

        return tuple(codings)

    def one_hot_encode_seq(self, df, column):
        """One-hot encode the distinct values of ``df[column]``.

        Every distinct value of the column receives an integer code (its
        position among the sorted unique values); the codes are then expanded
        into indicator columns named ``integer_encoded_letters_<code>``. The
        indicator for code 0 is dropped (``drop_first=True``).

        Parameters
        ----------
        df : pandas.DataFrame
            Input table. It is not modified.
        column : str
            Name of the column whose values are encoded.

        Returns
        -------
        pandas.DataFrame
            A copy of ``df`` with the indicator columns appended.
        """
        # Work on a copy so the caller's frame does not gain a helper column.
        encoded = df.copy()

        # Integer-encode: np.unique sorts the distinct values and, with
        # return_inverse, yields each row's position among them.
        _, integer_codes = np.unique(encoded[column].to_numpy(), return_inverse=True)

        # Assign the raw array (positional) rather than a RangeIndex Series,
        # which would misalign when df does not have a default 0..n-1 index.
        encoded['integer_encoded_letters'] = integer_codes

        # One-hot encode the integer codes.
        return pd.get_dummies(encoded, prefix=['integer_encoded_letters'],
                              columns=['integer_encoded_letters'], drop_first=True)

    def physchemvh_gen(self, df, column, residue_info=None, norm_length=115):
        """Build residue-count and physicochemical descriptors for each sequence.

        Parameters
        ----------
        df : pandas.DataFrame
            Input table; ``df[column]`` holds amino-acid strings.
        column : str
            Name of the sequence column.
        residue_info : pandas.DataFrame, optional
            Table indexed by one-letter residue code with a
            ``'Hydropathy Score'`` column. When omitted it is read from
            ``residue_dict_copy.csv`` in the working directory.
        norm_length : int or float, default 115
            Divisor applied to each sequence's summed hydropathy to obtain the
            ``ave`` column. NOTE(review): 115 was hard-coded; presumably a
            nominal sequence length - confirm.

        Returns
        -------
        tuple
            ``("One Hot Encoding: ", one_hot_df, "Physiochemical Properties
            Encoding: ", physchemvh)``. ``one_hot_df`` is the result of
            ``one_hot_encode_seq(df, column)``. ``physchemvh`` has one row per
            sequence (0..n-1 index) with a count column per residue, the group
            totals ``Hydro``, ``Amph``, ``Polar``, ``Charged`` and ``ave``.

        Raises
        ------
        ValueError
            If ``norm_length`` is not positive.
        KeyError
            If ``residue_info`` has no score for one of the 20 standard residues.
        """
        if norm_length <= 0:
            raise ValueError("norm_length must be positive")
        if residue_info is None:
            residue_info = pd.read_csv("residue_dict_copy.csv", header = 0, index_col = 0)

        standard = sorted('ACDEFGHIKLMNPQRSTVWY')
        standard_set = set(standard)
        sequences = list(df[column])

        # Characters outside the 20 standard residues (e.g. alignment gaps)
        # still get a count column, appended in order of first appearance.
        extras = list(dict.fromkeys(
            ch for seq in sequences for ch in seq if ch not in standard_set
        ))
        residues = standard + extras

        # One row per sequence, one column per residue; built in a single pass
        # instead of concatenating a value_counts() column per sequence.
        res_counts = pd.DataFrame(
            [[seq.count(res) for res in residues] for seq in sequences],
            columns=residues,
        ).astype('int64')

        scores = residue_info['Hydropathy Score']
        missing = [res for res in standard if res not in scores.index]
        if missing:
            raise KeyError("residue_info has no 'Hydropathy Score' for: " + ", ".join(missing))

        # Count-weighted hydropathy per sequence. Extra characters without a
        # score are left out of the sum instead of aborting the whole run.
        scored = [res for res in residues if res in scores.index]
        hydropathy_sum = res_counts[scored].dot(scores.loc[scored])

        res_counts['Hydro'] = res_counts[['A', 'I', 'L', 'F', 'V']].sum(axis=1)
        # NOTE(review): 'M' contributes to both Amph and Polar - confirm this is intended.
        res_counts['Amph'] = res_counts[['W', 'Y', 'M']].sum(axis=1)
        res_counts['Polar'] = res_counts[['Q', 'N', 'S', 'T', 'C', 'M']].sum(axis=1)
        res_counts['Charged'] = res_counts[['R', 'K', 'D', 'E', 'H']].sum(axis=1)

        physchemvh = res_counts.assign(ave=hydropathy_sum / norm_length)

        # The one-hot frame was previously referenced here without ever being
        # computed; build it from the same input so the 4-tuple is complete.
        df_dummies = self.one_hot_encode_seq(df, column)

        return "One Hot Encoding: ", df_dummies, "Physiochemical Properties Encoding: ", physchemvh


# In[8]:


# Example usage: instantiate the helper class. The commented-out lines show
# how the raw paired .gz exports would be loaded through read_data().
#rawdata_dir = "/datasets/merck-files/Merck-files/"
obj = OASDBDesc()
#df_seq = obj.read_data(rawdata_dir)


# In[ ]:
# Load the sampled sequence table from CSV and preview its first five rows.
df_seq =  pd.read_csv("/datasets/merck-files/Merck-files/sampled_df.csv")

df_seq.head() 

# TODO: annotate all of these columns


# In[ ]:

#obj.encode_seq(df_seq, "sequence_alignment_aa_heavy")






  from .autonotebook import tqdm as notebook_tqdm
2023-03-15 00:23:13 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


Unnamed: 0,sequence_id_heavy,sequence_heavy,locus_heavy,stop_codon_heavy,vj_in_frame_heavy,v_frameshift_heavy,productive_heavy,rev_comp_heavy,complete_vdj_heavy,v_call_heavy,...,cdr3_end_light,np1_light,np1_length_light,np2_light,np2_length_light,c_region_light,Isotype_light,Redundancy_light,ANARCI_numbering_light,ANARCI_status_light
0,GAACGGAAGTAGCCGA-1_contig_1,AGCTCTGAGAGAGGAGCCTTAGCCCTGGATTCCAAGGCCTATCCAC...,H,F,T,F,T,F,T,IGHV3-21*02,...,388.0,TC,2.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
1,TACTTACTCAACACAC-1_contig_1,ATCATCCAACAACCACATCCCTTCTCTACAGAAGCCTCTGAGAGGA...,H,F,T,F,T,F,T,IGHV1-46*01,...,407.0,,0.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'V', '3 ': 'V', '4 ...",|||||
2,AAGGTTCTCGTAGGAG-1_contig_1,CCACATCCCTCCTCAGAAGCCCCCAGAGCACAACGCCTCACCATGG...,H,F,T,F,T,F,T,IGHV1-3*01,...,387.0,TCTC,4.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
3,CGTGTAATCCAAATGC-1_contig_2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCAC...,H,F,T,F,T,F,T,IGHV3-7*03,...,399.0,T,1.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'V', '4 ...",|||||
4,TATGCCCCACAACTGT-1_contig_1,TGGGGAGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGT...,H,F,T,F,T,F,T,IGHV3-9*01,...,398.0,,0.0,,,GTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTC...,Bulk,1.0,"{'fwl1': {'1 ': 'Q', '2 ': 'S', '3 ': 'A', '4 ...",|||||


In [2]:
# Reuse the frame already loaded into df_seq instead of re-reading the same
# CSV from disk, and bound the preview to the first ten rows.
df_seq.head(10)

Unnamed: 0,sequence_id_heavy,sequence_heavy,locus_heavy,stop_codon_heavy,vj_in_frame_heavy,v_frameshift_heavy,productive_heavy,rev_comp_heavy,complete_vdj_heavy,v_call_heavy,...,cdr3_end_light,np1_light,np1_length_light,np2_light,np2_length_light,c_region_light,Isotype_light,Redundancy_light,ANARCI_numbering_light,ANARCI_status_light
0,GAACGGAAGTAGCCGA-1_contig_1,AGCTCTGAGAGAGGAGCCTTAGCCCTGGATTCCAAGGCCTATCCAC...,H,F,T,F,T,F,T,IGHV3-21*02,...,388.0,TC,2.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
1,TACTTACTCAACACAC-1_contig_1,ATCATCCAACAACCACATCCCTTCTCTACAGAAGCCTCTGAGAGGA...,H,F,T,F,T,F,T,IGHV1-46*01,...,407.0,,0.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'V', '3 ': 'V', '4 ...",|||||
2,AAGGTTCTCGTAGGAG-1_contig_1,CCACATCCCTCCTCAGAAGCCCCCAGAGCACAACGCCTCACCATGG...,H,F,T,F,T,F,T,IGHV1-3*01,...,387.0,TCTC,4.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
3,CGTGTAATCCAAATGC-1_contig_2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCAC...,H,F,T,F,T,F,T,IGHV3-7*03,...,399.0,T,1.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'V', '4 ...",|||||
4,TATGCCCCACAACTGT-1_contig_1,TGGGGAGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGT...,H,F,T,F,T,F,T,IGHV3-9*01,...,398.0,,0.0,,,GTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTC...,Bulk,1.0,"{'fwl1': {'1 ': 'Q', '2 ': 'S', '3 ': 'A', '4 ...",|||||
5,TAGGCATCACCGTTGG-1_contig_2,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCA...,H,F,T,F,T,F,T,IGHV3-33*01,...,384.0,C,1.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
6,GTTCATTCATGTCTCC-1_contig_2,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCA...,H,F,T,F,T,F,T,IGHV3-33*08,...,385.0,,0.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||
7,TTGCCGTCATGTAAGA-1_contig_2,GGGAGCATCACCCAGCAACCACATCTGTCCTCTAGAGAATCCCCTG...,H,F,T,F,T,F,T,IGHV1-2*04,...,390.0,CC,2.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'E', '2 ': 'I', '3 ': 'V', '4 ...",|||||
8,TGCGGGTAGATAGCAT-1_contig_2,TCTCTTTCTCTAAAGGTGGCTCTTGCGATGTTGCGGGTTGATAGCA...,H,F,T,F,T,F,T,IGHV3-74*01,...,396.0,,0.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'T', '3 ': 'V', '4 ...",|||||
9,AAGGAGCGTGTAATGA-1_contig_1,ACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCT...,H,F,T,F,T,F,T,IGHV4-61*02,...,385.0,T,1.0,,,GAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGA...,Bulk,1.0,"{'fwk1': {'1 ': 'D', '2 ': 'I', '3 ': 'Q', '4 ...",|||||


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5b236d6d-7b98-492c-b976-1e7a4e078701' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>