In [19]:
import pandas as pd

# Notebook configuration: `gene` names the CSV export to load and `reverse`
# switches to the "<gene>_reverse.csv" variant of that file.
reverse = False
gene = 'COL4A3'
# The export is read with header=None, so columns start out as integer labels.
revel_data = pd.read_csv(
    f"{gene}_reverse.csv" if reverse else f"{gene}.csv",
    header=None,
)
revel_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2,228029443,227164727,A,C,M,L,0.49,ENST00000396578;ENST00000328380;ENST0000033558...
1,2,228029443,227164727,A,G,M,V,0.47,ENST00000396578;ENST00000328380;ENST0000033558...
2,2,228029443,227164727,A,T,M,L,0.498,ENST00000396578;ENST00000328380;ENST0000033558...
3,2,228029444,227164728,T,A,M,K,0.539,ENST00000396578;ENST00000328380;ENST0000033558...
4,2,228029444,227164728,T,C,M,T,0.528,ENST00000396578;ENST00000328380;ENST0000033558...


In [20]:
# Replace the integer column labels (the file was read with header=None)
# with descriptive names, one per column in file order.
revel_data.columns = [
    'chr',
    'pos_hg19',
    'pos_grch38',
    'ref_na',
    'alt_na',
    'aa_wt',
    'aa_mut',
    'revel_score',
    'transcript_id',
]
revel_data.head(20)

Unnamed: 0,chr,pos_hg19,pos_grch38,ref_na,alt_na,aa_wt,aa_mut,revel_score,transcript_id
0,2,228029443,227164727,A,C,M,L,0.49,ENST00000396578;ENST00000328380;ENST0000033558...
1,2,228029443,227164727,A,G,M,V,0.47,ENST00000396578;ENST00000328380;ENST0000033558...
2,2,228029443,227164727,A,T,M,L,0.498,ENST00000396578;ENST00000328380;ENST0000033558...
3,2,228029444,227164728,T,A,M,K,0.539,ENST00000396578;ENST00000328380;ENST0000033558...
4,2,228029444,227164728,T,C,M,T,0.528,ENST00000396578;ENST00000328380;ENST0000033558...
5,2,228029444,227164728,T,G,M,R,0.55,ENST00000396578;ENST00000328380;ENST0000033558...
6,2,228029445,227164729,G,A,M,I,0.535,ENST00000396578;ENST00000328380;ENST0000033558...
7,2,228029445,227164729,G,C,M,I,0.527,ENST00000396578;ENST00000328380;ENST0000033558...
8,2,228029445,227164729,G,T,M,I,0.527,ENST00000396578;ENST00000328380;ENST0000033558...
9,2,228029446,227164730,A,C,S,R,0.249,ENST00000396578;ENST00000328380;ENST0000033558...


In [21]:
def _assign_aa_positions(df: pd.DataFrame, descending: bool) -> pd.DataFrame:
    """Shared implementation behind both ``add_aa_position_to_df*`` functions.

    Walks ``df['pos_hg19']`` in row order. The first row opens residue 1; a new
    residue is opened each time the coordinate has moved at least 3 away from
    the coordinate that opened the current residue (upwards when ``descending``
    is False, downwards when it is True). Rows in between inherit the current
    residue number.

    Args:
        df: Frame with a numeric 'pos_hg19' column, already ordered in the
            direction given by ``descending``.
        descending: True when coordinates decrease from row to row.

    Returns:
        The same ``df`` object, modified in place with a new 'aa_pos' column.
    """
    codon_start = None  # coordinate of the row that opened the current residue
    residue_number = 0
    residue_numbers = []

    for position in df['pos_hg19']:
        if codon_start is None:
            codon_start = position
            residue_number = 1
        else:
            distance = codon_start - position if descending else position - codon_start
            if distance >= 3:
                residue_number += 1
                codon_start = position
        residue_numbers.append(residue_number)

    # NOTE(review): any jump >= 3 opens a new residue, so a codon whose bases
    # are separated by a gap in 'pos_hg19' (e.g. split across an intron) would
    # be counted as two residues and shift later numbering - verify the result
    # against a reference protein sequence.
    df['aa_pos'] = residue_numbers
    return df


def add_aa_position_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """Add an 'aa_pos' column (1-based residue counter) for ascending coordinates.

    Rows must be ordered by increasing 'pos_hg19'. The counter starts at 1 and
    advances whenever 'pos_hg19' has increased by 3 or more since the row that
    started the current residue. If the rows are in decreasing order the
    counter never advances and every row gets 1.

    Args:
        df: Frame with a 'pos_hg19' column. It is modified in place.

    Returns:
        The same frame, with 'aa_pos' added.
    """
    return _assign_aa_positions(df, descending=False)


def add_aa_position_to_df_reverse(df: pd.DataFrame) -> pd.DataFrame:
    """Add an 'aa_pos' column (1-based residue counter) for descending coordinates.

    Mirror image of ``add_aa_position_to_df``: rows must be ordered by
    decreasing 'pos_hg19', and the counter advances whenever 'pos_hg19' has
    decreased by 3 or more since the row that started the current residue.

    Args:
        df: Frame with a 'pos_hg19' column. It is modified in place.

    Returns:
        The same frame, with 'aa_pos' added.
    """
    return _assign_aa_positions(df, descending=True)

def encode_amino_acids(df):
    """Build the wild-type amino-acid string, one letter per residue position.

    The frame holds several rows per residue (one per variant), so rows are
    first collapsed to a single row per 'aa_pos' (the first occurrence wins)
    before the 'aa_wt' letters are concatenated in ascending 'aa_pos' order.
    Concatenating every row instead would repeat each residue once per variant
    and the result could not be compared against a reference protein sequence.

    Args:
        df: Frame with 'aa_pos' and 'aa_wt' columns. It is not modified.

    Returns:
        The concatenated 'aa_wt' values as a string ('' for an empty frame).
    """
    one_row_per_residue = (
        df.drop_duplicates(subset='aa_pos', keep='first')
          .sort_values(by='aa_pos')
    )
    return ''.join(one_row_per_residue['aa_wt'])


# Uniprot data
import requests as req

def get_uniprot_url(gene_name) -> str:
    """Build the UniProtKB REST search URL for ``gene_name``.

    The query combines three clauses: the gene name, taxonomy_id 9606 and
    reviewed:true. ``gene_name`` is inserted into the URL verbatim.
    """
    return f"https://rest.uniprot.org/uniprotkb/search?query=(gene:{gene_name})%20AND%20(taxonomy_id:9606)%20AND%20(reviewed:true)"

def get_uniprot_json(gene_name, timeout: float = 30.0) -> dict:
    """Query the UniProt search API for ``gene_name`` and return the parsed JSON.

    Args:
        gene_name: Gene symbol passed to ``get_uniprot_url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = get_uniprot_url(gene_name)
    response = req.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

def get_uniprot_id(gene_name) -> str:
    """Return the primary accession of the first UniProt search hit for ``gene_name``.

    Args:
        gene_name: Gene symbol passed to ``get_uniprot_json``.

    Returns:
        The 'primaryAccession' field of the first entry in 'results'.

    Raises:
        IndexError: If the search returned no entries.
        KeyError: If the response lacks the expected fields.
    """
    results = get_uniprot_json(gene_name)['results']
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that says which lookup came back empty.
        raise IndexError(f"UniProt search returned no results for gene {gene_name!r}")
    return results[0]['primaryAccession']


def get_sequence(gene_name) -> str:
    """Return the sequence string of the first UniProt search hit for ``gene_name``.

    Args:
        gene_name: Gene symbol passed to ``get_uniprot_json``.

    Returns:
        The 'sequence' -> 'value' field of the first entry in 'results'.

    Raises:
        IndexError: If the search returned no entries.
        KeyError: If the response lacks the expected fields.
    """
    results = get_uniprot_json(gene_name)['results']
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that says which lookup came back empty.
        raise IndexError(f"UniProt search returned no results for gene {gene_name!r}")
    return results[0]['sequence']['value']

def test_add_aa_position_to_df(revel_data_with_positions, aa_sequence):
    # Check that the

In [22]:

# Add the 'aa_pos' column to revel_data and write the annotated table to disk.
if reverse:
    # Flip to descending 'pos_hg19' order only while the rows are still
    # ascending. An unconditional flip undoes itself when this cell is run a
    # second time; add_aa_position_to_df_reverse would then never see a
    # decrease of 3 and would label every row with aa_pos 1.
    if len(revel_data) > 1 and revel_data['pos_hg19'].iloc[0] < revel_data['pos_hg19'].iloc[-1]:
        revel_data = revel_data[::-1].reset_index(drop=True)
    revel_data = add_aa_position_to_df_reverse(revel_data)
else:
    revel_data = add_aa_position_to_df(revel_data)
revel_data.to_csv(f'{gene}_revel_with_pos.csv', index=False)
# Disabled check: compare the sequence rebuilt from the table with the UniProt
# sequence and print each mismatching pair together with its 1-based index.
# amino_acid_sequence = encode_amino_acids(revel_data)
# 
# print(amino_acid_sequence)
# print(get_sequence('COL4A3'))
# count = 0
# for i,j in zip(amino_acid_sequence, get_sequence('COL4A3')):
#     count += 1
#     if i != j:
#         print(i, j)
#         print(count)

       chr   pos_hg19  pos_grch38 ref_na alt_na aa_wt aa_mut  revel_score  \
0        2  228029443   227164727      A      C     M      L        0.490   
1        2  228029443   227164727      A      G     M      V        0.470   
2        2  228029443   227164727      A      T     M      L        0.498   
3        2  228029444   227164728      T      A     M      K        0.539   
4        2  228029444   227164728      T      C     M      T        0.528   
...    ...        ...         ...    ...    ...   ...    ...          ...   
10429    2  228176582   227311866      A      C     H      P        0.217   
10430    2  228176582   227311866      A      G     H      R        0.197   
10431    2  228176582   227311866      A      T     H      L        0.246   
10432    2  228176583   227311867      C      A     H      Q        0.401   
10433    2  228176583   227311867      C      G     H      Q        0.401   

                                           transcript_id aa_pos  
0      EN

KeyboardInterrupt: 

In [None]:
######### TEST MINI
data = {'pos_hg19': [1,1,1]}