In [109]:
import pandas as pd

# Read the headerless CSV; header=None makes pandas label the columns 0..N-1
# instead of consuming the first data row as a header.
revel_csv_path = 'COL4A3.csv'
revel_data = pd.read_csv(revel_csv_path, header=None)
revel_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2,228029443,227164727,A,C,M,L,0.49,ENST00000396578;ENST00000328380;ENST0000033558...
1,2,228029443,227164727,A,G,M,V,0.47,ENST00000396578;ENST00000328380;ENST0000033558...
2,2,228029443,227164727,A,T,M,L,0.498,ENST00000396578;ENST00000328380;ENST0000033558...
3,2,228029444,227164728,T,A,M,K,0.539,ENST00000396578;ENST00000328380;ENST0000033558...
4,2,228029444,227164728,T,C,M,T,0.528,ENST00000396578;ENST00000328380;ENST0000033558...


In [110]:
# Give the nine positional columns descriptive names, in file order.
REVEL_COLUMNS = [
    'chr',
    'pos_hg19',
    'pos_grch38',
    'ref_na',
    'alt_na',
    'aa_wt',
    'aa_mut',
    'revel_score',
    'transcript_id',
]
revel_data.columns = REVEL_COLUMNS
revel_data.head(20)

Unnamed: 0,chr,pos_hg19,pos_grch38,ref_na,alt_na,aa_wt,aa_mut,revel_score,transcript_id
0,2,228029443,227164727,A,C,M,L,0.49,ENST00000396578;ENST00000328380;ENST0000033558...
1,2,228029443,227164727,A,G,M,V,0.47,ENST00000396578;ENST00000328380;ENST0000033558...
2,2,228029443,227164727,A,T,M,L,0.498,ENST00000396578;ENST00000328380;ENST0000033558...
3,2,228029444,227164728,T,A,M,K,0.539,ENST00000396578;ENST00000328380;ENST0000033558...
4,2,228029444,227164728,T,C,M,T,0.528,ENST00000396578;ENST00000328380;ENST0000033558...
5,2,228029444,227164728,T,G,M,R,0.55,ENST00000396578;ENST00000328380;ENST0000033558...
6,2,228029445,227164729,G,A,M,I,0.535,ENST00000396578;ENST00000328380;ENST0000033558...
7,2,228029445,227164729,G,C,M,I,0.527,ENST00000396578;ENST00000328380;ENST0000033558...
8,2,228029445,227164729,G,T,M,I,0.527,ENST00000396578;ENST00000328380;ENST0000033558...
9,2,228029446,227164730,A,C,S,R,0.249,ENST00000396578;ENST00000328380;ENST0000033558...


In [111]:
def add_aa_position_to_df(df: pd.DataFrame, max_exon_step: int = 3) -> pd.DataFrame:
    """Return a copy of ``df`` sorted by 'pos_hg19' with a new 1-based 'aa_pos' column.

    Method:
    Walk the unique, sorted nucleotide positions and keep a running count of
    coding bases (``coding_offset``). Every three coding bases make one codon,
    so ``aa_pos = coding_offset // 3 + 1``. All rows sharing a nucleotide
    position (one row per alternative allele) get the same ``aa_pos``.

    A step of up to ``max_exon_step`` between consecutive positions is treated
    as bases missing from the table inside the same coding run, and the skipped
    bases are still counted. A larger step is reported as a gap and the next
    position is counted as the very next coding base.

    Assumptions (NOTE(review): confirm against the data source):
    - positions increase in the direction of translation (forward strand);
    - the smallest position is the first base of codon 1;
    - no bases are missing directly at either edge of a reported gap.

    Args:
        df: frame with an integer 'pos_hg19' column.
        max_exon_step: largest position step still considered contiguous.

    Returns:
        The sorted, re-indexed frame with the added 'aa_pos' column. For an
        empty input the column is added empty.
    """
    df = df.sort_values(by='pos_hg19').reset_index(drop=True)  # Sort the DataFrame by 'pos_hg19' and reset index

    if df.empty:
        # Nothing to number; the original indexed row 0 here and raised.
        df['aa_pos'] = pd.Series(dtype='int64')
        return df

    coding_offset = 0       # 0-based index of the current coding base
    offset_by_pos = {}      # nucleotide position -> coding offset
    prev_pos = None

    # The column is already sorted, so unique() yields ascending positions.
    for pos_hg19 in df['pos_hg19'].unique():
        if prev_pos is not None:
            step = pos_hg19 - prev_pos
            if step > max_exon_step:
                print(f"Gap in nucleic acid positions detected. {prev_pos} to {pos_hg19}")
                # Skip the gap: the next listed base continues the coding run.
                coding_offset += 1
            else:
                # Bases absent from the table still occupy a codon slot.
                coding_offset += step
        offset_by_pos[pos_hg19] = coding_offset
        prev_pos = pos_hg19

    df['aa_pos'] = df['pos_hg19'].map(offset_by_pos) // 3 + 1
    return df

def encode_amino_acids(df):
    """Build the wild-type amino-acid string, one letter per 'aa_pos'.

    The input has several rows per residue (one per variant), so the rows are
    collapsed to a single 'aa_wt' letter per distinct 'aa_pos' before joining.
    Concatenating every row would repeat each residue once per variant.

    Args:
        df: frame with 'aa_pos' and 'aa_wt' columns.

    Returns:
        The 'aa_wt' letters joined in ascending 'aa_pos' order; '' for an
        empty frame.
    """
    # A stable sort keeps the original row order inside each aa_pos, so
    # "first" below is deterministic.
    ordered = df.sort_values(by='aa_pos', kind='stable')
    one_per_residue = ordered.drop_duplicates(subset='aa_pos', keep='first')
    return ''.join(one_per_residue['aa_wt'])


# Uniprot data
import requests as req

def get_uniprot_url(gene_name) -> str:
    """Return the UniProt REST search URL for the given gene name.

    The query is restricted to taxonomy_id 9606 and reviewed entries. The gene
    name is percent-encoded so characters such as '&', '#' or spaces cannot
    break the query string; ordinary gene symbols are left unchanged.
    """
    from urllib.parse import quote

    encoded_gene = quote(str(gene_name), safe='')
    url = f"https://rest.uniprot.org/uniprotkb/search?query=(gene:{encoded_gene})%20AND%20(taxonomy_id:9606)%20AND%20(reviewed:true)"
    return url

def get_uniprot_json(gene_name, timeout: float = 30) -> dict:
    """Query the UniProt search API for a gene and return the decoded JSON.

    Args:
        gene_name: gene name passed to ``get_uniprot_url``.
        timeout: seconds to wait for the server before giving up; without a
            timeout the request could block indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    url = get_uniprot_url(gene_name)
    response = req.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error payload to callers
    # that expect a 'results' key.
    response.raise_for_status()
    return response.json()

def get_uniprot_id(gene_name) -> str:
    """Look up the UniProt accession for a gene name.

    Reads the 'primaryAccession' field of the first entry in the 'results'
    list returned by ``get_uniprot_json``.
    """
    first_hit = get_uniprot_json(gene_name)['results'][0]
    return first_hit['primaryAccession']


def get_sequence(gene_name) -> str:
    """Fetch the sequence string for a gene name from UniProt.

    Reads ``['sequence']['value']`` of the first entry in the 'results' list
    returned by ``get_uniprot_json``.
    """
    first_hit = get_uniprot_json(gene_name)['results'][0]
    return first_hit['sequence']['value']

In [112]:
revel_data = add_aa_position_to_df(revel_data)
# revel_data.to_csv('COL4A3_revel_with_pos.csv', index=False)

amino_acid_sequence = encode_amino_acids(revel_data)
# Fetch the reference once and reuse it; each get_sequence call is a network request.
uniprot_sequence = get_sequence('COL4A3')

print(amino_acid_sequence)
print(uniprot_sequence)

# Report every 1-based position where the two sequences disagree.
count = 0
for count, (i, j) in enumerate(zip(amino_acid_sequence, uniprot_sequence), start=1):
    if i != j:
        print(i, j)
        print(count)

# zip() stops at the shorter sequence, so surface a length difference explicitly.
if len(amino_acid_sequence) != len(uniprot_sequence):
    print(f"Length mismatch: derived {len(amino_acid_sequence)} vs UniProt {len(uniprot_sequence)}")

Gap in nucleic acid positions detected. 228029529 to 228102684
Gap in nucleic acid positions detected. 228102740 to 228104859
Gap in nucleic acid positions detected. 228104948 to 228109036
Gap in nucleic acid positions detected. 228109080 to 228109667
Gap in nucleic acid positions detected. 228109710 to 228110670
Gap in nucleic acid positions detected. 228110732 to 228111401
Gap in nucleic acid positions detected. 228111453 to 228112274
Gap in nucleic acid positions detected. 228112300 to 228113159
Gap in nucleic acid positions detected. 228113236 to 228115856
Gap in nucleic acid positions detected. 228115918 to 228116052
Gap in nucleic acid positions detected. 228116087 to 228118012
Gap in nucleic acid positions detected. 228118052 to 228118277
Gap in nucleic acid positions detected. 228118353 to 228118828
Gap in nucleic acid positions detected. 228118889 to 228119372
Gap in nucleic acid positions detected. 228119431 to 228120742
Gap in nucleic acid positions detected. 228120786 to 22