In [None]:
import pandas  as pd
import numpy as np
from scipy import stats
import os

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Every artefact written by this notebook goes beneath this directory.
path = 'Results/'

# Create the output directory the first time the notebook runs, announcing it.
if not os.path.exists(path):
    print("make" + path)
    os.makedirs(path)

# Table that later cells filter by 'ID' to look up a sequence's 'fold'.
fold_table = pd.read_csv(
    '../Sample_data/fold_table.csv',
)

## Run statistical tests.

In [None]:
# Disabled code: the triple-quoted string below is a bare expression statement,
# so the `make_dataset` draft inside it is never defined or executed.
# The draft reads a fold's `_interpro.tsv` (14 unnamed, tab-separated columns),
# keeps 11 of them under readable names, trims 'ID' to the text before the
# first '|', left-joins 'mechanism' from `fold_table` on 'ID', and also reads
# that fold's attention CSV.
# NOTE(review): kept for reference only; delete it or restore it as real code.
"""
def make_dataset(fold,fold_table):
    header = ['ID', 'Length', 'Database', 'Accession', 'Signature description', 'Start', 'End', 'Date', 'InterPro accession', 'InterPro description', 'GO']
    col_names = list(range(14))
    interpro = pd.read_csv('../OUTPUT_DIR/fold_'+str(fold)+'_interpro.tsv', index_col = None, sep = '\t', header= None, names=col_names)
    interpro = interpro.iloc[:,[0,2,3,4,5,6,7,10,11,12,13]]
    interpro.columns = header
    interpro['ID'] = interpro['ID'].apply(lambda x: x.split('|')[0])
    interpro = pd.merge(interpro, fold_table[['ID', 'mechanism']], on='ID', how='left')
    attention = pd.read_csv('../attention/fold_'+str(fold)+'_attention.csv', index_col=0)
    return interpro,attention
"""

def run_U_test_for_a_sequence(accession,attention_of_target_sequence,interpro_of_target_sequence):
    """Run a one-sided Mann-Whitney U test for one accession in one sequence.

    The attention values lying inside every region annotated with
    ``accession`` are pooled and tested for being greater than the attention
    values of the whole sequence (which itself contains those regions).

    Parameters
    ----------
    accession
        Value compared against the ``'Accession'`` column to select regions.
    attention_of_target_sequence : sequence of float
        Per-position attention values; position ``Start`` is read from index
        ``Start - 1`` and ``End`` is inclusive.
    interpro_of_target_sequence : pandas.DataFrame
        Region annotations with ``'Accession'``, ``'Start'`` and ``'End'``
        columns.

    Returns
    -------
    float
        The p-value of the test (``alternative='greater'``, continuity
        correction on), or NaN when the selected regions contain no
        attention values, e.g. because no row matches ``accession``.
    """
    interpro_of_target_region = interpro_of_target_sequence[interpro_of_target_sequence['Accession'] == accession]

    # Pool the attention of EVERY matching region. The accumulation used to sit
    # after the loop, so only the last region contributed to the test.
    # Iterating over the column values (rather than `.loc[i, ...]`) also keeps
    # this correct when the frame carries duplicate index labels.
    attention_of_target_region = []
    for start, end in zip(interpro_of_target_region['Start'], interpro_of_target_region['End']):
        attention_of_target_region.extend(attention_of_target_sequence[int(start) - 1:int(end)])

    # Nothing to test: report "no evidence" instead of failing on an empty sample.
    if not attention_of_target_region:
        return float('nan')

    _, p = stats.mannwhitneyu(
        attention_of_target_region,
        attention_of_target_sequence,
        use_continuity=True,
        alternative='greater',
    )
    return p

def run_U_test_for_a_dataset(interpro,attention):
    """Annotate every region of every sequence with its U-test outcome.

    For each distinct ``'ID'`` in ``interpro`` the function tests each distinct
    accession of that sequence with ``run_U_test_for_a_sequence`` and merges
    the outcome back onto the sequence's region rows.

    Parameters
    ----------
    interpro : pandas.DataFrame
        Region annotations with at least ``'ID'`` and ``'Accession'`` columns
        (plus whatever ``run_U_test_for_a_sequence`` reads).
    attention : pandas.DataFrame
        One column per sequence ID holding that sequence's attention values.

    Returns
    -------
    pandas.DataFrame
        The rows of ``interpro`` grouped by ID, with two added columns:
        ``'U p-value'`` and ``'Significance'`` (p-value below 0.05 divided by
        the number of region rows of that sequence). Empty when ``interpro``
        contains no IDs.
    """
    results = []
    for ID in interpro['ID'].unique():
        interpro_of_target_sequence = interpro[interpro['ID'] == ID]
        # Each accession is tested once even if it annotates several regions.
        accessions_of_target_region = interpro_of_target_sequence['Accession'].dropna().unique()

        # Non-missing attention values of this sequence without the final entry.
        # NOTE(review): the visualization cell slices the same column with
        # `.iloc[1:-1]` (also dropping the first entry); confirm which offset
        # lines up with the 1-based Start/End positions.
        attention_of_target_sequence = attention[ID].dropna().tolist()[:-1]

        # Pass the single `accession` being tested. Passing the whole accession
        # list made the equality filter match every row, so all accessions of a
        # sequence received the same p-value.
        p_value = pd.DataFrame({
            'Accession': accessions_of_target_region,
            'U p-value': [
                run_U_test_for_a_sequence(accession, attention_of_target_sequence, interpro_of_target_sequence)
                for accession in accessions_of_target_region
            ],
        })
        # Bonferroni-style threshold: 0.05 split over this sequence's region rows.
        p_value['Significance'] = (p_value['U p-value']<(0.05/len(interpro_of_target_sequence))).tolist()
        results.append(pd.merge(interpro_of_target_sequence, p_value, on='Accession', how='left'))

    # pd.concat rejects an empty list; keep returning an empty frame in that case.
    if not results:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(results, axis=0)

In [None]:
# Run the U test on each of the five folds and pool the annotated regions.
fold_results = []

for fold in range(5):
    interpro_per_fold = pd.read_csv(path + 'Interpro/fold_'+str(fold)+'.interpro.csv', index_col = 0)
    attention = pd.read_csv(path + 'attention/fold_'+str(fold)+'.attention.csv', index_col =0)
    result_per_fold = run_U_test_for_a_dataset(interpro_per_fold,attention)
    fold_results.append(result_per_fold)

# Concatenate once rather than growing a DataFrame inside the loop.
result_all = pd.concat(fold_results)
# Later cells re-read this file to pick out the regions of one target sequence.
result_all.to_csv(path + 'result_attention-intensive_regions.csv')

# Visualization

### Decide the target sequence.

In [None]:
## Decide from its AMR Family

# Sequence ID used as the visualization target for each AMR family name.
familyname_to_id = {
    'blaOXA-114s': 'U3N8W9',
    'rpoB': 'NP_273190.1',
    'macB': 'A0A011P660',
    'tetW': 'ABN80187',
}

familyname  = 'tetW'
target_id = familyname_to_id[familyname]
# Per-family output directory. `exist_ok=True` keeps the cell re-runnable:
# os.mkdir raised FileExistsError whenever the directory was already there.
os.makedirs(path + familyname, exist_ok=True)

In [None]:
## Decide from its resistance mechanism

# (mechanism label, sequence ID) pairs; one target sequence per mechanism.
mechanism_to_id = dict([
    ('antibiotic inactivation', 'U3N8W9'),
    ('antibiotic target alteration', 'NP_273190.1'),
    ('antibiotic efflux', 'A0A011P660'),
    ('antibiotic target protection', 'ABN80187'),
])

# NOTE(review): `target_id` is also assigned by the AMR-family cell; whichever
# cell ran last wins, while output paths are still built from `familyname`.
mechanism = 'antibiotic target protection'
target_id = mechanism_to_id[mechanism]


In [None]:
# Fold recorded for the target sequence (first matching row of `fold_table`).
fold = fold_table.loc[fold_table['ID'] == target_id, 'fold'].tolist()[0]
print(fold, mechanism)

# Restrict the pooled U-test results to the regions of the target sequence.
interpro_1domain = (
    pd.read_csv(path + 'result_attention-intensive_regions.csv', index_col=0)
    .loc[lambda regions: regions['ID'] == target_id]
)

# Save only the regions flagged as significant into the family's directory.
significant_regions = interpro_1domain[interpro_1domain['Significance']]
significant_regions.to_csv(path + familyname + '/' + familyname +'.csv')

### Corresponding Attention-intensive areas and their positions.

In [None]:
# Build a (sequence position x accession) table marking where each annotated
# region lies: -1 for accessions flagged significant, 1 otherwise. Positions
# outside every region of an accession stay empty (NaN).
length = interpro_1domain['Length'].unique()[0]
accession_list = interpro_1domain['Accession'].unique()
position = pd.DataFrame(columns = accession_list, index = list(range(1,length+1)))

for accession in accession_list:
    interpro_accession = interpro_1domain[interpro_1domain['Accession'] == accession]
    # The first row's flag is applied to every region of the accession.
    # Columns are addressed by name: the former positional lookups
    # (iloc[..., 5], iloc[..., 6], iloc[0, -1]) silently depended on the exact
    # column order of the results CSV.
    mark = -1 if interpro_accession['Significance'].iloc[0] else 1
    for start, end in zip(interpro_accession['Start'], interpro_accession['End']):
        # `.loc` slices by label and includes both ends, so Start..End is
        # covered on the 1..length index.
        position.loc[start:end,accession] = mark

# Save the table, then re-read it so pandas infers numeric dtypes for the
# marks, and transpose so that each accession becomes one row.
position.to_csv(path + familyname + '/position.csv')
position = pd.read_csv(path + familyname + '/position.csv',index_col=0).T

### Visualization

In [None]:
# Attention values of the fold that contains the target sequence. The file is
# read from `path + 'attention/'`, the same location the statistical-test cell
# loads it from; this cell previously omitted `path`.
attention = pd.read_csv(path + 'attention/fold_'+str(fold)+'.attention.csv', index_col = 0)
# Non-missing values of the target column without the first and last entry.
# NOTE(review): the statistical test drops only the last entry (`[:-1]`);
# confirm that both cells align attention with sequence positions the same way.
attention_of_target_sequence = attention[target_id].dropna().iloc[1:-1]

## Focus only on the Attention-intensive regions.
# Significant accessions are marked -1, so a negative row sum selects them.
position = position[position.sum(axis = 1)<0]

In [None]:
sns.set()
fig, ax = plt.subplots(2, 1, figsize=(15, 10), sharex = True)

# Top panel: one heatmap row per attention-intensive accession.
sns.heatmap(position,cmap='bwr',ax = ax[0], cbar= False, yticklabels=False, xticklabels=True)
ax[0].set_title('Attention-intensive regions', fontsize = 30)
ax[0].tick_params(labelsize=30)
# Automatic y labels are disabled above; place one label per row, offset by
# 0.5 so it sits in the middle of the row.
ax[0].set_yticks([y + 0.5 for y in list(range(len(position.index)))])
ax[0].set_yticklabels(position.index.tolist())

# Bottom panel: attention smoothed with a centred 5-position rolling mean.
# The target axes is passed explicitly instead of relying on whichever axes
# pyplot currently treats as active.
attention_of_target_sequence.rolling(5, center=True).mean().plot(ax=ax[1], fontsize = 10, legend=False)
ax[1].set_title('Attention', fontsize = 30)
ax[1].tick_params(labelsize=30)
# One x tick every 100 positions along the (shared) sequence axis.
ax[1].set_xticks(range(0,len(position.T),100))
ax[1].set_xticklabels(list(range(0,len(position.T),100)),rotation = 45)
fig.savefig(path + familyname + '/' + familyname +'.png')