# DB[RC/S3]
(Density-Based Residue Clustering by Dissimilarity Between Sequence SubSets)
#
## Transthyretins/Hydroxy-isourate Hydrolases

In [1]:
from protlearn import *
from sklearn.ensemble import RandomForestRegressor

In [2]:
###====================================================================================================
### Parameters
###====================================================================================================

class Args(object):
    """Plain attribute container for the run parameters set in this cell.

    Attributes are attached after construction (``args.file = ...``).
    Reading an attribute that was never set raises ``AttributeError``.
    """

    def __init__(self) -> None:
        # Nothing to initialise. The previous ``self.__getattr__ = None`` was
        # dropped: Python resolves ``__getattr__`` on the class, so an instance
        # attribute of that name never affected lookups and only showed up as a
        # stray entry in ``vars(args)``.
        pass

    def __repr__(self) -> str:
        """Return ``Args(name=value, ...)`` listing the parameters set so far."""
        fields = ', '.join('%s=%r' % item for item in vars(self).items())
        return '%s(%s)' % (type(self).__name__, fields)
import os

args = Args()
# Alternative input: args.file = '../transthyretin/sample.fasta'
args.file = 'aligned.fasta'      # alignment file handed to MSA(...)
args.expand_alphabet = False     # not read by any cell visible in this notebook
args.min_freq = .1               # threshold compared against gap ratios and residue p() values
args.max_dist = 1.               # second argument of the optics(...) call
args.min_size = 3                # third argument of optics(...); also the minimum refined cluster size
args.out = None                  # output path prefix; derived from args.file when None

if args.out is None:
    # Use the file name without directories or the final extension. The former
    # args.file.split('.')[0] produced an empty stem for relative paths such as
    # '../transthyretin/sample.fasta'.
    args.out = 'output/' + os.path.splitext(os.path.basename(args.file))[0]

# Later cells write '<args.out>_*.png/.csv/.nwk'; make sure the target folder exists.
os.makedirs(os.path.dirname(args.out) or '.', exist_ok=True)

In [3]:
# #====================================================================================================
# Build the alignment object for the file named in args.file.
# NOTE(review): MSA is not defined in this notebook; presumably it comes from the
# `from protlearn import *` line — confirm.
msa = MSA(args.file)
# NOTE(review): presumably read() parses the file and fills the attributes used by
# later cells (msa.size, msa.sequences, msa.headers, msa.collection) — confirm in protlearn.
msa.read()

In [5]:
# Assign the same weight, 1 / msa.size, to every sequence
n_sequences = msa.size
msa.weights = [1. / float(n_sequences) for seq_idx in range(n_sequences)]
msa.weights

[0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666666667,
 0.0004166666666

In [6]:
df_msa = pd.DataFrame(msa.sequences)

# Weighted share of gap ("-") entries in every column: each row counts with its msa.weights entry
is_gap = df_msa.eq('-')
gap_ratio = is_gap.mul(msa.weights, axis=0).sum()

# Keep the columns whose weighted gap share is below args.min_freq
low_gap = gap_ratio < args.min_freq
selected_columns = gap_ratio.index[low_gap]

# Restrict the alignment table to the retained columns
df_selected = df_msa.loc[:, selected_columns]
df_selected

Unnamed: 0,579,580,581,582,583,584,586,588,590,595,...,1587,1588,1589,1590,1591,1592,1593,1594,1595,1597
0,-,-,M,Y,P,K,D,F,W,Y,...,R,R,I,I,A,R,L,C,N,L
1,-,-,M,F,L,K,N,T,W,Y,...,R,K,V,L,D,R,L,I,V,R
2,-,-,M,F,L,K,N,T,W,Y,...,R,K,V,L,D,R,L,I,E,A
3,-,-,M,F,L,K,N,A,W,Y,...,-,-,-,-,-,-,-,-,-,-
4,-,-,M,H,P,K,N,A,W,Y,...,R,K,I,I,E,R,I,V,M,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,M,E,R,I,F,H,T,S,W,I,...,M,K,F,W,L,Q,L,M,T,Q
2396,I,E,K,I,F,H,S,S,W,V,...,M,R,Y,W,L,K,L,M,T,K
2397,I,E,R,I,F,H,G,S,W,L,...,M,R,Y,W,L,E,L,M,T,Q
2398,Q,E,R,I,F,E,G,T,W,V,...,M,R,Q,W,R,E,L,M,T,A


In [73]:
"""
Attention!!! This snippet is specific to this particular example, so it must be adapted
for each new data set.
"""

import json


def find_key_in_dict(key, dictionary):
    """
    Depth-first lookup of `key` in a dictionary whose values may themselves be dictionaries.

    A match at the current level is returned immediately. Otherwise every value that is a
    dict is searched in iteration order and the first result that is not None is returned.
    Values of other types (lists included) are not descended into. Returns None when
    nothing is found.
    """
    if key in dictionary:
        return dictionary[key]
    # Lazy pipeline: sub-dictionaries are explored one at a time, stopping at the first hit
    sub_dicts = (child for child in dictionary.values() if isinstance(child, dict))
    hits = (find_key_in_dict(key, child) for child in sub_dicts)
    return next((hit for hit in hits if hit is not None), None)

# JSON text is UTF-8; state it explicitly so the platform default encoding cannot interfere
with open('full_data.json', encoding='utf-8') as full_json_file:
    full_data = json.load(full_json_file)

# classes: primaryAccession -> 'value' of the first 'ecNumbers' item found in the entry
classes = {}
for entry in full_data['results']:
    result = find_key_in_dict('ecNumbers', entry)
    if result:
        classes[entry['primaryAccession']] = result[0]['value']

with open('data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# options: representative id (record['to']['id']) -> text before the first comma of each member
options = {}
for representative in data['results']:
    members = find_key_in_dict('members', representative)
    if members:
        candidates = [member.split(',')[0] for member in members]
        options[representative['to']['id']] = candidates

# targeted: representative id -> EC value of an annotated accession among its members.
# When several members are annotated, the one that comes last in `classes` wins.
targeted = {}
for seq_id, candidates in options.items():
    candidate_set = set(candidates)  # built once per representative: O(1) membership tests
    for entry, ec_number in classes.items():
        if entry in candidate_set:
            targeted[seq_id] = ec_number
targeted


{'UniRef90_A0A024HID4': '1.14.13.239',
 'UniRef90_A0A0F0FVU7': '1.14.13.239',
 'UniRef90_A0A0F7HC06': '1.14.12.19',
 'UniRef90_A0A0F7XZS8': '1.14.13.239',
 'UniRef90_A0A0H2W5P6': '1.14.13.239',
 'UniRef90_A0A0J6IH19': '1.14.13.239',
 'UniRef90_A0A0J6NKR6': '1.14.13.239',
 'UniRef90_A0A0M3CZW0': '1.14.13.239',
 'UniRef90_A0A127MVG4': '1.14.13.239',
 'UniRef90_A0A1C6LXW4': '1.14.13.239',
 'UniRef90_A0A1D9H7T1': '1.14.13.239',
 'UniRef90_A0A1G6YSW4': '1.14.13.239',
 'UniRef90_A0A1H1URA8': '1.14.13.239',
 'UniRef90_A0A1I4Y4T0': '1.14.13.239',
 'UniRef90_A0A1N6MRK3': '1.14.13.239',
 'UniRef90_A0A1W5DQ75': '1.14.13.239',
 'UniRef90_A0A221FU25': '1.14.13.239',
 'UniRef90_A0A257E5N0': '1.14.13.239',
 'UniRef90_A0A2H9U5T4': '1.14.13.239',
 'UniRef90_A0A2U2AP97': '1.14.13.239',
 'UniRef90_A0A2U9L171': '1.14.13.239',
 'UniRef90_A0A2W1N8I2': '1.14.13.239',
 'UniRef90_A0A2Z3EVA1': '1.14.12.19',
 'UniRef90_A0A315T2P1': '1.14.13.239',
 'UniRef90_A0A345Y863': '1.14.13.239',
 'UniRef90_A0A375HRQ2': '1.

In [75]:
# Sizes of the three mappings built above: annotated accessions, representatives, labelled representatives
n_classes, n_options, n_targeted = len(classes), len(options), len(targeted)
print(n_classes, n_options, n_targeted)

826 2400 79


In [None]:
# NOTE(review): `target` is not defined in any cell visible in this export (the earlier
# cell builds `targeted`, a dict) — confirm where it comes from before running this cell.
# Presumably it holds one numeric value per alignment row, because it is concatenated
# column-wise with df_selected and averaged/regressed on in the following cells.
df_target = pd.DataFrame(target, columns=['Target'])
df_target

In [None]:
# Put the residue columns and the 'Target' column side by side, then order the rows by target
residues_with_target = pd.concat([df_selected, df_target], axis=1)
df_chars = residues_with_target.sort_values(by='Target')
df_chars

In [None]:
def target_mean(df, by, on):
    """
    Replace every value of column `by` with the mean of column `on` over the rows
    that share that value.

    Returns a Series aligned with `df` (same index, named after `by`); `df` is not modified.
    """
    mean_per_value = df[on].groupby(df[by]).mean()
    return df[by].map(mean_per_value)

In [None]:
# Numeric copy of the table: each column is replaced, one after the other, by the mean
# 'Target' of the rows sharing the same value in that column (the 'Target' column included)
df_num = df_chars.copy()
for column_label in df_num.columns:
    encoded_column = target_mean(df_num, by=column_label, on='Target')
    df_num[column_label] = encoded_column
df_num

In [None]:
# Separate the value to predict (y) from the encoded residue columns (X)
y = df_num['Target']
X = df_num.drop(columns='Target')

# Train a random forest regressor on the whole table, with a fixed seed
rf = RandomForestRegressor(random_state=0)
rf.fit(X, y)

# Rank the features from most to least important
unsorted_importances = pd.Series(rf.feature_importances_, index=X.columns)
importances = unsorted_importances.sort_values(ascending=False)

# Running total of importance along that ranking
cumulative_importance = importances.cumsum()
cumulative_importance

In [None]:
# Keep the top-ranked features whose running importance total stays within 0.99
# (the feature whose addition pushes the total past 0.99 is therefore left out)
within_budget = cumulative_importance <= 0.99
kept_importances = importances[within_budget]
most_important = kept_importances.sort_values(ascending=False)
selected_features = kept_importances.index
higher_importance = cumulative_importance[selected_features]

xvalues = range(len(most_important))
fig, ax1 = plt.subplots()

# Left axis: one bar per retained feature with its individual importance
ax1.bar(xvalues, most_important, color='b')
ax1.set_ylabel('Percentage of total importance')
ax1.tick_params(axis='y')

# Right axis: running total over the same features
ax2 = ax1.twinx()
ax2.plot(xvalues, higher_importance, color='r', marker='.')
ax2.set_ylabel('Cumulative importance')
ax2.tick_params(axis='y')

# Label every position with its feature name, written vertically
plt.xticks(xvalues, most_important.index, rotation=90)

plt.show()

In [None]:
# Narrow the encoded table down to the features retained by the importance filter
df_selected = df_num.loc[:, selected_features]
df_selected

In [None]:
_, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))

# One entry per panel: data to draw, panel title, and heatmap options specific to that panel.
# Only the last panel keeps its x tick labels.
panels = (
    (X, 'Original DataFrame', {'xticklabels': False}),
    (X[importances.index], 'Sorted DataFrame by\ndescending feature importance', {'xticklabels': False}),
    (df_selected, 'Filtered DataFrame by\ncumulative feature importance', {}),
)
for panel_ax, (panel_data, panel_title, panel_kwargs) in zip(axs, panels):
    sns.heatmap(panel_data, cmap='coolwarm', yticklabels=False, ax=panel_ax, **panel_kwargs)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# #====================================================================================================
# Gather, in importance order, the msa.collection entries of every retained column
R = []
for column_label in most_important.index:
    R.extend(msa.collection[column_label])

In [None]:
# #====================================================================================================
from itertools import combinations

# Residues whose p() reaches args.min_freq. Filtering up front evaluates p() once per
# residue instead of once per pair (assumes p() is a pure accessor — confirm).
frequent = [residue for residue in R if residue.p() >= args.min_freq]

G = nx.Graph()
# combinations() yields the pairs in the same order as the former nested index loops
for a, b in combinations(frequent, 2):
    in_one_only = a.sequence_indices ^ b.sequence_indices
    in_either = a.sequence_indices | b.sequence_indices
    # Edge weight: weighted size of the symmetric difference over weighted size of the union
    G.add_edge(
        a,
        b,
        weight=float(sum(msa.weights[k] for k in in_one_only))
        / float(sum(msa.weights[k] for k in in_either)),
    )
# #====================================================================================================
# Nodes ordered by decreasing p(); this order defines the rows/columns of D
N = sorted(G.nodes(), key=lambda x: x.p(), reverse=True)
for n in N:
    print(n)
# #====================================================================================================
# Matrix of edge weights in the order of N
D = nx.to_numpy_array(G, nodelist=N)
D

In [None]:
# Render the matrix D as a colour-coded image
fig, ax = plt.subplots()
im = ax.imshow(D, cmap='viridis')

# Attach the colour scale to the same figure
cbar = fig.colorbar(im, ax=ax)

plt.show()

In [None]:
# #====================================================================================================
# NOTE(review): if `optics` is pyclustering's class, its 4th and 5th positional parameters
# are amount_clusters and ccore, and the input kind is the keyword `data_type` — confirm
# that 'distance_matrix' reaches the intended parameter.
optics_instance = optics(D, args.max_dist, args.min_size, None, 'distance_matrix')
optics_instance.process()
clusters = optics_instance.get_clusters()
# #====================================================================================================
# Bar chart of the cluster ordering values, saved next to the other outputs
analyser = ordering_analyser(optics_instance.get_ordering())
ordering = analyser.cluster_ordering
n_points = len(ordering)
plt.figure()
plt.bar(range(n_points), ordering[:n_points], width=1., color='black')
plt.xlim([0, n_points])
plt.xlabel('Points')
plt.ylabel('Reachability Distance')
plt.savefig('%s_reachability_plot.png' % args.out)

In [None]:
# #====================================================================================================
# Visit clusters by decreasing mean p() of their members
ranked_clusters = sorted(clusters, key=lambda members: mean([N[m].p() for m in members]), reverse=True)
refined_clusters = []
for members in ranked_clusters:
    # Group the member indices by alignment position
    same_position = {}
    for m in members:
        same_position.setdefault(N[m].position, []).append(m)
    # Subset built from the union of the members' sequence indices
    c = Subset(msa, list(set.union(*(set(N[m].sequence_indices) for m in members))))
    # Per position keep a single member: the first one reaching the highest p.given(c)
    winners = [
        m for m in members
        if m == max(same_position[N[m].position], key=lambda idx: N[idx].p.given(c))
    ]
    # Clusters left with fewer than args.min_size members are discarded
    if len(winners) >= args.min_size:
        refined_clusters.append(winners)
clusters = refined_clusters
clusters

In [None]:
# #====================================================================================================
# Write one titled CSV table per cluster, with members ordered by alignment position
with open('%s_clusters.csv' % args.out, 'w') as outfile:
    for cluster_number, members in enumerate(clusters, start=1):
        outfile.write('Cluster %d\n' % cluster_number)
        by_position = sorted(members, key=lambda idx: N[idx].position)
        table = pd.DataFrame({
            'MSA\nColumn': [N[idx].position + 1 for idx in by_position],  # reported 1-based
            'Feature': [N[idx] for idx in by_position],
            'Frequency': ['%.2f' % round(N[idx].p(), 2) for idx in by_position],
        })
        outfile.write(table.to_csv(index=False))
        outfile.write('\n')

In [None]:
# #====================================================================================================
# H[i][j]: fraction of cluster j's members whose sequence_indices contain sequence i
H = array([
    [
        float(sum(1 for member in members if seq_index in N[member].sequence_indices))
        / float(len(members))
        for members in clusters
    ]
    for seq_index in range(msa.size)
])
H

In [None]:
# #====================================================================================================
# Average-linkage clustering of the rows of H, drawn with the sequence headers as leaf labels
Z = linkage(H, 'average')
fig = plt.figure(figsize=(25, 10))
leaf_labels = array(msa.headers)
dn = dendrogram(Z, labels=leaf_labels)
image_path = '%s_dendrogram.png' % args.out
plt.savefig(image_path)
# Export the same hierarchy as Newick text
tree = to_tree(Z, False)
with open('%s_dendrogram.nwk' % args.out, 'w') as outfile:
    newick_text = get_newick(tree, "", tree.dist, msa.headers)
    outfile.write(newick_text)

In [None]:
# #====================================================================================================
df = get_df(H, msa, range(msa.size), range(len(clusters)))
seq = df.pop('Seq. ID')
try:
    # Rows are clustered; columns keep the cluster order (col_cluster=False)
    g = sns.clustermap(df, col_cluster=False, yticklabels=False, figsize=(4, 4))
except SystemExit as err:
    # Raising a bare string is itself a TypeError in Python 3; raise a real exception
    # that carries the message and keeps the original cause attached.
    raise RuntimeError('Warning: few clusters to draw a heatmap!') from err
row_idx = g.dendrogram_row.reordered_ind
# col_idx = g.dendrogram_col.reordered_ind
col_idx = range(len(clusters))  # Keep column index without dendrogram
# Reorder the rows of H to follow the row dendrogram before exporting
H = array([H[i] for i in row_idx])
df = get_df(H, msa, row_idx, col_idx)
df.to_csv('%s_seq_adhesion.csv' % args.out)
plt.savefig('%s_seq_adhesion.png' % args.out)

In [None]:
# #====================================================================================================
# # Optional viewing
# #====================================================================================================
# Embed the precomputed distance matrix D in 2-D, then pass the embedding through a 2-component PCA
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", normalized_stress='auto')
pts = mds.fit(D).embedding_
clf = PCA(n_components=2)
pts = clf.fit_transform(pts)
# #====================================================================================================
# p() of every node, evaluated once and reused by both panels
frequencies = [x.p() for x in N]
colors = array(frequencies) * 100
_, axs = plt.subplots(1, 2, figsize=(12, 4))

# Plot 1: Residue Plot with noise
X_full, Y_full = zip(*pts)
sc = axs[0].scatter(X_full, Y_full, c=colors, cmap='rainbow', vmin=0., vmax=100., alpha=.5)
cb = plt.colorbar(sc, ax=axs[0])
cb.set_label('Frequency (%)')
axs[0].set_title('Residue Plot with noise')

# #====================================================================================================
# Plot 2: Residue Plot without noise
noise = optics_instance.get_noise()
noise_lookup = set(noise)  # constant-time membership instead of scanning the noise list per point
points, colors = [], []
for i, (p, c) in enumerate(zip(pts, frequencies)):
    if i not in noise_lookup:
        points.append(p)
        colors.append(c)
colors = array(colors) * 100
# zip(*[]) yields nothing to unpack; fall back to empty coordinates when every point is noise
X_clean, Y_clean = zip(*points) if points else ((), ())
sc = axs[1].scatter(X_clean, Y_clean, c=colors, cmap='rainbow', vmin=0., vmax=100., alpha=0.5)
cb = plt.colorbar(sc, ax=axs[1])
cb.set_label('Frequency (%)')
axs[1].set_title('Residue Plot without noise')

plt.savefig('%s_residue_plot_combined.png' % args.out)


In [None]:
# #====================================================================================================
# # END
# #====================================================================================================