In [54]:
import pandas as pd
import ast
from collections import Counter

In [55]:
# Location of the motif table. It is read with pandas' default CSV settings
# (no separator, header or dtype overrides are passed).
file_path = '/n/groups/marks/projects/public_epitope/notebooks/Hailey/all_motifs'
df = pd.read_csv(filepath_or_buffer=file_path)

In [56]:
# Parse each motif from its string form (as read from the CSV) into a Python
# object. Values that are no longer strings are passed through unchanged, so
# re-running this cell does not feed already-parsed motifs back into
# ast.literal_eval (which would raise).
df['ab_motif'] = df['ab_motif'].apply(
    lambda motif: ast.literal_eval(motif) if isinstance(motif, str) else motif
)

# Flat list of every motif element across all rows. Nothing in this cell uses
# it (the counting below works on whole motifs, not on single elements); it is
# kept only in case another cell refers to it.
all_motifs = [motif for sublist in df['ab_motif'].tolist() for motif in sublist]

# Canonical, hashable key per row: the motif's distinct elements in sorted
# order. Two equal sets are not guaranteed to iterate in the same order, so
# tuple(motif) could produce different keys for the same motif and split its
# count across several entries; sorting removes that dependence.
# Assumes the elements of one motif are mutually comparable (e.g. all
# strings) -- TODO confirm against the input file.
motif_keys = [tuple(sorted(set(motif))) for motif in df['ab_motif']]

# count occurrences of each motif (one count per row of df)
motif_counts = Counter(motif_keys)

# find top 20 most common motifs, as (motif_tuple, count) pairs
top_motifs = motif_counts.most_common(20)

# map motifs to group ids: 1 for the most common motif, 2 for the next, ...
motif_to_group_id = {
    motif: group_id for group_id, (motif, _) in enumerate(top_motifs, start=1)
}

# Label every row with the group id of its motif in a single pass
# -- helps us identify which complexes are part of which motif group.
# Rows whose motif is not one of the top motifs get a missing value; the
# nullable 'Int64' dtype keeps the ids as integers next to those gaps.
df['group_id'] = pd.array(
    [motif_to_group_id.get(key) for key in motif_keys],
    dtype='Int64',
)

# Keep only rows that belong to a top motif, one row per uq_id.
# NOTE(review): drop_duplicates keeps the first row of each uq_id, so a uq_id
# with rows in several motif groups is retained in only one of them. If a
# complex should appear once per group, deduplicate on ['uq_id', 'group_id']
# instead -- confirm the intended meaning.
top_motif_tables = (
    df.dropna(subset=['group_id'])
    .drop_duplicates(subset='uq_id')
)

# add column for motif count (for sorting purposes); group ids are 1-based
# positions in top_motifs, hence the "- 1"
top_motif_tables = top_motif_tables.assign(
    motif_count=[
        top_motifs[group_id - 1][1] for group_id in top_motif_tables['group_id']
    ]
)

# Most frequent motif first, ties broken by group id. This single sort replaces
# the former preliminary sort by group_id alone, which used the same key.
top_motif_tables = top_motif_tables.sort_values(
    by=['motif_count', 'group_id'], ascending=[False, True]
)

In [63]:
# Show a fixed selection of columns from the grouped table. The bare
# expression is the last statement of the cell, so the notebook renders it as
# the cell's output.
top_motif_tables.loc[:, ['uq_id', 'ag_aa', 'ab_motif', 'motif_count', 'group_id']]

# Optional export, left disabled:
# top_motif_tables.to_csv('top_motif_tables.csv', index=False)  # Specify the path where you want to save the CSV

Unnamed: 0,uq_id,ag_aa,ab_motif,motif_count,group_id
13105,4khx_H(H)_ant(A),LEU,"{TYR37, SER36}",7250,1
12770,4k24_H(H)_ant(U),LEU,"{TYR37, SER36}",7250,1
12954,4k8r_H(D)_ant(B),THR,"{TYR37, SER36}",7250,1
23947,5cba_H(C)_ant(F),ARG,"{TYR37, SER36}",7250,1
23707,5c7x_H(M)_ant(B),CYS,"{TYR37, SER36}",7250,1
...,...,...,...,...,...
99628,7s11_H(J)_ant(E),TYR,"{ASN37, SER58}",876,19
99698,7s11_H(K)_ant(F),TYR,"{ASN37, SER58}",876,19
50674,6obg_H(C)_ant(A),ARG,"{ASN37, SER58}",876,19
99757,7s13_H(H)_ant(D),TYR,"{ASN37, SER58}",876,19


In [58]:
# count the number of complexes in each motif group
group_counts = top_motif_tables.groupby('group_id').size()
group_counts_df = group_counts.reset_index(name='complex_count')

In [59]:
# Display the per-group counts (columns 'group_id' and 'complex_count').
group_counts_df

Unnamed: 0,group_id,complex_count
0,1,167
1,2,34
2,3,57
3,4,13
4,5,81
5,6,64
6,7,84
7,8,33
8,9,42
9,10,42
