## Feature matrix generation 

In [1]:
import numpy as np 
import pandas as pd 
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
import seaborn as sns
from statannotations.Annotator import Annotator
import matplotlib.pyplot as plt
import os
import statistics 
import scipy
from scipy.stats import mannwhitneyu, combine_pvalues
from feature_matrix_generator import generate_feature_mx
import collections 

In [2]:
# One-hot lookup table for dinucleotides: the 16 keys run 'AA', 'AC', ..., 'TT'
# (alphabet order A, C, G, T) and the i-th key maps to a 16-element list whose
# only 1 sits at index i.
dint_encoder = {
    first + second: [int(slot == 4 * i + j) for slot in range(16)]
    for i, first in enumerate('ACGT')
    for j, second in enumerate('ACGT')
}
def __dinucleotide_encode(seq):
    """One-hot encode every overlapping dinucleotide of ``seq``.

    The 16-element vectors from ``dint_encoder`` for ``seq[0:2]``,
    ``seq[1:3]``, ... are concatenated into one flat list of length
    ``16 * (len(seq) - 1)``. A dinucleotide missing from ``dint_encoder``
    raises ``KeyError``.
    """
    return [
        bit
        for start in range(len(seq) - 1)
        for bit in dint_encoder[seq[start:start + 2]]
    ]

def __get_dinucleotide_nms(start_pos, end_pos):
    """Return dinucleotide feature names for positions ``start_pos``..``end_pos``.

    ``end_pos`` is inclusive and position 0 is skipped (per the original note,
    PAM positions start at 1). For every remaining position the dinucleotides
    of ``dint_encoder`` are listed in sorted order, each followed directly by
    the position number.
    """
    dints = sorted(dint_encoder)
    return [
        '%s%s' % (dint, pos)
        for pos in range(start_pos, end_pos + 1)
        if pos != 0
        for dint in dints
    ]

def encode_7mer_excluding_middle(seq):
    """One-hot encode the dinucleotides of a 7-mer that do not touch index 3.

    The dinucleotides starting at indices 0, 1, 4 and 5 are looked up in
    ``dint_encoder`` and their vectors concatenated (64 values in total); the
    two dinucleotides that would include the middle character are left out.
    """
    assert len(seq) == 7, "Sequence must be exactly 7 nucleotides long"
    encoded = []
    for start in (0, 1, 4, 5):
        encoded.extend(dint_encoder[seq[start:start + 2]])
    return encoded

def get_dinucleotide_nms_context():
    """Return the dinucleotide feature names for the 7-nucleotide context.

    Names are built for positions 1, 2, 5 and 6 (in that order); within each
    position the dinucleotides of ``dint_encoder`` appear in sorted order,
    each followed directly by the position number.
    """
    dints = sorted(dint_encoder)
    return [f'{dint}{pos}' for pos in (1, 2, 5, 6) for dint in dints]


def generate_feature_mx(df_ed):
    """Build one feature record per row of ``df_ed`` and return them as a DataFrame.

    NOTE(review): this definition shadows the ``generate_feature_mx`` imported
    from ``feature_matrix_generator`` at the top of the notebook -- confirm
    which of the two is meant to be used.

    Parameters
    ----------
    df_ed : pandas.DataFrame
        Must provide the columns ``eff``, ``pos``, ``context`` and ``target``.
        ``target_no_pam`` is used when the column exists; otherwise it is
        derived as ``target`` without its last 10 characters.

    Returns
    -------
    pandas.DataFrame
        One row per input row (default integer index) with, in order:

        * ``eff`` -- copied from the input.
        * ``Editing_Position`` -- ``-1 * pos``.
        * ``Context_P{k-3}_{nt}`` -- one-hot encoding of ``context``
          (index 3 is skipped).
        * ``Editing_mt`` -- ``Tm_Wallace(context) / 100``.
        * ``SGN_mt_w_{k}`` -- ``Tm_Wallace(target[k:k+10]) / 100`` for
          ``k = 0, 4, ..., 28``.
        * ``SGN_P{label}_{nt}`` -- one-hot encoding of ``target``; the last
          10 characters get labels 1..10, the ones before them negative
          labels ending at -1.
        * ``Acontent``, ``Ccontent``, ``Gcontent``, ``Tcontent``,
          ``GCcontent`` -- base fractions of ``target_no_pam``.
    """
    nucleotides = ('A', 'C', 'G', 'T')
    # Decide once whether the caller already split the PAM off the target.
    has_target_no_pam = 'target_no_pam' in df_ed.columns

    feature_records = []
    for _, row in df_ed.iterrows():
        features = {
            'eff': row['eff'],
            'Editing_Position': -1 * row['pos'],
        }

        # One-hot encode the context, skipping its middle character (index 3).
        seq_nb = row['context']
        for k, v in enumerate(seq_nb):
            if k == 3:
                continue
            for nt in nucleotides:
                features[f'Context_P{k - 3}_{nt}'] = int(nt == v)

        features['Editing_mt'] = mt.Tm_Wallace(seq_nb) / 100

        # Melting temperature of sliding 10-character windows of the target.
        # The window itself must be passed to Tm_Wallace; the previous code
        # passed the context sequence, which made every SGN_mt_w_* feature a
        # copy of Editing_mt.
        sgn = row['target']
        for k in range(0, 30, 4):
            window = Seq(sgn[k:k + 10])
            features['SGN_mt_w_' + str(k)] = mt.Tm_Wallace(window) / 100

        # One-hot encode the target. k runs from -len(sgn) to -1; there is no
        # position 0, so the last 10 characters are shifted to 1..10.
        for k, nt in enumerate(sgn, -len(sgn)):
            pos_label = k + 11 if k + 10 >= 0 else k + 10
            for bs in nucleotides:
                features[f'SGN_P{pos_label}_{bs}'] = int(bs == nt)

        # NOTE: the dinucleotide one-hot features (context / target / PAM) and
        # the per-position Editing_P* indicators are currently disabled.

        # Nucleotide composition of the target without its PAM.
        target_no_pam = row['target_no_pam'] if has_target_no_pam else sgn[:-10]
        counts = len(target_no_pam)
        n_a, n_c, n_g, n_t = (target_no_pam.count(nt) for nt in nucleotides)
        features['Acontent'] = n_a / counts
        features['Ccontent'] = n_c / counts
        features['Gcontent'] = n_g / counts
        features['Tcontent'] = n_t / counts
        features['GCcontent'] = (n_g + n_c) / counts

        feature_records.append(features)

    return pd.DataFrame(feature_records)

## Load the editing data and build the feature matrix

In [3]:
# Load the per-edit summary, split every target into "target without PAM"
# (all but the last 10 characters) and "pam" (the last 10 characters), keep
# only rows with spacer_length == 23, then build the feature matrix.
df_editing = (
    pd.read_csv("~/internship/p12_editing_summary.csv")
    .assign(
        target_no_pam=lambda frame: frame['target'].apply(lambda x: x[:-10]),
        pam=lambda frame: frame['target'].apply(lambda x: x[-10:]),
    )
    .loc[lambda frame: frame.spacer_length == 23]
    .pipe(generate_feature_mx)
)


In [4]:
# Preview the first rows of the generated feature matrix.
df_editing.head()

Unnamed: 0,eff,Editing_Position,Context_P-3_A,Context_P-3_C,Context_P-3_G,Context_P-3_T,Context_P-2_A,Context_P-2_C,Context_P-2_G,Context_P-2_T,...,SGN_P9_T,SGN_P10_A,SGN_P10_C,SGN_P10_G,SGN_P10_T,Acontent,Ccontent,Gcontent,Tcontent,GCcontent
0,0.001117,23.0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0.225806,0.548387,0.16129,0.064516,0.709677
1,0.006667,11.0,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0.225806,0.548387,0.16129,0.064516,0.709677
2,0.000483,5.0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0.225806,0.548387,0.16129,0.064516,0.709677
3,6.7e-05,3.0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0.225806,0.548387,0.16129,0.064516,0.709677
4,0.000167,25.0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0.129032,0.387097,0.354839,0.129032,0.741935


In [5]:
# Feature names, plus one name prefix, that are filtered out when
# ``feature_list`` is built.
unwanted_list = [
    'SGN_Strand',
    'Editing_Position',
    'Acontent',
    'Tcontent',
    'Ccontent',
    'Gcontent',
    'GCcontent',
    'Editing_mt',
]
begin = 'SGN_mt_w_'

In [6]:
# Hard-coded feature-name lists, one per analysis (amp / cv / fh).
# NOTE(review): presumably the output of an earlier feature-selection step --
# the provenance is not shown in this notebook; TODO document where they come from.
# `amp` contains 'Editing_P-*' names, which generate_feature_mx does not emit
# while its per-position editing indicators are commented out.
amp = ['Editing_Position', 'Context_P1_C', 'SGN_P-10_C', 'Editing_P-14', 'SGN_P-29_C', 'Acontent', 'SGN_P-12_G', 'SGN_P2_A', 'Context_P2_C', 'Editing_P-15', 'Ccontent', 'SGN_P1_G', 'SGN_P-16_C', 'GCcontent', 'SGN_P-27_T', 'SGN_P-20_C', 'Context_P3_G', 'Editing_P-9', 'SGN_P2_G', 'SGN_P-12_A', 'SGN_P-28_G', 'Context_P3_T', 'SGN_P-14_G', 'SGN_P7_A', 'SGN_P-4_A', 'Editing_P-18', 'SGN_P-9_T', 'SGN_P-23_T', 'SGN_P-11_A', 'Context_P-3_G', 'SGN_P-16_T', 'Context_P1_G', 'SGN_P-13_C', 'SGN_P7_C', 'SGN_P3_T', 'Context_P-3_C', 'SGN_P-27_A', 'SGN_P-1_C', 'SGN_P-17_C', 'SGN_P-10_G', 'SGN_P-19_C', 'Context_P1_T', 'SGN_P-24_C', 'SGN_P-18_T', 'SGN_P10_A', 'SGN_mt_w_0', 'SGN_P-15_A', 'Tcontent', 'Context_P-1_G', 'Context_P-2_T', 'SGN_P-6_A', 'SGN_mt_w_24', 'SGN_P-20_G', 'SGN_P-27_G', 'SGN_P-9_C', 'SGN_P7_G', 'Context_P3_C', 'Context_P-3_T', 'SGN_P-25_T', 'Context_P-1_T', 'SGN_P-1_T', 'SGN_P-20_T', 'SGN_P-3_G', 'Gcontent', 'SGN_P-17_A', 'Editing_P-17']
cv = ['Editing_Position', 'SGN_P3_T', 'SGN_P7_A', 'GCcontent', 'SGN_P9_T', 'SGN_P-10_A', 'SGN_P3_G', 'SGN_P-12_A', 'Acontent', 'SGN_P2_G', 'Tcontent', 'Gcontent', 'SGN_P-2_A', 'SGN_P-25_G', 'SGN_P-27_G', 'SGN_P7_G', 'SGN_P-2_T', 'SGN_P-9_G', 'SGN_P-22_C', 'SGN_P-1_G', 'SGN_P-10_T', 'SGN_P-13_G', 'Ccontent', 'SGN_P-7_A', 'SGN_P-15_A', 'SGN_P-20_T', 'SGN_P-3_A', 'Context_P1_C', 'Context_P-1_C', 'SGN_P-14_T', 'Context_P-3_G']
fh = ['Editing_Position', 'SGN_P3_T', 'Tcontent', 'SGN_P-8_T', 'Context_P-1_T', 'SGN_P-23_G', 'Gcontent', 'SGN_P-11_T', 'SGN_P9_T', 'SGN_P-14_G','SGN_P-26_G', 'SGN_P-28_C', 'Context_P1_C', 'SGN_P-26_C', 'SGN_P-17_G', 'Context_P-3_G', 'SGN_P-8_G', 'SGN_P-24_C', 'SGN_P-7_C', 'SGN_P-3_C', 'SGN_P10_G', 'SGN_P-14_T', 'SGN_P8_G', 'SGN_P-21_T', 'SGN_P-10_G', 'SGN_P-4_A', 'SGN_P-4_G', 'SGN_P-15_A', 'SGN_P1_G', 'SGN_P8_T', 'SGN_P10_C', 'SGN_P-28_T', 'SGN_P-14_A', 'SGN_P-9_A', 'SGN_P-15_T','SGN_P4_G', 'SGN_P1_T', 'SGN_P-24_G', 'SGN_P1_C', 'SGN_P-21_A', 'SGN_P-18_C', 'SGN_P-12_G', 'SGN_P-5_A', 'SGN_P-7_G', 'SGN_P-22_T', 'SGN_P-31_T', 'SGN_P-7_T', 'SGN_P-2_G','SGN_P7_T', 'SGN_P-21_C', 'SGN_P2_C', 'SGN_P-2_A', 'SGN_P-1_G', 'SGN_P-9_G', 'SGN_P-16_C', 'SGN_P-22_C', 'SGN_P2_A', 'SGN_P-12_A', 'SGN_P-20_G', 'SGN_P-28_A', 'SGN_P-30_A', 'SGN_P-19_C', 'SGN_P8_C', 'SGN_P-10_C', 'SGN_P3_C', 'SGN_P-19_T', 'SGN_P-1_C', 'SGN_P-6_C', 'SGN_P7_A', 'SGN_P3_G', 'SGN_P-24_A', 'SGN_mt_w_24', 'SGN_P-25_C', 'SGN_P-12_C', 'SGN_P4_A', 'SGN_P-20_T', 'SGN_P9_C', 'SGN_P-13_T', 'SGN_P-29_A', 'SGN_P-31_C', 'SGN_P-27_A', 'SGN_P7_C', 'SGN_P-18_T', 'SGN_P-9_C', 'SGN_P-1_A', 'Context_P1_T', 'SGN_P-17_T', 'SGN_P-5_T', 'SGN_P-23_A', 'SGN_P2_G', 'SGN_P-9_T', 'SGN_P-25_G', 'SGN_P-25_T', 'SGN_P10_T', 'SGN_P-15_C', 'SGN_P-6_A', 'SGN_P-31_A', 'SGN_P-30_C', 'SGN_P4_T', 'SGN_P-1_T', 'SGN_P-17_A', 'SGN_P-11_A', 'SGN_P-31_G', 'SGN_P-2_C', 'SGN_P-10_A', 'SGN_P-3_G', 'SGN_P-12_T', 'Editing_mt', 'SGN_P-14_C', 'SGN_mt_w_28', 'SGN_mt_w_20', 'SGN_P-21_G', 'SGN_P-17_C', 'SGN_P-16_G', 'SGN_P1_A', 'SGN_P3_A', 'SGN_P-24_T', 'SGN_P-23_T', 'SGN_mt_w_4', 'SGN_P-22_G', 'SGN_P-6_T', 'SGN_P-29_C', 'SGN_P7_G', 'SGN_P-18_A', 'SGN_P-13_C', 'SGN_P-13_G', 'SGN_P-18_G', 'SGN_P-27_T', 'SGN_P-3_T', 'SGN_P-25_A', 'SGN_P-11_C', 'Context_P-2_C', 'SGN_P-29_T', 'SGN_P-8_C', 'SGN_mt_w_8', 'SGN_mt_w_12', 'SGN_mt_w_16', 'SGN_P-3_A', 'SGN_P-15_G', 'SGN_P-8_A', 'SGN_P-2_T', 'SGN_P-30_G', 'SGN_P-4_C', 'SGN_P-6_G', 'SGN_P-27_C', 'SGN_mt_w_0', 'SGN_P-19_G', 
'SGN_P-23_C', 'Context_P3_C', 'SGN_P10_A', 'SGN_P-13_A', 'Acontent', 'SGN_P-16_T', 'SGN_P-30_T', 'SGN_P-27_G', 'SGN_P2_T', 'SGN_P8_A', 'Context_P3_G', 'Context_P3_A', 'SGN_P-19_A', 'Context_P-3_A', 'Context_P1_A', 'SGN_P-5_G', 'Context_P-3_C', 'Context_P2_C', 'Context_P2_A', 'SGN_P4_C', 'SGN_P9_G', 'GCcontent', 'Context_P3_T', 'Context_P2_G', 'Context_P2_T', 'Context_P-2_T', 'Context_P-2_G', 'Context_P-3_T', 'Context_P-2_A', 'Ccontent', 'Context_P-1_A', 'SGN_P-7_A', 'Context_P-1_G', 'Context_P1_G', 'SGN_P-28_G', 'Context_P-1_C']

In [7]:
# Drop every feature whose name starts with 'Editing'.
# A new list is built instead of calling amp.remove() inside a loop over amp:
# removing while iterating shifts the remaining items, so the element right
# after each removed one is never examined. This form is also idempotent.
amp = [name for name in amp if not name.startswith('Editing')]

In [8]:
# Keep the cv features that are neither explicitly unwanted nor prefixed with
# ``begin``.
feature_list = [
    name
    for name in cv
    if not (name in unwanted_list or name.startswith(begin))
]

In [9]:
# Combine every unordered pair of selected features into one categorical
# column: the row-wise value pair (a, b) is mapped to a code 0-3 through
# bin_dict; any other value pair becomes None.
bin_dict = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}

feature_pairs = [
    (first, second)
    for position, first in enumerate(feature_list)
    for second in feature_list[position + 1:]
]

overall_results = {
    f"paired_{first}_{second}": [
        bin_dict.get(value_pair)
        for value_pair in zip(df_editing[first].tolist(), df_editing[second].tolist())
    ]
    for first, second in feature_pairs
}

# Append the paired columns to the feature matrix and keep the efficiencies
# as a plain list.
paired_dataframe = pd.DataFrame(overall_results)
result_df = pd.concat([df_editing, paired_dataframe], axis=1)
eff_list = result_df['eff'].tolist()


In [10]:
def split_string(input_string, possible_substrings):
    """Split ``input_string`` into the known substrings it contains.

    Candidates are tried longest first, and each match is blanked out before
    the next candidate is tried, so a shorter name contained in a longer one
    (e.g. 'Ccontent' inside 'GCcontent') is not reported as well.

    Parameters
    ----------
    input_string : str
        Text to search, e.g. two feature names joined by an underscore.
    possible_substrings : iterable of str
        Candidate substrings. The argument is NOT modified; earlier versions
        sorted the caller's list in place, silently reordering it.

    Returns
    -------
    list of str
        The candidates found, ordered by their position in ``input_string``.
    """
    found_substrings = []

    # sorted() works on a copy; the sort is stable, so candidates of equal
    # length keep the caller's relative order.
    candidates = sorted(possible_substrings, key=len, reverse=True)

    for substring in candidates:
        pos = input_string.find(substring)
        if pos != -1:
            found_substrings.append((pos, substring))
            # Blank out the match so shorter candidates cannot match inside it.
            input_string = input_string.replace(substring, ' ' * len(substring), 1)

    found_substrings.sort()
    return [substring for _, substring in found_substrings]

In [11]:
def test_significant_category(col_list, eff_list):
    df = pd.DataFrame({
        'Category': col_list,
        'Efficiency': eff_list
    })
    mean_efficiency = df.groupby('Category')['Efficiency'].mean().reset_index()
    mean_efficiency_list = list(mean_efficiency.itertuples(index=False, name=None))
    if mean_efficiency.empty:
        return None, pd.DataFrame()
    
    index_of_max = mean_efficiency['Efficiency'].idxmax()
    target_category = mean_efficiency.loc[index_of_max, 'Category']
    
    p_values = []
    comparisons = []
    
    target_data = df[df['Category'] == target_category]['Efficiency']
    is_universally_significant = True
    
    for category in mean_efficiency['Category']:
        if category != target_category:
            other_data = df[df['Category'] == category]['Efficiency']
            stat, p_value = scipy.stats.mannwhitneyu(target_data, other_data, alternative='two-sided')
            p_values.append(p_value)
            comparisons.append(f"{target_category} vs {category}")
            
            if p_value >= 0.05:
                is_universally_significant = False

    combined_pvalues = scipy.stats.combine_pvalues(p_values, method='fisher')[1]
    p_value_df = pd.DataFrame({
        'Comparison': comparisons,
        'P-value': p_values,
        'Combined P-value': [combined_pvalues] * len(comparisons),
        'Mean Efficiency': [mean_efficiency_list] * len(comparisons) 
    })
    
    if is_universally_significant:
        return target_category, p_value_df
    else:
        return None, p_value_df



In [12]:
# Features that appear in both the cv and the fh list.
cv_set, fh_set = set(cv), set(fh)
intersection = cv_set & fh_set

In [13]:
# Features that appear in the cv list, the fh list, or both.
combined = cv_set | fh_set

In [14]:
# Load the stored p-value tables for the cv, fh and amp analyses (in that order).
cv_df, fh_df, amp_df = map(
    pd.read_csv,
    (
        "~/internship/feature_extraction/cv_p_values.csv",
        "~/internship/feature_extraction/fh_p_values.csv",
        "~/internship/feature_extraction/amp_p_values.csv",
    ),
)

In [15]:
# Values of the 'column' field that occur in all three p-value tables.
set_amp, set_fh, set_cv = (
    set(frame['column']) for frame in (amp_df, fh_df, cv_df)
)

common_elements = list(set_amp.intersection(set_cv, set_fh))

print(common_elements)


['paired_SGN_P2_G_SGN_P-27_G']


In [73]:
# Re-read the raw summary CSV and rebuild the feature matrix, rebinding
# ``result_df``.
# NOTE(review): unlike the first load cell, no target_no_pam / pam columns are
# derived here and no spacer_length filter is applied -- confirm this is intended.
# NOTE(review): rebinding ``result_df`` replaces the frame that carried the
# ``paired_*`` columns, which the plotting cell below reads; the execution
# counts (73 here, 20 below) suggest this cell was run out of order -- verify
# the notebook still works under Restart & Run All.
df = pd.read_csv("~/internship/p12_editing_summary.csv")
result_df = generate_feature_mx(df)

In [20]:
# Bar plot of efficiency per category code of a paired feature, annotated
# with Mann-Whitney tests of the reference category against every other one.
# A small table under the axis decodes each code into the two feature values.
plot_dir = "./new"
os.makedirs(plot_dir, exist_ok=True)

# Category code -> [value of first feature, value of second feature].
code_to_pair = {0: [0, 0], 1: [0, 1], 2: [1, 0], 3: [1, 1]}

for x in ['paired_SGN_P3_T_SGN_P7_A']:
    y = "eff"
    sub_df = fh_df[fh_df['column'] == x]

    # Each 'Comparison' string looks like "<main> vs <other>": the left-hand
    # code is the reference category, the right-hand one a category it was
    # compared against.
    order = []
    main = -1
    for comparison in sub_df['Comparison'].tolist():
        for index, num in enumerate(comparison.split("vs")):
            num = int(num.strip())
            if index == 0:
                main = num
                if num not in order:
                    order.append(num)
            else:
                order.append(num)

    order.sort()
    print(order)
    print(sub_df)
    print(x)

    ax = sns.barplot(data=result_df, x=x, y=y, order=order)
    # The x axis is left blank; the table below the plot labels the bars.
    ax.set_xlabel("")
    ax.set_xticklabels([])
    ax.set_xticks([])
    ax.grid(True, linestyle='--', linewidth=0.5, color='grey', axis='y', alpha=0.7)

    # Compare the reference category against each of the others.
    pairs = [(main, code) for code in order if code != main]

    annotator = Annotator(ax, pairs, data=result_df, x=x, y=y, order=order)
    annotator.configure(test='Mann-Whitney', text_format='full', loc='outside', show_test_name=False)
    annotator.apply_and_annotate()

    # One column per plotted code, one row per feature of the pair.
    summary_table = pd.DataFrame({code: code_to_pair[code] for code in order})
    # Drop the first '_'-delimited token of the column name to recover the
    # two feature names used as row labels.
    x_cropped = x.partition("_")[2]
    row_labels = split_string(x_cropped, feature_list)
    colors = plt.cm.BuPu(np.linspace(0, 0.5, len(summary_table)))
    colors = colors[::-1]

    the_table = plt.table(cellText=summary_table.values,
                          rowLabels=row_labels,
                          cellLoc='center', rowLoc='center',
                          rowColours=colors,
                          loc='bottom',
                          bbox=[0.001, -0.12, 1, 0.1])

    plt.savefig(os.path.join(plot_dir, f"{x}_plot.png"), bbox_inches='tight')
    plt.savefig(os.path.join(plot_dir, f"{x}_plot.pdf"), bbox_inches='tight')
    plt.close()


[0, 1, 2, 3]
                     column Comparison       P-value  Combined P-value  \
6  paired_SGN_P3_T_SGN_P7_A     1 vs 0  1.126234e-07     3.052329e-112   
7  paired_SGN_P3_T_SGN_P7_A     1 vs 2  1.693489e-77     3.052329e-112   
8  paired_SGN_P3_T_SGN_P7_A     1 vs 3  4.447524e-33     3.052329e-112   

                                     Mean Efficiency  
6  [(0, 0.09973277164291978), (1, 0.1300039195619...  
7  [(0, 0.09973277164291978), (1, 0.1300039195619...  
8  [(0, 0.09973277164291978), (1, 0.1300039195619...  
paired_SGN_P3_T_SGN_P7_A
0 vs. 1: Mann-Whitney-Wilcoxon test two-sided, P_val:1.324e-06 U_stat=5.813e+04
1 vs. 2: Mann-Whitney-Wilcoxon test two-sided, P_val:3.589e-19 U_stat=2.770e+04
1 vs. 3: Mann-Whitney-Wilcoxon test two-sided, P_val:2.477e-09 U_stat=1.835e+04


In [60]:
# Feature names of interest; pairs containing any of them are collected below.
# ('SGN_P3_T' is listed twice, which has no effect on membership tests.)
targets = [
    'SGN_P3_T', 'SGN_P2_G',
    'SGN_P7_A', 'SGN_P7_T', 'SGN_P7_C', 'SGN_P7_G',
    'SGN_P3_T',
    'SGN_P9_A', 'SGN_P9_T', 'SGN_P9_C', 'SGN_P9_G',
    'SGN_P8_A', 'SGN_P8_T', 'SGN_P8_C', 'SGN_P8_G',
]

In [61]:
# Collect, for every row of cv_df, the feature names found in its 'column'
# value whenever at least one of them is a target feature.
cv_targets = []
for column_name in cv_df['column']:
    # Drop the first '_'-delimited token, then recover the feature names.
    row_labels = split_string(column_name.partition("_")[2], cv)
    # any() appends the labels once per row; the previous loop appended them
    # once per matching label, duplicating rows where both features are targets.
    if any(label in targets for label in row_labels):
        cv_targets.append(row_labels)

In [62]:
# Collect, for every row of fh_df, the feature names found in its 'column'
# value whenever at least one of them is a target feature.
fh_targets = []
for column_name in fh_df['column']:
    # Drop the first '_'-delimited token, then recover the feature names.
    row_labels = split_string(column_name.partition("_")[2], fh)
    # any() appends the labels once per row; the previous loop appended them
    # once per matching label, duplicating rows where both features are targets.
    if any(label in targets for label in row_labels):
        fh_targets.append(row_labels)

In [63]:
# Collect, for every row of amp_df, the feature names found in its 'column'
# value whenever at least one of them is a target feature.
amp_targets = []
for column_name in amp_df['column']:
    # Drop the first '_'-delimited token, then recover the feature names.
    row_labels = split_string(column_name.partition("_")[2], amp)
    # any() appends the labels once per row; the previous loop appended them
    # once per matching label, duplicating rows where both features are targets.
    if any(label in targets for label in row_labels):
        amp_targets.append(row_labels)

In [64]:
# Sort each collected label list in place so that lists holding the same
# features compare equal regardless of the order they were found in.
for label_lists in (amp_targets, fh_targets, cv_targets):
    for labels in label_lists:
        labels.sort()

In [65]:
# Print every fh label list that also occurs in both the cv and amp results.
for labels in fh_targets:
    if labels in cv_targets and labels in amp_targets:
        print(labels)

['SGN_P-27_G', 'SGN_P2_G']
['SGN_P-27_G', 'SGN_P2_G']
['SGN_P-27_G', 'SGN_P2_G']
