In [None]:
import pandas as pd
import numpy as np

In [None]:
from itertools import combinations
from upsetplot import from_memberships
from upsetplot import plot
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'

def intersect(data, upset_plot=False):
    """Return every distinct (mutually exclusive) intersection between columns.

    Each column of ``data`` is treated as one group: missing values are
    dropped and the remaining values are reduced to a set. Every item is then
    reported in exactly one row of the result - the row whose label names
    precisely the groups that contain the item. An item found in groups A and
    B but not in C is therefore listed under ``(A, B)`` only, never under
    ``A``, ``B`` or ``(A, B, C)``.

    Parameters
    ----------
    data : pandas.DataFrame
        One group per column. Column labels are converted to ``str`` for the
        output labels; ``data`` itself is left unmodified.
    upset_plot : bool, default False
        When true, the UpSet plot is drawn with ``upsetplot.plot``.

    Returns
    -------
    df_final : pandas.DataFrame
        Columns ``"Intersection"``, ``"Match"`` and ``"Counts"``. The first
        rows describe the single groups (label is the column name, in column
        order); they are followed by one row per combination of 2..n groups
        (label is a tuple of column names, in ``itertools.combinations``
        order). ``"Match"`` is the list of items belonging to exactly that
        set of groups (in no particular order) and ``"Counts"`` its length.
    upset : object returned by ``upsetplot.from_memberships``
        The ``"Counts"`` column keyed by group membership, i.e. the input
        expected by ``upsetplot.plot``.

    Raises
    ------
    ValueError
        If ``data`` has no columns, or if two column labels become identical
        once converted to ``str`` (the groups would be indistinguishable).
    """
    col_names = [str(label) for label in data.columns]
    if not col_names:
        raise ValueError("data must contain at least one column")
    if len(set(col_names)) != len(col_names):
        raise ValueError(
            "column labels must be unique after conversion to str: "
            f"{col_names}"
        )

    # Columns are read by position, so the caller's DataFrame never has to be
    # renamed in place to match the stringified labels.
    groups = {
        name: set(data.iloc[:, position].dropna().to_list())
        for position, name in enumerate(col_names)
    }

    tot_elements = [[name, len(items)] for name, items in groups.items()]
    print("Total unique number of items", tot_elements)

    # Record, for every item, the groups that contain it. Groups are visited
    # in column order, so each owner list is already ordered the same way as
    # the tuples produced by itertools.combinations below.
    owners = {}
    for name, items in groups.items():
        for item in items:
            owners.setdefault(item, []).append(name)

    # Bucket the items by their exact set of owning groups. A bucket is, by
    # construction, the intersection of its groups minus every other group.
    exclusive = {}
    for item, names in owners.items():
        exclusive.setdefault(tuple(names), []).append(item)

    # Single groups first, then all combinations of increasing size.
    labels = list(col_names)
    memberships = [[name] for name in col_names]
    matches = [exclusive.get((name,), []) for name in col_names]
    for size in range(2, len(col_names) + 1):
        for combo in combinations(col_names, size):
            labels.append(combo)
            memberships.append(combo)
            matches.append(exclusive.get(combo, []))

    df_final = pd.DataFrame(
        {
            "Intersection": labels,
            "Match": matches,
            "Counts": [len(found) for found in matches],
        }
    )

    # from_memberships wants every membership as an iterable of group names,
    # hence the one-element lists used for the single-group rows above.
    upset = from_memberships(memberships, data=df_final["Counts"])

    if upset_plot:
        plot(upset)

    return df_final, upset

<img src="./images/intersection.png" width="500" height="800">

This function finds the distinct (mutually exclusive) sets among any number of groups of any size. It returns the groups being compared, the list of matches for each comparison, and the number of unique elements in each match. It is useful in proteomic and transcriptomic analysis for finding distinct matches among all combinations of groups. In addition, an UpSet plot can be generated, and the data in the format used for the UpSet plot is returned.

# Test 1 (Small Dataset)

In [None]:
# Three groups of four single-letter items, one row per group: "a" occurs in
# every row, "d" in the first two rows and "c" in the first and third rows.
arr = np.array([list(group) for group in ("abcd", "aefd", "aczl")])

In [None]:
# Transpose so that every group becomes a column, and label the columns with
# integers (i.e. non-string column names).
df = pd.DataFrame(arr.T, columns=[0, 1, 2])

In [None]:
df

In [None]:
data,upset = intersect(df, upset_plot = True)
#plt.savefig("upset.svg")

In [None]:
data

In [None]:
upset

# Test 2 (Small Dataset)

In [None]:
# Five groups of four single-letter items, one row per group.
arr = np.array(
    [list(group) for group in ("abcd", "aefd", "aczl", "xcwl", "aczy")]
)

In [None]:
# One column per group, labelled "a" through "e".
df = pd.DataFrame(arr.T, columns=list("abcde"))

In [None]:
df

In [None]:
# NOTE(review): df[0:3] is a row slice - it keeps the first three rows while
# all five columns ("a"-"e") are still compared. If the intent was to compare
# only the first three groups, a column selection such as df.iloc[:, 0:3]
# would be needed - confirm.
data,upset = intersect(df[0:3], upset_plot = True)

In [None]:
data

In [None]:
upset

# Test 3 (Medium Dataset)

In [None]:
# Three independent samples of 1000 draws from N(10, 2), rounded to two
# decimals so that values can repeat within and across samples.
arr_1, arr_2, arr_3 = (
    np.random.normal(loc=10, scale=2, size=1000).round(2) for _ in range(3)
)

In [None]:
# One column per sample, named after the experiment it stands for.
df = pd.DataFrame({"Exp1": arr_1, "Exp2": arr_2, "Exp3": arr_3})

In [None]:
df

In [None]:
data,upset = intersect(df, upset_plot = True)

In [None]:
data

In [None]:
upset

# Test 4 (Medium High Dataset)

In [None]:
# Same setup as the medium test, with 10000 draws per sample.
arr_1, arr_2, arr_3 = (
    np.random.normal(loc=10, scale=2, size=10000).round(2) for _ in range(3)
)

In [None]:
# One column per sample, named after the experiment it stands for.
df = pd.DataFrame({"Exp1": arr_1, "Exp2": arr_2, "Exp3": arr_3})

In [None]:
df

In [None]:
data,upset = intersect(df, upset_plot = True)

In [None]:
data

In [None]:
upset

# Test 5 (Unequal length dataset)

In [None]:
# Three samples of different lengths (800, 1000 and 1200 draws) from
# N(10, 2), rounded to two decimals.
arr_1, arr_2, arr_3 = (
    np.random.normal(loc=10, scale=2, size=count).round(2)
    for count in (800, 1000, 1200)
)

In [None]:
# Wrap each sample in its own single-column frame so they can be joined
# side by side despite their different lengths.
df_1, df_2, df_3 = (
    pd.DataFrame(sample) for sample in (arr_1, arr_2, arr_3)
)

In [None]:
# Join the three frames column-wise and name the columns after the
# experiments.
unequal = pd.concat([df_1, df_2, df_3], axis=1).set_axis(
    ["Exp1", "Exp2", "Exp3"], axis=1
)

In [None]:
unequal

In [None]:
data,upset = intersect(unequal, upset_plot = True)

In [None]:
data

In [None]:
upset

This function passed all of the tests above. It handles large datasets well and returns the proper matches and counts. A proper pytest test suite will be developed later.

# Using package

In [None]:
import gen_analysis as gn

In [None]:
data, upset = gn.intersect(unequal, upset_plot=True)

In [None]:
data

In [None]:
upset

In [None]:
#numbers add up for exp 1
# NOTE(review): the four counts below are hard-coded, presumably copied from
# the output of one particular run. No random seed is set in the visible
# cells that build `unequal`, so a re-run will most likely yield other counts.
65+73+97+245

In [None]:
# Spot check: list the columns of `unequal` in which the value 4.25 occurs,
# to compare against the intersection it was reported under.
i, c = (unequal.values == 4.25).nonzero()
np.unique(unequal.columns[c])

In [None]:
# Spot check: list the columns of `unequal` in which the value 2.71 occurs,
# to compare against the intersection it was reported under.
i, c = (unequal.values == 2.71).nonzero()
np.unique(unequal.columns[c])

In [None]:
# Spot check: list the columns of `unequal` in which the value 8.78 occurs,
# to compare against the intersection it was reported under.
i, c = (unequal.values == 8.78).nonzero()
np.unique(unequal.columns[c])

In [None]:
# Spot check: list the columns of `unequal` in which the value 10.44 occurs,
# to compare against the intersection it was reported under.
i, c = (unequal.values == 10.44).nonzero()
np.unique(unequal.columns[c])

In [None]:
# Spot check: list the columns of `unequal` in which the value 7.5 occurs,
# to compare against the intersection it was reported under.
i, c = (unequal.values == 7.5).nonzero()
np.unique(unequal.columns[c])

Everything looks good!