# TESTING CORRELATIONS BETWEEN MICROBIOME AND RESISTOME
We want to see if there's any direct correlation between any particular zOTU and any particular ARG and MGE. To begin with, I'll try just with taxa that make up each core, to deal with way less data at once 

In [1]:
import os

import pandas as pd
from scipy import stats as stat
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import numpy as np

from tqdm.notebook import tqdm

In [2]:
# I'm lazy so I'm going to hardcode the file routes to begin with
micro_df = pd.read_csv("../data/r_data/rclr_counts_resistome_format.csv")
micro_df.rename(columns = {"Unnamed: 0": "ZOTU"}, inplace = True) # this works because the unnamed column gets automatically assigned that name 
micro_df

Unnamed: 0,ZOTU,argl_1,argl_16,argl_18,argl_22,argl_2,argl_23,argl_24,argl_25,argl_26,...,argl_10,argl_11,argl_12,argl_4,argl_5,argl_6,argl_13,argl_14,argl_17,argl_15
0,zOTU_1038,0.000000,0.000000,0.000000,-0.562286,0.000000,-0.812174,-1.057733,-1.305141,0.000000,...,0.000000,-0.526270,2.054452,0.00000,0.000000,0.000000,0.0,-1.0,0.0,0.0
1,zOTU_255,0.000000,-0.103321,0.000000,0.000000,-2.687746,0.000000,0.000000,0.000000,0.000000,...,-0.972064,1.334483,-0.205574,0.00000,-2.893764,-1.802858,2.0,2.0,-1.0,-1.0
2,zOTU_885,0.381853,1.374781,-0.222600,-1.168421,0.808762,-2.891616,-0.605748,-1.815967,-1.589599,...,-1.195208,0.000000,0.000000,0.00000,0.000000,0.000000,1.0,0.0,-1.0,1.0
3,zOTU_638,0.000000,0.000000,0.000000,0.407115,0.000000,0.574120,0.292194,1.998076,1.067157,...,-1.482890,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0
4,zOTU_1166,-1.322895,-1.281976,-2.707506,0.000000,-1.994599,0.000000,0.000000,1.442130,-0.203305,...,-2.581502,0.000000,-0.744570,0.00000,0.000000,0.000000,-1.0,0.0,-2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2032,zOTU_1599,0.000000,0.000000,0.000000,0.889967,0.000000,-0.119027,-0.295593,-1.815967,-1.589599,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0
2033,zOTU_1336,0.000000,0.000000,0.000000,0.130862,0.000000,-0.183565,0.215233,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0
2034,zOTU_910,0.000000,0.000000,0.000000,1.749349,0.000000,1.713555,1.539651,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0
2035,zOTU_575,1.465198,1.864329,-0.142557,0.000000,1.574934,0.000000,0.000000,0.000000,0.000000,...,-0.502060,-0.680421,-1.081043,0.00000,0.000000,0.000000,1.0,-2.0,0.0,1.0


In [3]:
arg_df = pd.read_csv("../data/r_data/count_arg.csv")
mge_df = pd.read_csv("../data/r_data/count_mge.csv")
arg_df.rename(columns = {"Assay": "ARG"}, inplace = True)
mge_df.rename(columns = {"Assay": "MGE"}, inplace = True)
arg_df.head()

Unnamed: 0,ARG,argl_25,argl_27,argl_20,argl_22,argl_26,argl_23,argl_24,argl_19,argl_21,...,argl_12,argl_16,argl_4,argl_5,argl_6,argl_1,argl_2,argl_3,argl_11,argl_14
0,aacC2,0.000222,0.000512,5.6e-05,0.000211,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,aacA/aphD,4.6e-05,3.9e-05,0.0,0.000149,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,aac(6')-II,0.006918,0.006194,0.0,0.006457,0.004699,0.007621,0.00863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aphA3,0.0,0.002323,0.0,0.000818,0.001205,0.000647,0.000738,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,sat4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Checking if colnames are the same
arg_colnames = arg_df.columns.values.tolist()
micro_colnames = micro_df.columns.values.tolist()
mge_colnames = mge_df.columns.values.tolist()
print(sorted(mge_colnames))
print(sorted(arg_colnames))
print(sorted(micro_colnames))

['MGE', 'argl_1', 'argl_10', 'argl_11', 'argl_12', 'argl_13', 'argl_14', 'argl_15', 'argl_16', 'argl_17', 'argl_18', 'argl_19', 'argl_2', 'argl_20', 'argl_21', 'argl_22', 'argl_23', 'argl_24', 'argl_25', 'argl_26', 'argl_27', 'argl_3', 'argl_4', 'argl_5', 'argl_6', 'argl_7', 'argl_8', 'argl_9']
['ARG', 'argl_1', 'argl_10', 'argl_11', 'argl_12', 'argl_13', 'argl_14', 'argl_15', 'argl_16', 'argl_17', 'argl_18', 'argl_19', 'argl_2', 'argl_20', 'argl_21', 'argl_22', 'argl_23', 'argl_24', 'argl_25', 'argl_26', 'argl_27', 'argl_3', 'argl_4', 'argl_5', 'argl_6', 'argl_7', 'argl_8', 'argl_9']
['ZOTU', 'argl_1', 'argl_10', 'argl_11', 'argl_12', 'argl_13', 'argl_14', 'argl_15', 'argl_16', 'argl_17', 'argl_18', 'argl_19', 'argl_2', 'argl_20', 'argl_21', 'argl_22', 'argl_23', 'argl_24', 'argl_25', 'argl_26', 'argl_27', 'argl_3', 'argl_4', 'argl_5', 'argl_6', 'argl_7', 'argl_8', 'argl_9']


Now, it doesn't make sense to check for correlations as is, so I'm going to study plastic and soil correlations by separate

In [5]:
plastic_micro = micro_df[["ZOTU", "argl_7", "argl_8", "argl_9", "argl_10", "argl_11", "argl_12", "argl_15", "argl_16", "argl_17", 
                          "argl_18", "argl_22", "argl_23", "argl_24", "argl_25", "argl_26", "argl_27"]]
plastic_arg = arg_df[["ARG", "argl_7", "argl_8", "argl_9", "argl_10", "argl_11", "argl_12", "argl_15", "argl_16", "argl_17", 
                          "argl_18", "argl_22", "argl_23", "argl_24", "argl_25", "argl_26", "argl_27"]]
plastic_mge = mge_df[["MGE", "argl_7", "argl_8", "argl_9", "argl_10", "argl_11", "argl_12", "argl_15", "argl_16", "argl_17", 
                          "argl_18", "argl_22", "argl_23", "argl_24", "argl_25", "argl_26", "argl_27"]]

In [6]:
def statistic(x, y):  # explore all possible pairings by permuting `x`
    rs = stat.spearmanr(x, y).statistic  # ignore pvalue
    dof = len(x) - 2 # will only work for cases where x and y are equal
    transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
    return transformed

In [7]:
def get_long_count_df(df1, df2, min_data):
    """
    The idea is to get two count dataframes such as the ones employed by phyloseq and TreeSummarizedExperiment and merge them in a long df in which all possible pairings of counted elements are represented.
    So, the final df will have just 4 columns: df1-names, df2-names, df1-names-corresponding-values, df2-names-corresponfing-values. It'll also perform some basic pre-filtering of the data, to get rid of those cases
    in which the count of any of the elmemnts is 0 in all cases and those in which both elements are present at once (!= 0) in less than min_data ocasions (i don't consider correlation results of low datapoints to 
    be credible)
    - df1: the first dataframe to correlate
    - df2 the second dataframe to correlate
    - min_data: minimum amount of samples in which at least one of the observed value is not 0 
    """
    # First, preformat the final df
    df1_var_name = df1.columns[0]
    df2_var_name = df2.columns[0]
    long_df_col3_name = df1_var_name + "_vals"
    long_df_col4_name = df2_var_name + "_vals"
    
    long_df = pd.DataFrame(columns = [df1_var_name, df2_var_name, long_df_col3_name, long_df_col4_name])
    # get a useful variable for filtering later
    total_observations = df1.shape[1]
    #it = 0
    
    for var1 in tqdm(df1[df1_var_name].unique()):
        #it += 1  
        #if it == 3:break
        current_var1 = df1.loc[df1[df1_var_name] == var1] # in the case of a zotu df, this will extract only the values of the row corresponfind to said zotu
        current_var1 = current_var1.T.iloc[1:] # then it flips the row to turn it into a column a removes the first element (which is the name, not a value)
        if current_var1.sum().iloc[0] == 0.0: continue
        for var2 in tqdm(df2[df2_var_name].unique(), leave = False):
            current_var2 = df2.loc[df2[df2_var_name] == var2]
            current_var2 = current_var2.T.iloc[1:]
            if current_var2.sum().iloc[0] == 0.0: continue
            
            # join both of them
            current_both = current_var1.join(current_var2, lsuffix = df1_var_name, rsuffix = df2_var_name)
            current_both.columns = [long_df_col3_name, long_df_col4_name] #give the final thing the same column names as the final df to ease appending them
            
            # A final check for the number of absences
            zero_count = current_both.sum(axis = 1).value_counts()
            if (0.0 in zero_count.index) and ((total_observations - zero_count[0]) < min_data): continue 
            
            # Start creating the final df
            current_both[df1_var_name] = var1
            current_both[df2_var_name] = var2
            long_df = pd.concat([long_df, current_both])
    return long_df.reset_index(drop = True)

In [8]:
lala = get_long_count_df(plastic_micro, plastic_arg, min_data = 7)
lala

  0%|          | 0/2037 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

KeyboardInterrupt: 

If I'm not mistaken, parallelization from here on is "just" a matter of seeing how to split the df along the first axis, letting each core deal with an individual fragment and putting them all together at the end.
This has to be done through a function, so we need to define a function that identifies which data to correlate within a chunk.

In [None]:
lala.to_csv("long_zap.csv")

In [None]:
del lala
lala = get_long_count_df(plastic_micro, plastic_mge, min_data = 7)
lala.to_csv("long_zmp.csv")

In [None]:
del lala
lala = get_long_count_df(plastic_arg, plastic_mge, min_data = 7)
lala.to_csv("long_amp.csv")

In [None]:
del lala
del plastic_micro
del plastic_arg
del plastic_mge

In [None]:
control_micro = micro_df[["ZOTU", "argl_1", "argl_2", "argl_3", "argl_4", "argl_5", "argl_6", "argl_13", "argl_14", "argl_19", 
                          "argl_20", "argl_21"]]
control_arg = arg_df[["ARG", "argl_1", "argl_2", "argl_3", "argl_4", "argl_5", "argl_6", "argl_13", "argl_14", "argl_19", 
                          "argl_20", "argl_21"]]
control_mge = mge_df[["MGE", "argl_1", "argl_2", "argl_3", "argl_4", "argl_5", "argl_6", "argl_13", "argl_14", "argl_19", 
                          "argl_20", "argl_21"]]

In [None]:
lala = get_long_count_df(control_micro, control_arg, min_data = 5)
lala.to_csv("long_zac.csv")
del lala

In [None]:
lala = get_long_count_df(control_micro, control_mge, min_data = 5)
lala.to_csv("long_zmc.csv")
del lala

In [None]:
lala = get_long_count_df(control_arg, control_mge, min_data = 5)
lala.to_csv("long_arc.csv")
del lala

In [None]:
def calculate_correlations (long_df)

In [10]:
res_df = pd.DataFrame(columns = ["zotu", "ARG", "statistic", "p-val"])
#it = 0
for zotu in plastic_micro["zotu_names"].unique():
    current_zotu = plastic_micro.loc[plastic_micro["zotu_names"] == zotu]
    current_zotu = current_zotu.T.iloc[1:]
    if current_zotu.sum().iloc[0] == 0.0:
            #print("zotu fully absent")
            continue
    #print(current_zotu)
    #it2 = 0
    for ARG in plastic_arg["Assay"].unique():
        current_ARG = plastic_arg.loc[plastic_arg["Assay"] == ARG]
        current_ARG = current_ARG.T.iloc[1:]
        #print(current_ARG)
        if current_ARG.sum().iloc[0] == 0.0:
            #print("ARG fully absent")
            continue
        #print(current_ARG)
        #print(zotu, "--", ARG)
        current_both = current_ARG.join(current_zotu, lsuffix = "ARG", rsuffix = "zOTU")
        cb_colnames = current_both.columns.values.tolist()
        #print(current_both)
        zero_count = current_both.sum(axis = 1) # compute sums row-wise. If both ARG and zOTU are absent, the sum will be 0
        try: # if there isn't a single absence, the code would fail on the next line
            zero_count = zero_count.value_counts()[0] # get only the values for 0 (aka, the double absence)
            #print(zero_count)
            if zero_count >= 5: break # I'm arbitrarly setting a minimum of 5 presences to minimize the amount of false positives
            else:
                #print(current_both)
                #print(stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]]))
                #print(stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]])[1])
                
                #ref = stat.permutation_test((lele[2], lele[2493]), statistic, alternative='two-sided', permutation_type='pairings')
                #corr = stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]])
                corr = stat.permutation_test((current_both[cb_colnames[0]], current_both[cb_colnames[1]]), statistic, alternative='two-sided', permutation_type='pairings')
                if corr.pvalue <= 0.05: #before: corr[1]
                    #print("yeeee"
                    current_df = pd.DataFrame([[zotu, ARG, corr.statistic,corr.pvalue]], columns = ["zotu", "ARG", "statistic", "p-val"])
                    res_df = pd.concat(res_df, current_df)
        except:
            #print(current_both)
            #print(stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]]))
            #corr = stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]])
            corr = stat.permutation_test((current_both[cb_colnames[0]], current_both[cb_colnames[1]]), statistic, alternative='two-sided', permutation_type='pairings')
            #if corr[1] <= 0.05: print("haaaaaaaw")
            if corr.pvalue <= 0.05:
                current_df = pd.DataFrame([[zotu, ARG, corr.statistic,corr.pvalue]], columns = ["zotu", "ARG", "statistic", "p-val"])
                res_df = pd.concat([res_df, current_df])
        #it2 += 1
        #if it2 == 2: break
    #it += 1
    #if it == 2: break
res_df

  res_df = pd.concat([res_df, current_df])
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x753ea2b08790>>
Traceback (most recent call last):
  File "/home/pak/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x753ea2b08790>>
Traceback (most recent call last):
  File "/home/pak/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x753ea2b08790>>
Traceback (most recent call last):
  File "/home/pak/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py

KeyboardInterrupt: 

In [12]:
res_df.to_csv("rclr-permu-plas-mge.csv")

In [10]:
soil_micro = micro_df[["zotu_names", "alm2_1", "alm2_2", "alm2_3", "alm2_19", "alm2_20", "alm2_21", "alm2_43", "alm2_44", "alm2_45"]]
soil_arg = arg_df[["Assay", "alm2_1", "alm2_2", "alm2_3", "alm2_19", "alm2_20", "alm2_21", "alm2_43", "alm2_44", "alm2_45"]]
soil_mge = mge_df[["Assay", "alm2_1", "alm2_2", "alm2_3", "alm2_19", "alm2_20", "alm2_21", "alm2_43", "alm2_44", "alm2_45"]]

In [13]:
res_df = pd.DataFrame(columns = ["zotu", "ARG", "statistic", "p-val"])
for zotu in soil_micro["zotu_names"].unique():
    current_zotu = soil_micro.loc[soil_micro["zotu_names"] == zotu]
    current_zotu = current_zotu.T.iloc[1:]
    if current_zotu.sum().iloc[0] == 0.0: continue
    for ARG in soil_arg["Assay"].unique():
        current_ARG = soil_arg.loc[soil_arg["Assay"] == ARG]
        current_ARG = current_ARG.T.iloc[1:]
        if current_ARG.sum().iloc[0] == 0.0: continue
        current_both = current_ARG.join(current_zotu, lsuffix = "ARG", rsuffix = "zOTU")
        cb_colnames = current_both.columns.values.tolist()
        zero_count = current_both.sum(axis = 1) # compute sums row-wise. If both ARG and zOTU are absent, the sum will be 0
        try: # if there isn't a single absence, the code would fail on the next line
            zero_count = zero_count.value_counts()[0] # get only the values for 0 (aka, the double absence)
            if zero_count >= 5: break # I'm arbitrarly setting a minimum of 5 presences to minimize the amount of false positives
            else:
                #corr = stat.spearmanr(current_both[cb_colnames[0]], current_both[cb_colnames[1]])
                corr = stat.permutation_test((current_both[cb_colnames[0]], current_both[cb_colnames[1]]), statistic, alternative='two-sided', permutation_type='pairings')
                if corr.pvalue <= 0.05: 
                    current_df = pd.DataFrame([[zotu, ARG, corr.statistic,corr.pvalue]], columns = ["zotu", "ARG", "statistic", "p-val"])
                    res_df = pd.concat([res_df, current_df])
        except:
            corr = stat.permutation_test((current_both[cb_colnames[0]], current_both[cb_colnames[1]]), statistic, alternative='two-sided', permutation_type='pairings')
            if corr.pvalue <= 0.05:
                current_df = pd.DataFrame([[zotu, ARG, corr.statistic,corr.pvalue]], columns = ["zotu", "ARG", "statistic", "p-val"])
                res_df = pd.concat([res_df, current_df])
res_df

  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  res_df = pd.concat([res_df, current_df])
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))
  transformed = rs * np.sqrt(dof / ((rs+1.0)*(1.0-rs)))

KeyboardInterrupt



In [None]:
res_df.to_csv("rclr-permu-cont-mge.csv")

#### FIXING STUFF
So, up to this point I've made a mistake and I've kept the statistic when I wanted to save the correlation percentage. This can be fixed while also saving time by going over the results and just recalculating the correlations of those that came up as positive in the original. In the future, this can be fixed to keep both the statistic and the correlation pct. at the same time, but I don't have time to do so right now.

In [29]:
# m = microbiome a = arg t = mge's (t from transposon)
#os.chdir("../results/Corr_results")
m2a_p_corrs = pd.read_csv("rclr-permu-plas.csv").iloc[:, 1:]
m2a_s_corrs = pd.read_csv("final_soil_za.csv").iloc[:, 1:]

m2t_p_corrs = pd.read_csv("rclr-permu-plas-mge.csv").iloc[:, 1:]
m2t_s_corrs = pd.read_csv("final_soil_zm.csv").iloc[:, 1:]

a2t_p_corrs = pd.read_csv("res_corrs-permu-plas.csv").iloc[:, 1:]
a2t_s_corrs = pd.read_csv("final_soil_am.csv").iloc[:, 1:]
#a2t_s_corrs
m2t_s_corrs

Unnamed: 0,zotu,MGE,statistic,p-val
0,zOTU_631,bacA,4.878524,0.0036
1,zOTU_631,orf37-IS26,4.183300,0.0038
2,zOTU_705,fosb,-2.744974,0.0432
3,zOTU_705,bacA,-3.538831,0.0228
4,zOTU_705,orf39-IS26,3.790451,0.0198
...,...,...,...,...
3445,zOTU_521,orf37-IS26,3.866294,0.0140
3446,zOTU_199,dfrA1,-3.731151,0.0276
3447,zOTU_1299,fabK,3.414480,0.0174
3448,zOTU_1779,orf39-IS26,4.458892,0.0112


In [30]:
def fix_results(original_results, data1, data2):
    """
    Get a dataframe in which there are results for significant correlations that we believe, and add to them the missing correlation coefficient value. For that, it loops over
    each pair of related units (i.e.: zOTUs, ARGs or MGEs) by looping over the rows of the results df and then goes to their original dataframes, extracts the values used to 
    calculate the correlation and calculates said correlation, but only keeps the coefficient. 
    When doing so it recreates the original result dataframe, to facilitate a merge at the end. The result will be the original results df with significant correlations, but with 
    an added column indicating the coefficient.
    - original_results: the original results df
    - data1: on of the two dataframes in which the data used to re-calculate spearman's correlation can be found. It MUST correspond to that whose entries are found in the FIRST column of original_results
    (e.g.: if the first column of original_results is a list of MGE names and original_results contains plastic results, data1 must be the plastic-MGE df)
    -data2: same as data1, but for the second column
    """
    original_colnames = original_results.columns.values.tolist()
    #both data1 and data2 will always have a column containing the names of the biological thingy in question, named different whether the data 
    # refer to ARG, MGE or zOTU data. And we need to access it each iteration, so it's better to extract it once now than to access it in a list in the future 
    data1_tname = data1.columns.values.tolist()[0] #it's always in the first position, no matter the df
    data2_tname = data2.columns.values.tolist()[0]
    nres_df = pd.DataFrame(columns = [original_colnames[0], original_colnames[1], "correlation-coef"])
    for i in range(len(original_results)):
        col1_nam = original_results.iloc[i][original_colnames[0]] #it will always be the first one 
        col2_nam = original_results.iloc[i][original_colnames[1]] #idem
        # all dfs have a column full of 0's and no name as the first column, which we can drop
        vals1 = data1.loc[data1[data1_tname] == col1_nam].T.iloc[1:]
        vals2 = data2.loc[data2[data2_tname] == col2_nam].T.iloc[1:]

        current_df = pd.DataFrame([[col1_nam, col2_nam, stat.spearmanr(vals1, vals2).statistic]], 
                                  columns = [original_colnames[0], original_colnames[1], "correlation-coef"])
        nres_df = pd.concat([nres_df, current_df])
    
    original_results["merge_hack"] = original_results[original_colnames[0]] + original_results[original_colnames[1]]
    nres_df["merge_hack"] = nres_df[original_colnames[0]] + nres_df[original_colnames[1]]
    nres_df = nres_df[["correlation-coef", "merge_hack"]]
    fres_df = original_results.merge(nres_df, on = "merge_hack")
    fres_df.drop(columns = ["merge_hack"], inplace = True)
    return fres_df

In [31]:
# So this one is screwed, because it's reversed: ARGs to MGEs, instead of MGE's to ARG's
fix = fix_results(original_results = a2t_s_corrs, data1 = soil_mge, data2 = soil_arg)
fix.to_csv("fixed_res-corrs_soil.csv")
fix

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,MGE,ARG,statistic,p-val,correlation-coef
0,ARR-3,aadB,2.682500,0.0454,0.711967
1,ARR-3,aac3-Via,2.682500,0.0440,0.711967
2,ARR-3,aac(3)-Xab,2.557510,0.0496,0.695015
3,ARR-3,cmlA1,2.942650,0.0320,0.743625
4,ARR-3,VanB,2.557510,0.0476,0.695015
...,...,...,...,...,...
1969,trfa,copA,4.123106,0.0042,0.841625
1970,trfa,sugE,4.878524,0.0034,0.879049
1971,trfa,pbrT,4.183300,0.0042,0.845154
1972,trfa,sul2,4.878524,0.0038,0.879049


In [32]:
fix = fix_results(original_results = a2t_p_corrs, data1 = plastic_mge, data2 = plastic_arg)
fix.to_csv("fixed_res-corrs_plas.csv")
fix

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,MGE,ARG,statistic,p-val,correlation-coef
0,fosb,aac(6')-II,3.085120,0.0340,0.759091
1,fosb,spcN,4.323797,0.0058,0.852981
2,fosb,aac(3)-ib,3.334314,0.0148,0.783349
3,fosb,aac(3)-iid_iii_iif_iia_iie,3.764736,0.0128,0.818165
4,fosb,aac(3)-xaa,4.080754,0.0092,0.839076
...,...,...,...,...,...
1089,pBS228-IncP-1?,lnu(F),3.749412,0.0158,0.817059
1090,pBS228-IncP-1?,cat,3.121261,0.0208,0.762821
1091,pBS228-IncP-1?,cmlV,4.628175,0.0044,0.868156
1092,pBS228-IncP-1?,tetJ,4.884914,0.0066,0.879310


In [33]:
fix = fix_results(original_results = m2t_s_corrs, data1 = soil_micro, data2 = soil_mge)
fix.to_csv("fixed_micro-mge_soil.csv")
fix

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,zotu,MGE,statistic,p-val,correlation-coef
0,zOTU_631,bacA,4.878524,0.0036,0.879049
1,zOTU_631,orf37-IS26,4.183300,0.0038,0.845154
2,zOTU_705,fosb,-2.744974,0.0432,-0.720000
3,zOTU_705,bacA,-3.538831,0.0228,-0.800909
4,zOTU_705,orf39-IS26,3.790451,0.0198,0.820000
...,...,...,...,...,...
3445,zOTU_521,orf37-IS26,3.866294,0.0140,0.825268
3446,zOTU_199,dfrA1,-3.731151,0.0276,-0.815730
3447,zOTU_1299,fabK,3.414480,0.0174,0.790468
3448,zOTU_1779,orf39-IS26,4.458892,0.0112,0.860000


In [34]:
fix = fix_results(original_results = m2t_p_corrs, data1 = plastic_micro, data2 = plastic_mge)
fix.to_csv("fixed_micro-mge_plas.csv")
fix
#m2t_p_corrs

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,zotu,ARG,statistic,p-val,correlation-coef
0,zOTU_48,fabK,4.323797,0.0070,0.852981
1,zOTU_48,orf39-IS26,-2.593355,0.0466,-0.700000
2,zOTU_48,intl3,-3.159293,0.0210,-0.766667
3,zOTU_48,ISEcp1,-3.334314,0.0184,-0.783349
4,zOTU_48,IS1247,-2.853810,0.0284,-0.733333
...,...,...,...,...,...
509,zOTU_1213,IncHI2-smr0018,-6.939567,0.0026,-0.934393
510,zOTU_1213,tra-A,-4.677072,0.0052,-0.870388
511,zOTU_1213,pBS228-IncP-1?,-3.194793,0.0218,-0.770183
512,zOTU_337,int1-a-marko,6.015936,0.0004,0.915386


In [35]:
fix = fix_results(original_results = m2a_s_corrs, data1 = soil_micro, data2 = soil_arg)
fix.to_csv("fixed_micro-arg_soil.csv")
fix
#m2a_s_corrs

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,zotu,ARG,statistic,p-val,correlation-coef
0,zOTU_705,aac(6')-II,-2.744974,0.0400,-0.720000
1,zOTU_1921,aadA9,3.029830,0.0278,0.753235
2,zOTU_1921,aadA7,-2.406082,0.0494,-0.672804
3,zOTU_1921,nimE,-3.000000,0.0244,-0.750000
4,zOTU_1921,erm(36),2.525588,0.0436,0.690490
...,...,...,...,...,...
7116,zOTU_84,sul2,3.385020,0.0210,0.787888
7117,zOTU_84,dfra21,3.106927,0.0250,0.761351
7118,zOTU_84,dfrA8,3.474416,0.0240,0.795589
7119,zOTU_84,dfrA10,3.106927,0.0254,0.761351


In [36]:
fix = fix_results(original_results = m2a_p_corrs, data1 = plastic_micro, data2 = plastic_arg)
fix.to_csv("fixed_micro-arg_plas.csv")
fix

  nres_df = pd.concat([nres_df, current_df])


Unnamed: 0,zotu,ARG,statistic,p-val,correlation-coef
0,zOTU_137,aac(6')-Ib,3.524542,0.0118,0.799745
1,zOTU_137,KPC,-2.626687,0.0454,-0.704545
2,zOTU_137,blaOXY-1,2.495299,0.0492,0.686120
3,zOTU_137,adeI,2.542500,0.0484,0.692898
4,zOTU_137,erm(Q),-2.515348,0.0462,-0.689021
...,...,...,...,...,...
2162,zOTU_494,aac(3)-id_ie,2.630163,0.0414,0.705015
2163,zOTU_494,norA,-2.588723,0.0418,-0.699362
2164,zOTU_494,lnuA,3.120758,0.0286,0.762770
2165,zOTU_494,dfrK,4.426610,0.0076,0.858366
