In [1]:
import numpy
import pandas as pd
import sys
sys.path.append("../")
import AGG

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# Mitochondrial Genome

# Bacterial Pangenome

## 1. Using bifrost colors as input to inference pedegree
## 2. Use pedegrees to facilitiate GWAS analysis

In [2]:
graph_file = "./P_aeruginosa_anchor_gg.gfa"
graph = AGG.GraphicalGenome(graph_file)

In [3]:
edgelist = sorted(graph.edges.keys())
for edge in edgelist:
    strainlist = []
    readlist = graph.edges[edge]['reads']
    for read in readlist:
        itemlist = read.split(' ')
        if len(itemlist)>1:
            strainlist.append(itemlist[4])
        else:
            strainlist.append(itemlist[0])
    graph.edges[edge]['strain'] = strainlist


In [4]:
strain_info = './Pseudomonas_aeruginosa/strains'
with open(strain_info, 'r') as fp:
    data = fp.readlines()
data.pop(0)
len(data)

282

In [5]:
Strains = [item.split('\t')[0] for item in data] + ["PAO1"]
df = pd.DataFrame(0, columns = Strains, index = edgelist)
for edge in edgelist:
    strainlist = list(set(graph.edges[edge]['strain']))
    df.loc[edge, strainlist] = 1

In [6]:
df = df.drop(columns=['PAO1'])

In [7]:
df = df.T
df.shape

(282, 3126)

In [8]:
df.head()

Unnamed: 0,E00000.0004,E00000.0014,E00000.0020,E00000.0025,E00000.0036,E00003.0001,E00004.0001,E00004.0005,E00005.0001,E00005.0004,...,E30483.0074,E30483.0075,E30483.0076,E30483.0077,E30483.0078,E30483.0079,E30483.0080,E30483.0081,E30483.0082,E30483.0083
WH-SGI-V-07060,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07058,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07057,0,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07074,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07068,0,0,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# exclude MAF > 0.01 < 0.99

In [9]:
new_columns = df.columns[(df.sum()/float(df.shape[0]))>0.01]
n_df = df[new_columns]
new_columns = n_df.columns[(n_df.sum()/float(n_df.shape[0]))<0.99]
n_df = n_df[new_columns]
n_df.shape

(282, 2931)

In [10]:
# add groups

# group = []
# for edge in edgelist:
#     group.append(graph.incoming[edge][0])

# D = dict(zip(sorted(group), range(len(group))))

# df['group'] = [D[g] for g in group]
n_df.head()

Unnamed: 0,E00000.0004,E00000.0014,E00000.0020,E00000.0025,E00000.0036,E00003.0001,E00004.0001,E00004.0005,E00005.0001,E00005.0004,...,E21373.0000,E21374.0003,E21385.0000,E21966.0006,E22717.0001,E25448.0000,E25551.0000,E26963.0001,E28314.0002,E28441.0000
WH-SGI-V-07060,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07058,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07057,0,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07074,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07068,0,0,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
n_df.to_csv("strain_edge_mt.csv")

In [12]:
d = pd.read_csv("strain_edge_mt.csv")

In [15]:
n_df

Unnamed: 0,E00000.0004,E00000.0014,E00000.0020,E00000.0025,E00000.0036,E00003.0001,E00004.0001,E00004.0005,E00005.0001,E00005.0004,...,E21373.0000,E21374.0003,E21385.0000,E21966.0006,E22717.0001,E25448.0000,E25551.0000,E26963.0001,E28314.0002,E28441.0000
WH-SGI-V-07060,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07058,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07057,0,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07074,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07068,0,0,1,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WH-SGI-V-07311,0,0,1,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07318,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07320,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
WH-SGI-V-07326,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1


# PCA

In [16]:
from sklearn.decomposition import PCA

In [24]:
pca = PCA(n_components=2,svd_solver='full') # sample vs features

X_fit = pca.fit(n_df)

In [26]:
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

X_fit.components_

[0.1559098  0.12435987]
[62.12679369 55.4858638 ]


array([[-0.00020762, -0.00012423, -0.00365195, ...,  0.00980515,
         0.00939227,  0.0094628 ],
       [-0.00252153, -0.00184542, -0.05729172, ...,  0.00663478,
         0.00629208,  0.00632908]])

# linear mixed model

In [29]:
phenotype = [item.split('\t')[1] for item in data]
#df = df.T
n_df.shape, len(phenotype)

((282, 2931), 282)

In [30]:
n_df['y'] = phenotype
#df["groups"] = 0 

In [32]:
n_df.to_csv("strain_edge_mt.csv")

In [132]:
all_columns = "+".join(df.columns[:-1])
#all_columns

In [136]:
my_formula = "y~" + all_columns
m = smf.mixedlm(my_formula, df, groups = df['groups']).fit()

print(m.summary())

RecursionError: maximum recursion depth exceeded