In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.10.1
pandas  1.0.4
numpy   1.18.4



In [2]:
# Download URLs of the GTDB release 95 metadata archives; both live under the
# same release directory, so the shared part is spelled out once.
gtdb_r95_url = "https://data.gtdb.ecogenomic.org/releases/release95/95.0"
bacteria_metadata = gtdb_r95_url + "/bac120_metadata_r95.tar.gz"
archea_metadata = gtdb_r95_url + "/ar122_metadata_r95.tar.gz"

In [3]:
# Directory that receives the downloaded archives and the extracted TSV files.
output_folder = (
    "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95"
    "/metadata"
)

In [4]:
# Make sure the download directory exists before wget/tar write into it.
# `os` is imported with the other modules in the first cell; exist_ok=True
# keeps this cell from failing when the directory is already there.
os.makedirs(output_folder, exist_ok=True)

In [5]:
!wget {bacteria_metadata} --directory-prefix={output_folder}

--2021-02-19 17:45:32--  https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz
Resolving data.gtdb.ecogenomic.org (data.gtdb.ecogenomic.org)... 203.101.230.55
Connecting to data.gtdb.ecogenomic.org (data.gtdb.ecogenomic.org)|203.101.230.55|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40465971 (39M) [application/octet-stream]
Saving to: ‘/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/metadata/bac120_metadata_r95.tar.gz.1’


2021-02-19 17:45:38 (8.58 MB/s) - ‘/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/metadata/bac120_metadata_r95.tar.gz.1’ saved [40465971/40465971]



In [6]:
!wget {archea_metadata} --directory-prefix={output_folder}

--2021-02-19 17:45:38--  https://data.gtdb.ecogenomic.org/releases/release95/95.0/ar122_metadata_r95.tar.gz
Resolving data.gtdb.ecogenomic.org (data.gtdb.ecogenomic.org)... 203.101.230.55
Connecting to data.gtdb.ecogenomic.org (data.gtdb.ecogenomic.org)|203.101.230.55|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 718199 (701K) [application/octet-stream]
Saving to: ‘/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/metadata/ar122_metadata_r95.tar.gz.1’


2021-02-19 17:45:40 (815 KB/s) - ‘/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/metadata/ar122_metadata_r95.tar.gz.1’ saved [718199/718199]



In [7]:
!tar -xvf {output_folder}/bac120_metadata_r95.tar.gz --directory {output_folder}

bac120_metadata_r95.tsv


In [8]:
!tar -xvf {output_folder}/ar122_metadata_r95.tar.gz --directory {output_folder}

ar122_metadata_r95.tsv


In [9]:
# Load the two tab-separated GTDB metadata tables.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns do not end up with mixed types.
# NOTE(review): the warning this cell emitted is presumably pandas'
# mixed-type DtypeWarning, which this option addresses - confirm on re-run.
df_bac = pd.read_csv(f"{output_folder}/bac120_metadata_r95.tsv", sep="\t", low_memory=False)
df_arch = pd.read_csv(f"{output_folder}/ar122_metadata_r95.tsv", sep="\t", low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [13]:
# Stack the archaeal and bacterial metadata into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept and therefore repeat in the combined frame.
df = pd.concat((df_arch, df_bac), ignore_index=True)

In [14]:
# Give every metadata column a "gf_" prefix. add_prefix returns a new frame
# instead of rewriting the column labels of the existing one in place.
df = df.add_prefix("gf_")

# Build the merge key: the accession without its leading "GB_" / "RS_" tag.
# The pattern is anchored so only a tag at the start of the value is removed,
# and the vectorized .str accessor replaces the per-row Python loop.
df["assembly_accession"] = df["gf_accession"].str.replace(r"^(?:GB_|RS_)", "", regex=True)


In [15]:
# Location of the assembly feature table (CSV) that is read in the next cell.
assembly_features = (
    "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95"
    "/db_features.fixed.csv"
)

In [16]:
# Read the feature table; the first CSV column becomes the frame's index.
df_features = pd.read_csv(filepath_or_buffer=assembly_features, index_col=0)

In [17]:
# Inner join (the pandas default) of the feature table with the GTDB metadata
# on the shared accession column; rows without a match on both sides are
# dropped.
df_merged = df_features.merge(df, how="inner", on="assembly_accession")

In [18]:
# Write the merged feature + metadata table as CSV (index included).
merged_csv_path = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/db_features.fixed.extra.csv"
df_merged.to_csv(merged_csv_path)