In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
import scipy.spatial as sp

import matplotlib
import matplotlib.patches as patches
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
import kaleido
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): later cells assign into filtered frames such as df_unknown and df_human,
# so this hides warnings that would otherwise be raised there.
pd.options.mode.chained_assignment = None


## Load in Metadata

In [None]:
# Location of the metadata table, relative to the notebook's working directory.
metadata_path = '../../data/metadata/mash_scrubbed_species_metadata.csv'

# Alternative input kept for reference (new metadata path for a final run):
# metadata_path = '/media/pekar2/pan_phylon/Enterobacter/metadata/enriched_metadata.csv'

# Every column is read as a generic object so no value is coerced on load;
# the first CSV column becomes the row index.
df_metadata = pd.read_csv(
    metadata_path,
    dtype='object',
    index_col=0,
)

In [None]:
# Show the first rows next to the (rows, columns) shape of the table.
metadata_preview = df_metadata.head()
display(metadata_preview, df_metadata.shape)

## Observe columns to prune

In [None]:
# List every metadata column so the ones that need curation can be picked out.
df_metadata.columns

In [None]:
# Columns of interest for further pruning:
# completion_date, collection_date, geographic_location, host_name, isolation_country, isolation_source

### Host Name

In [None]:
# Any host label that mentions "Homo sapiens" is collapsed to the bare species name.
mentions_human = df_metadata["host_name"].fillna("unknown").str.contains("Homo sapiens")
human_host_index = df_metadata.loc[mentions_human].index
df_metadata.loc[human_host_index, "host_name"] = "Homo sapiens"

# Missing hosts and both spellings of "not applicable" all become "Unknown".
host_names_filled = df_metadata["host_name"].fillna("Unknown")
df_metadata["host_name"] = host_names_filled.replace(["Not Applicable", "Not applicable"], "Unknown")

# Raw host_name labels grouped by the curated host category they are mapped to.
# Matching is by exact, case-sensitive string equality (the lists are used with Series.isin),
# so the spellings below must stay exactly as they occur in the metadata.

# Animal hosts, not including birds or insects; includes a woolly mammoth (Mammuthus primigenius).
# NOTE(review): 'Caenorhabditis elegans' is listed here and again in `insects`; the animal
# mapping is applied first in the next cell, so the `insects` entry can never match.
animals_list = ['Dog, Canis lupus familiaris','Pan troglodytes verus','Bos taurus','Swine','Phyllomedusa distincta','Healthy cat','Mus musculus',
               'Chlrocebus sabaeus','Chelonia mydas','Canis lupus familiaris','Buffalo','Felis catus','Mammuthus primigenius','Papio papio',
               'Healthy dog','Cat','Dog','Caenorhabditis elegans','Goat, Capra hircus','Pig, Sus scrofa','Ailuropoda melanoleuca', 'Dairy cow', 
                'Osteoglossum bicirrhosum', 'Puffer fish']

# Bird hosts.
birds = ['Gallus gallus','Anser indicus','Chicken','duck', 'Gull']

# Insect hosts.
insects = ['Trinervitermes sp.','Blattoidea','Bombyx mori','Microtermes sp.','Helicoverpa armigera','Adaiphrotermes sp.','Promirotermes sp.',
          'Mealworm, Tenebrio molitor','Yellow mealworm, Tenebrio molitor','Mealworm, Zophobas morio','Caenorhabditis elegans','Macrotermes sp.',
          'Macrotermes bellicosus','Cubitermes sp.','Amitermes evuncifer', 'Tenebrio molitor', 'Galleria mellonella']

# Plant hosts. 'Finger millet, Eleusine coracana ' carries a trailing space inside the string.
plants_list = ['Rice, Oryza sativa','Yellow yam, Dioscorea cayenensis','Maize','Nicotiana tabacum','Carpobrotus rossii','kiwifruit','Aloe vera','Sorghum',
              'Maize, Zea mays','Phaseolus vulgaris','Bannana','Date palm, Phoenix dactylifera','Suaeda salsa','halophyte grass',
              'Toxicodendron radicans','Sugarcane, Saccharum officinarum','Finger millet, Eleusine coracana ','Capsicum annuum cv. King Arthur',
              'Morus alba var. atropurpurea','peanut', 'Rice', 'Zea nicaraguensis', 'Oryza sativa', 'Allium sativum', 'Paris polyphylla', 'Zea mays']

# Labels that do not name a usable host; they are folded into "Unknown".
other = ["Unknown", "Environmental surface", "Environment", "Rhizoctonia solani", "Not collected"]

# Human hosts (already normalised to this single spelling earlier in this cell).
humans = ["Homo sapiens"]

In [None]:
# Each entry: (raw host labels, curated isolation_source or None to leave it alone, curated host category).
# The entries are applied in order, exactly one after another.
host_recodings = [
    (animals_list, "Other", "Animal"),
    (insects, "Insect", "Animal"),
    (birds, "Avian", "Animal"),
    (other, None, "Unknown"),
    (humans, None, "Human"),
    (plants_list, None, "Plant"),
]

for raw_hosts, source_label, host_label in host_recodings:
    matches = df_metadata["host_name"].isin(raw_hosts)
    if source_label is not None:
        df_metadata.loc[matches, "isolation_source"] = source_label
    # host_name is rewritten last: the mask above was computed from the raw labels.
    df_metadata.loc[matches, "host_name"] = host_label

In [None]:
# Print every curated host category with its genome count as:  'label',  count
# The counts are computed once instead of being recomputed on every loop iteration.
host_counts = df_metadata["host_name"].value_counts()
for host_label, n_genomes in host_counts.items():
    print(f"'{host_label}',  {n_genomes}")

### Isolation Source

In [None]:
# df_metadata["isolation_source"] = df_metadata["isolation_source"].fillna("Unknown")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(["sputum", "missing; Sputum", "sptum", "Sputum-aspirate"], "Sputum")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(["blood", "blood culture", "Peripheral blood",
#                                                                            "Blood and wound", "Blood culture"], "Blood")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace("urine", "Urine")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace("Uriue", "Urine")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace("missing; Urine", "Urine")
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(["Urinary tract", "urine sample", "Urine/Genitourinary", 
#                                                                            "urine - permanent urinary catheter", "urea", "Catheter Urine"
#                                                                           'Urinary Tract Catheters'], "Urine")



# rectal_source_index = df_metadata[df_metadata["isolation_source"].str.contains("rectal")].index
# df_metadata.loc[rectal_source_index, "isolation_source"] = "Rectal"

# rectal_source_index = df_metadata[df_metadata["isolation_source"].str.contains("Rectal")].index
# df_metadata.loc[rectal_source_index, "isolation_source"] = "Rectal"
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(["perianal skin [UBERON:0012336]", "rectum"], "Rectal")


# wastewater_terms = ["Wastewater influent sample", "Wastewater effluent sample", "Freshwater sample from downstream of wastewater treatment plant",
#                    "hospital sewage", "wastewater", "Freshwater sample from upstream of wastewater treatment plant", "sewage water", "surface water", 
#                    "sink drain", "Stormwater drain", "Drainage", "hospital sewage water", "Medical waste water", "Treated sewage effluent",
#                     "raw sewage", "Sewage water"]
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(wastewater_terms, "Wastewater")

# bodily_fluid_terms = ["Bodily fluid", "bile", "Excreted bodily substance", "excreted bodily substance", "secretion", "pus", "the stomach sample of a gastric cancer patient",
#                      "spinal fluid", "miscellaneous body fluid", "Bile", "Peritoneal drainage fluid", "drainage fluid", "abdominal fluid", "Abdominal surgical drain fluid",
#                      "Central nervous system", "Secretion", "human bile", "prostatic fluid", 'hydrothorax','Biliary fluid','Pus','ascitic fluid']
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(bodily_fluid_terms, "bodily fluid")


# respiratory_terms = ['respiratory tract','respiratory','Respiratory','throat swab','tracheal aspirate','bronchoalveolar lavage fluid','Nasopharynx',
#                     'subgingival plaque','Bronchoalveolar lavage','phlegm','Broncho-alveolar lavage','endotracheal tube','Bronchioles',
#                     'bronchoalveolar lavage','Bronchial','mouth swab','Transtracheal aspirate','tongue','missing; Bronchial lavage','throat',
#                     'epithelium of nasopharynx [UBERON:0001951]']
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(respiratory_terms, "respiratory")

# fecal_terms = ['feces','Stool','faecal','stool','fecal sample','human feces','feces extracted directly from colon']
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(fecal_terms, "fecal")

# environmental_terms = ['wood','metal','soil','environmental','tailing mud',
#                        'freshwater lake anaerobic enchiment culture with carbon source citrate','Creek Sediment','Urban soil',
#                        'environmental swab veterinary clinic','ISS environmental surface','plastic','metal/plastic','lake water',
#                       'zoo environmental surface','contaminated soil','well water','environmental surface of zoo','river water','Zoological Institution',
#                       'EX-LANDFILL SITE','acid mine decant and tailings from uranium mine','glyphosate polluted soil','soil polluted with engine oil',
#                       'crude oil-contaminated soil','Irrigation well water','River','Eutrophic lake']
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(environmental_terms, "environmental")

# clinical_wound = ['patients and hospital environment','Surgical wound','wound','tissues','Wound','clinical sample','Intraperitoneal','Indwelling catheter',
#                  'hospital','Wound secretion','clinical material','Catheter','hospitals','catheter tip','clinical situations','wound swab','puncture fluid',
#                  'intravenous catheter','Hospital environment','shunt fluid','wound abdomin','Decubitus swab','sacral ulcer',
#                  'sample from patient in hospital emergency room','Deep venous catheter','surgical drain','surgical procedure specimen',
#                  'bacterial sepsis female patient','swab from a hand-washing sink as part of the hospital routine surveillance program','abscesses',
#                  'abdominal dropsy','human clinical specimens, and E','clinical','Burn wound','clinical isolate']
# df_metadata["isolation_source"] = df_metadata["isolation_source"].replace(clinical_wound, "clinical/wound")


# # continue with finding more synonyms

In [None]:
# Genomes without a recorded isolation source are labelled "Unknown" so the
# string-based grouping in the following cells never meets a missing value.
df_metadata["isolation_source"] = df_metadata["isolation_source"].fillna("Unknown")

In [None]:
# Isolation sources currently recorded for genomes whose host category is "Animal".
df_metadata[df_metadata.host_name == "Animal"].isolation_source.value_counts()

In [None]:
# Isolation sources currently recorded for genomes whose host category is "Plant".
df_metadata[df_metadata.host_name == "Plant"].isolation_source.value_counts()

In [None]:
# Plant-hosted genomes only; edited on a copy and merged back at the end.
df_plant = df_metadata[df_metadata.host_name == "Plant"].copy()

# Raw labels describing root or soil material, plus the curated label itself so
# rows that already read "Soil/Root" keep that value.
soil_root_labels = ["root", "root_tubers", "rhizospheric soil", "rhizosphere", "rhizosphere soil", "soil",
                    'glyphosate polluted soil', "Soil/Root"]
is_soil_root = df_plant["isolation_source"].isin(soil_root_labels)

# Two-way split: root/soil material versus everything else.
df_plant.loc[is_soil_root, "isolation_source"] = "Soil/Root"
df_plant.loc[~is_soil_root, "isolation_source"] = "Other"

# Write the curated plant rows back into the main table (aligned on the index).
df_metadata.update(df_plant)

In [None]:
# Print every isolation source seen among "Unknown"-host genomes as:  'label',  count
# The filter and the counts are computed once rather than once per printed line.
unknown_host_source_counts = df_metadata.loc[df_metadata["host_name"] == "Unknown", "isolation_source"].value_counts()
for source_label, n_genomes in unknown_host_source_counts.items():
    print(f"'{source_label}',  {n_genomes}")

In [None]:
# Raw isolation_source labels (exact, case-sensitive matches) used to sort genomes with an
# "Unknown" host into environmental sub-categories. The next cell applies the lists in the
# order environmental -> wastewater -> soil -> food, so a label that appears in more than
# one list is claimed by the first list that contains it.

# Generic environmental sources.
# NOTE(review): 'natural water sources by community and farm' is listed twice (harmless for isin).
# NOTE(review): 'crude oil-contaminated soil' and 'Tomato roots' also appear in `soil`; because this
# list is applied first, those `soil` entries never match.
environmental = ['metal','wood','environmental swab veterinary clinic','environmental','plastic','metal/plastic','freshwater stream','animal farm',
                'freshwater lake anaerobic enchiment culture with carbon source citrate', 'zoo environmental surface','well water',
                'environmental surface of zoo','air', 'cucumber rhizosphere', 'activated sludge','Loktak Lake','EX-LANDFILL SITE',
                 'acid mine decant and tailings from uranium mine','Freshwater pond','industrial zone','Zoological Institution','Irrigation well water', 
                'leaves of a local plant near the Kharagpur railway station','Tomato roots', 'Decaying Wood', 'crude oil-contaminated soil',
                'natural water sources close by farm or community', 'atmospheric cloud water','brown patch in grass','Rice shoot','oil reservoir',
                'Pooled sheep faecal samples collected from floor of farm','natural water sources by community and farm',
                'natural water sources by community and farm','River water','Chicken feed','Chicken processing plant','Environmental Samples 1',
                'farmer market','tobacco leaves','Pooled sediment sample collected from floor of pig farm']

# Water and wastewater sources.
# NOTE(review): 'urine' and 'ISS environmental surface' are grouped here as well — confirm this is intended.
wastewater = ['Wastewater influent sample','Wastewater effluent sample','Freshwater sample from downstream of wastewater treatment plant',
             'hospital sewage','urine','Water','Freshwater sample from upstream of wastewater treatment plant', 'Coastal Water','wastewater',
             'ISS environmental surface','water','sewage water','surface water','Treated sewage effluent','sink drain',
             'Stormwater drain','lake water','Drainage','raw sewage','river water','Sewage water','River','Eutrophic lake','hospital sewage water',
             'Medical waste water', 'Hospital sewage']

# Soil, sediment and rhizosphere sources.
# NOTE(review): 'Baby spinach leaves' and 'lettuce' are grouped here rather than under `food` — confirm.
soil = ['soil','Urban soil','tailing mud','Baby spinach leaves','Creek Sediment','contaminated soil','lettuce','soil polluted with engine oil', 
       'crude oil-contaminated soil', 'Tomato roots','paddy soil', 'Pooled soil sample collected from floor of poultry farm', 
       'Pooled sediment sample collected from floor of poultry farm','Pooled sediment sample collected from floor of cattle farm','rhizosphere',
       'marine sediment','Lycium barbarum rhizosphere soil','agricultural soil','oil-contaminated soil','root rhizomes']

# Food and fresh-produce sources.
food = ['food','Vegetable','ghanaian yam','Long Beans','Spinach','ginger','Cilantro','cabbage','beef burger','Cucumber Fermentation','vegetable',
       'pickled radish','Imported Fresh Produce','Imported Fresh Produce 3','Imported Fresh Produce 2']

In [None]:
# Genomes with an "Unknown" host are re-labelled from their isolation source.
# Work on an explicit copy (as the plant cell does) so the edits below never rely on
# pandas' chained-assignment warning being switched off.
df_unknown = df_metadata[df_metadata.host_name == "Unknown"].copy()

# (raw isolation_source labels, curated isolation_source); applied strictly in this order,
# so a label present in several lists is claimed by the first one.
unknown_source_groups = [
    (environmental, "Other"),
    (wastewater, "Wastewater"),
    (soil, "Soil"),
    (food, "Food"),
]
for raw_sources, curated_source in unknown_source_groups:
    matches = df_unknown["isolation_source"].isin(raw_sources)
    # Every recognised source turns the host category into "Environmental".
    df_unknown.loc[matches, "host_name"] = "Environmental"
    df_unknown.loc[matches, "isolation_source"] = curated_source

# Curated labels produced above; they must survive the catch-all below.
new_terms = ["Wastewater", "Soil", "Food", "Other"]

# Anything not recognised keeps host "Unknown" and gets an explicit catch-all source.
recognised = df_unknown["isolation_source"].isin(food + soil + wastewater + environmental + new_terms)
df_unknown.loc[~recognised, "isolation_source"] = "Unknown/Unclear"

# Write the curated rows back into the main table (aligned on the index).
df_metadata.update(df_unknown)

In [None]:
# Print every isolation source seen among "Human"-host genomes as:  'label',  count
# The filter and the counts are computed once rather than once per printed line.
human_source_counts = df_metadata.loc[df_metadata["host_name"] == "Human", "isolation_source"].value_counts()
for source_label, n_genomes in human_source_counts.items():
    print(f"'{source_label}',  {n_genomes}")

In [None]:
# Raw isolation_source labels (exact, case-sensitive matches) used to group human-hosted
# genomes into curated source categories. Misspellings such as "sptum", "Uriue" or "Would"
# are kept as written; presumably they mirror the spellings found in the metadata — verify
# before "correcting" any of them.

# Body fluids, including sputum.
# NOTE(review): 'ascitic fluid' is listed twice (harmless for isin).
bodily_fluids = ["Bodily fluid", "bile", "Excreted bodily substance", "excreted bodily substance", "secretion", "pus", 
                 "the stomach sample of a gastric cancer patient", "spinal fluid", "miscellaneous body fluid", "Bile",
                 "Peritoneal drainage fluid", "drainage fluid", "abdominal fluid", "Abdominal surgical drain fluid",
                "Central nervous system", "Secretion", "human bile", "prostatic fluid", 'hydrothorax','Biliary fluid','Pus','ascitic fluid',
                "sputum", "missing; Sputum", "sptum", "Sputum-aspirate", 'ascitic fluid', 'Sputum','bodily fluid','abdomen',
                'sputum expectorated','ascites','Fluid','cerebrospinal fluid']

# Urine and urinary-tract samples.
# NOTE(review): lowercase 'catheter' is grouped here while 'Catheter' is in `clinical_wound`.
urine = ["Urinary tract", "urine sample", "Urine/Genitourinary", "urine - permanent urinary catheter", "urea", "Catheter Urine",
         'Urinary Tract Catheters', "urine", "Urine", "Uriue", "missing; Urine",'Urinary','catheter', 'urine cc', 'urine; catheter',
        'Urinary tract infection', 'urine clean catch', 'urine, clean catch']

# Blood samples.
blood = ["Blood", "blood", "blood culture", "Peripheral blood", "Blood and wound", "Blood culture",'Cardiovascular']

# Rectal swabs and faecal samples.
rectal_fecal = ['rectal swab','rectal','Stool','stool','faecal','Rectal swab','feces','Rectal Swab','rectal swab from female',
               'rectal swab of pregnant women','rectum',"perianal skin [UBERON:0012336]",'fecal material [ENVO:00002003]','Perirectal',
               'Feces','missing; Perirectal abscess', 'rectal screen', 'stool sample', 'Rectal carriage']

# Respiratory-tract and oral samples.
respiratory_terms = ['respiratory tract','respiratory','Respiratory','throat swab','tracheal aspirate','bronchoalveolar lavage fluid','Nasopharynx',
                    'subgingival plaque','Bronchoalveolar lavage','phlegm','Broncho-alveolar lavage','endotracheal tube','Bronchioles',
                    'bronchoalveolar lavage','Bronchial','mouth swab','Transtracheal aspirate','tongue','missing; Bronchial lavage','throat',
                    'epithelium of nasopharynx [UBERON:0001951]', 'expectorate', 'bronchial','Tracheal Aspirate/Wash','pleural effusion',
                    'Bronchoalvelar lavage (BAL)', 'bronchoalveolar lavage from kidney transplant patient', 'bronchial alveolar lavage fluid',
                    'BAL']

# Wounds, devices and generic clinical / hospital samples.
clinical_wound = ['patients and hospital environment','Surgical wound','wound','tissues','Wound','clinical sample','Intraperitoneal','Indwelling catheter',
                 'hospital','Wound secretion','clinical material','Catheter','hospitals','catheter tip','clinical situations','wound swab','puncture fluid',
                 'intravenous catheter','Hospital environment','shunt fluid','wound abdomin','Decubitus swab','sacral ulcer',
                 'sample from patient in hospital emergency room','Deep venous catheter','surgical drain','surgical procedure specimen',
                 'bacterial sepsis female patient','swab from a hand-washing sink as part of the hospital routine surveillance program','abscesses',
                 'abdominal dropsy','human clinical specimens, and E','clinical','Burn wound','clinical isolate', 'Patient','decubitus swab',
                 'Soft tissue biopsy',"Neoplasm", 'screening swab','hip prothesis biopsy','decubitis wound fluid','Would','Leg pus']

In [None]:
# Human-hosted genomes: collapse the raw isolation sources into curated categories.
# Work on an explicit copy (as the plant cell does) so the edits below never rely on
# pandas' chained-assignment warning being switched off.
df_human = df_metadata[df_metadata.host_name == "Human"].copy()

# (raw isolation_source labels, curated isolation_source); applied strictly in this order.
human_source_groups = [
    (bodily_fluids, "Bodily Fluids"),
    (urine, "Urine"),
    (blood, "Blood"),
    (rectal_fecal, "Rectal/Fecal"),
    (respiratory_terms, "Respiratory"),
    (clinical_wound, "Clinical/Wound"),
]
for raw_sources, curated_source in human_source_groups:
    df_human.loc[df_human["isolation_source"].isin(raw_sources), "isolation_source"] = curated_source

# Human genomes without a recorded source stay "Human" but get an explicit catch-all source.
no_source = df_human["isolation_source"].isin(["Unknown"])
df_human.loc[no_source, "host_name"] = "Human" # consider changing back to unknown
df_human.loc[no_source, "isolation_source"] = "Unknown/Unclear"

# Curated labels produced above; they must survive the catch-all below.
new_terms = ["Bodily Fluids","Urine","Blood","Rectal/Fecal","Respiratory","Clinical/Wound", "Unknown/Unclear"]

# Every remaining, unrecognised source is grouped as "Other".
recognised = df_human["isolation_source"].isin(bodily_fluids + urine + blood + rectal_fecal +
                                               respiratory_terms + clinical_wound + new_terms)
df_human.loc[~recognised, "isolation_source"] = "Other"

# Write the curated rows back into the main table (aligned on the index).
df_metadata.update(df_human)

In [None]:
# Print every isolation source with its genome count as:  'label',  count
# The counts are computed once instead of being recomputed on every loop iteration.
source_counts = df_metadata["isolation_source"].value_counts()
for source_label, n_genomes in source_counts.items():
    print(f"'{source_label}',  {n_genomes}")

### Generate Plot For Source

In [None]:
# Only genomes whose genome_status is exactly 'Complete' go into the source plot.
df_graph = df_metadata[df_metadata.genome_status == 'Complete']

In [None]:
# Number of complete genomes per curated host category.
df_graph.host_name.value_counts()

In [None]:
# Number of distinct (host_name, isolation_source) combinations among the complete genomes.
len(df_graph.groupby("host_name")["isolation_source"].value_counts())

In [None]:
# Long-format table for the sunburst: one row per (host_name, isolation_source) pair
# with its genome count. The count column is named explicitly: only pandas >= 2.0 calls
# it "count" on its own, and older versions fail in reset_index() because the series
# carries the same name as an index level.
df_graph2 = (
    df_graph.groupby("host_name")["isolation_source"]
    .value_counts()
    .rename("count")
    .reset_index()
)
# Single blank root label so the sunburst has one (unlabelled) centre.
df_graph2["center"] = " "
df_graph2.head()

In [None]:
# Per-genome statistics for all genomes; missing values become 0 so the counts can be parsed.
df_genome_stats = df_metadata[["genome_name", "genome_id","chromosomes", "plasmids", "genome_length","genome_status", 'gc_content']].fillna(0)

# df_metadata is read with dtype='object', so the counts may be strings. Parse them before
# comparing: a plain `== 0.0` on the raw column would miss a string "0".
chromosome_counts = pd.to_numeric(df_genome_stats["chromosomes"])
plasmid_counts = pd.to_numeric(df_genome_stats["plasmids"])

# Complete genomes recorded with zero chromosomes are set to one chromosome.
# A single combined mask replaces the chained df[mask][mask] lookup.
is_zero_chrom = (df_genome_stats["genome_status"] == "Complete") & (chromosome_counts == 0)
zero_chrom = df_genome_stats.index[is_zero_chrom.to_numpy()]
df_genome_stats.loc[is_zero_chrom, "chromosomes"] = 1
chromosome_counts = chromosome_counts.mask(is_zero_chrom, 1)

# Chromosomes plus plasmids, each truncated to an integer first.
df_genome_stats["genomic_elements"] = chromosome_counts.astype(int) + plasmid_counts.astype(int)

# Species = first two whitespace-separated words of the genome name; names with fewer
# than two words are kept as they are instead of raising.
df_genome_stats["species"] = df_genome_stats["genome_name"].map(lambda name: " ".join(str(name).split()[:2]))

df_genome_stats["genome_length"] = pd.to_numeric(df_genome_stats["genome_length"])
df_genome_stats["gc_content"] = pd.to_numeric(df_genome_stats["gc_content"])

cloacae = ["Enterobacter cloacae", 'Enterobacter asburiae', 'Enterobacter hormaechei',
           "Enterobacter kobei", "Enterobacter ludwigii", "Enterobacter nimipressuralis"] # check which species are officially a part of the complex
df_genome_stats["group"] = "Other"
df_genome_stats.loc[df_genome_stats["species"].isin(cloacae), "group"] = "Cloacae Complex"

# Host/source breakdown restricted to genomes whose species name contains "hormaechei".
hormaechei_ids = df_genome_stats.loc[df_genome_stats["species"].str.contains('hormaechei'), "genome_id"]
df_graph3 = df_metadata[df_metadata.genome_id.isin(hormaechei_ids)]
# The count column is named explicitly so this works on pandas versions before and after 2.0.
df_graph3 = (
    df_graph3.groupby("host_name")["isolation_source"]
    .value_counts()
    .rename("count")
    .reset_index()
)
# Single blank root label so the sunburst has one (unlabelled) centre.
df_graph3["center"] = " "
df_graph3.head()

In [None]:
# Palette for the host-category ring of the sunburst.
custom_colors = ['#1f77b4', '#ff7f0e', '#d62728', '#9467bd','#2ca02c']

# Host -> isolation source sunburst for the hormaechei genomes.
# The palette is a list, so it is passed as color_discrete_sequence (as the species sunburst
# later in this notebook does); color_discrete_map expects a dict of value -> colour.
fig = px.sunburst(df_graph3, path=["center", 'host_name', 'isolation_source'], values="count", width=1000, height=1000,
                  color_discrete_sequence=custom_colors)
fig.update_traces(textinfo='label+value', textfont_size=50)
fig.show()


In [None]:
# Palette for the host-category ring of the sunburst.
custom_colors = ['#1f77b4', '#ff7f0e', '#d62728', '#9467bd','#2ca02c']

# Host -> isolation source sunburst for all complete genomes.
# The palette is a list, so it is passed as color_discrete_sequence (as the species sunburst
# later in this notebook does); color_discrete_map expects a dict of value -> colour.
fig = px.sunburst(df_graph2, path=["center", 'host_name', 'isolation_source'], values="count", width=1000, height=1000,
                  color_discrete_sequence=custom_colors)
fig.update_traces(textinfo='label+value', textfont_size=20)
fig.show()

# Make sure the output directory exists before exporting.
os.makedirs("images", exist_ok=True)
fig.write_image("images/source_pie.svg")

### Completion date

In [None]:
# Parse the completion_date strings into datetimes (raises on values that cannot be parsed).
date = pd.to_datetime((df_metadata["completion_date"]))

In [None]:
# Running total of genomes by the year in which they were completed.
genomes_per_year = date.groupby(date.dt.year).count()
cum_count = genomes_per_year.cumsum()

ax = plt.gca()
ax.plot(cum_count.index, cum_count)
ax.set(
    xlabel='Date Completed',
    ylabel='Cumulative Count of Genomes',
    title='Cumulative Count of Genomes Over Time',
    xlim=(2005, 2023),
)
plt.show()


### Collection Date

In [None]:
# Manual correction of the collection date stored under row label 8
# (presumably a malformed entry in the source metadata — TODO confirm).
# Assigning through .loc with a label that is not in the index would silently append
# a new, otherwise empty row, so the fix is only applied when the label exists.
if 8 in df_metadata.index:
    df_metadata.loc[8, "collection_date"] = '2020-02'
else:
    print("Row label 8 not found in df_metadata; collection_date correction skipped.")

In [None]:
# Drop rows explicitly marked as not applicable, then keep only the first four
# characters of each remaining value (the year for year-first date strings).
is_not_applicable = df_metadata["collection_date"].isin(["not applicable", "Not applicable"])
collection_date = df_metadata.loc[~is_not_applicable, "collection_date"].map(lambda raw: str(raw)[:4])
collection_date

In [None]:
# Running total of genomes by the year in which the sample was collected.
# Values that are not a parseable year become NaT and are left out of the counts.
date_coll = pd.to_datetime(collection_date, errors='coerce')

# Count the collection dates themselves. Grouping the completion dates (`date`) by
# collection year, as before, mixed two differently indexed series and counted
# non-missing completion dates instead of collected samples.
cum_count = date_coll.groupby(date_coll.dt.year).count().cumsum()

plt.plot(cum_count.index, cum_count)
plt.xlabel('Collection Year')
plt.ylabel('Cumulative Count of Genomes')
plt.title('Cumulative Count of Genomes by Collection Year')
plt.xlim([2000,2023])
plt.show()

### Geographic Location

In [None]:
# Genomes without a recorded geographic location are labelled "Unknown".
df_metadata["geographic_location"] = df_metadata["geographic_location"].fillna("Unknown")

In [None]:
# Print every geographic location with its genome count as:  label: count
# The counts are computed once instead of being recomputed on every loop iteration.
location_counts = df_metadata["geographic_location"].value_counts()
for location, n_genomes in location_counts.items():
    print(f"{location}: {n_genomes}")

### Country of Origin

In [None]:
# Genomes without a recorded isolation country are labelled "Unknown".
df_metadata["isolation_country"] = df_metadata["isolation_country"].fillna("Unknown")

In [None]:
# Print every isolation country with its genome count as:  label: count
# The counts are computed once instead of being recomputed on every loop iteration.
country_counts = df_metadata["isolation_country"].value_counts()
for country, n_genomes in country_counts.items():
    print(f"{country}: {n_genomes}")

## Work on scatterplot for figure 1

In [None]:
# Preview the columns used for the genome-statistics figures, then report the row count.
genome_stat_preview = df_metadata[["chromosomes", "plasmids", "genome_length","genome_status", 'gc_content']].head()
display(genome_stat_preview)

n_rows = df_metadata[["chromosomes", "genome_length"]].shape[0]
display(f"Len: {n_rows}")

In [None]:
# Per-genome statistics; missing values become 0 so the counts can be parsed.
df_genome_stats = df_metadata[["genome_name", "genome_id","chromosomes", "plasmids", "genome_length","genome_status", 'gc_content']].fillna(0)

# df_metadata is read with dtype='object', so the counts may be strings. Parse them before
# comparing: a plain `== 0.0` on the raw column would miss a string "0".
chromosome_counts = pd.to_numeric(df_genome_stats["chromosomes"])
plasmid_counts = pd.to_numeric(df_genome_stats["plasmids"])

# Complete genomes recorded with zero chromosomes are set to one chromosome.
# A single combined mask replaces the chained df[mask][mask] lookup.
is_zero_chrom = (df_genome_stats["genome_status"] == "Complete") & (chromosome_counts == 0)
zero_chrom = df_genome_stats.index[is_zero_chrom.to_numpy()]
df_genome_stats.loc[is_zero_chrom, "chromosomes"] = 1
chromosome_counts = chromosome_counts.mask(is_zero_chrom, 1)

# Chromosomes plus plasmids, each truncated to an integer first.
df_genome_stats["genomic_elements"] = chromosome_counts.astype(int) + plasmid_counts.astype(int)

# Species = first two whitespace-separated words of the genome name; names with fewer
# than two words are kept as they are instead of raising.
df_genome_stats["species"] = df_genome_stats["genome_name"].map(lambda name: " ".join(str(name).split()[:2]))

df_genome_stats["genome_length"] = pd.to_numeric(df_genome_stats["genome_length"])
df_genome_stats["gc_content"] = pd.to_numeric(df_genome_stats["gc_content"])

# Keep complete sequences only; copy so the column added below is written to an independent frame.
df_genome_stats = df_genome_stats[df_genome_stats.genome_status == 'Complete'].copy()

cloacae = ["Enterobacter cloacae", 'Enterobacter asburiae', 'Enterobacter hormaechei',
           "Enterobacter kobei", "Enterobacter ludwigii", "Enterobacter nimipressuralis"] # check which species are officially a part of the complex
df_genome_stats["group"] = "Other"
df_genome_stats.loc[df_genome_stats["species"].isin(cloacae), "group"] = "Cloacae Complex"

df_genome_stats

In [None]:
# Fixed palette of 21 colours; species are paired with it in order of first appearance.
# zip() stops at the shorter input, so any species beyond the 21st gets no entry in the map.
species_palette = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',  # blue, orange, green, red, purple
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',  # brown, pink, gray, yellow-green, cyan
    '#aec7e8', '#ffbb78', '#98df8a', '#ff9896', '#c5b0d5',  # light blue, orange, green, red, purple
    '#c49c94', '#f7b6d2', '#c7c7c7', '#dbdb8d', '#9edae5',  # light brown, pink, gray, yellow-green, cyan
    '#ff5733',                                              # coral
]

# species name -> colour; shared by all species-coloured figures below.
clr = dict(zip(df_genome_stats.species.unique(), species_palette))
custom_colors = clr
custom_colors

In [None]:
# Genome length against number of genomic elements (chromosomes + plasmids), complete genomes only.
# plotly.express is already imported as `px` in the notebook's import cell, so it is not re-imported here.
fig = px.scatter(
    df_genome_stats[df_genome_stats.genome_status == 'Complete'],
    x="genomic_elements",
    y="genome_length",
    color="species",
    color_discrete_map=custom_colors,
    width=1000,
    height=600,
)
fig.show()

# Make sure the output directory exists before exporting.
os.makedirs("images", exist_ok=True)
fig.write_image("images/elements_scatterplot.svg")

In [None]:
# Distribution of the number of genomic elements per genome, stacked by species (complete genomes only).
fig = px.histogram(
    df_genome_stats[df_genome_stats.genome_status == 'Complete'],
    x="genomic_elements",
    color="species",
    color_discrete_map=custom_colors,
    width=800,
    height=300,
)
fig.update_layout(showlegend=False)
fig.show()

# Make sure the output directory exists before exporting.
os.makedirs("images", exist_ok=True)
fig.write_image("images/elements_barplot.svg")

In [None]:
# Genome length per species as box plots (complete genomes only).
fig = px.box(
    df_genome_stats[df_genome_stats.genome_status == 'Complete'],
    x="species",
    y="genome_length",
    color='species',
    color_discrete_map=custom_colors,
    width=400,
    height=600,
)
# Hide the x tick labels, grid and legend; keep a white y grid.
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    showlegend=False,
)

fig.show()

# Make sure the output directory exists before exporting.
os.makedirs("images", exist_ok=True)
fig.write_image("images/length_boxplot.svg")

In [None]:
# GC content against genome length, coloured by species (legend hidden).
fig = px.scatter(
    df_genome_stats,
    x="genome_length",
    y="gc_content",
    color="species",
    color_discrete_map=custom_colors,
    width=800,
    height=400,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Re-check the column set after curation.
df_metadata.columns

In [None]:
# Preview the curated table.
df_metadata.head()

In [None]:
# Look up genomes whose host is the woolly mammoth.
# NOTE(review): 'Mammuthus primigenius' is part of animals_list, and the host-recoding cell
# rewrites those hosts to "Animal", so after a top-to-bottom run this selection is empty.
df_metadata[df_metadata.host_name == 'Mammuthus primigenius']

In [None]:
# Inspect the metadata row(s) of one specific genome; genome_id is compared as a string.
df_metadata[df_metadata.genome_id == '158836.123']

In [None]:
# Save the curated metadata (index included) to the notebook's working directory.
df_metadata.to_csv('curated_metadata.csv')

## Species analysis

In [None]:
# Display the genome-statistics table built for the figure-1 plots.
df_genome_stats

In [None]:
# Number of genomes per species.
df_genome_stats.species.value_counts()

In [None]:
# Number of genomes per species within each group ("Cloacae Complex" / "Other").
pd.DataFrame(df_genome_stats.groupby('group')['species'].value_counts())

In [None]:
# Palette for the group ring of the sunburst.
custom_colors_pie = [ '#d62728','#1f77b4', '#ff7f0e', '#9467bd','#2ca02c']

# One row per (group, species) with its genome count. The count column is named explicitly:
# only pandas >= 2.0 calls it "count" on its own, and older versions fail in reset_index()
# because the series carries the same name as an index level.
species_by_group = (
    df_genome_stats.groupby('group')['species']
    .value_counts()
    .rename("count")
    .reset_index()
)

fig = px.sunburst(species_by_group,
                  path=["group", 'species'], values="count", width=1000, height=1000,
                  color_discrete_sequence=custom_colors_pie)
fig.update_traces(textinfo='label+value', textfont_size=20)
fig.update_layout(
    uniformtext=dict(minsize=10),
    margin=dict(t=50, l=25, r=25, b=25)
)
fig.show()

# Make sure the output directory exists before exporting.
os.makedirs("images", exist_ok=True)
fig.write_image("images/species_distribution.svg")

In [None]:
# Treemap with one tile per species, sized by the number of genomes.
fig = px.treemap(
    df_genome_stats,
    path=["species"],
    color='species',
    color_discrete_map=custom_colors,
    width=1000,
    height=1400,
)
fig.update_traces(textinfo='label+value', textfont_size=25)
fig.update_layout(
    uniformtext={"minsize": 10},
    margin={"t": 50, "l": 25, "r": 25, "b": 25},
)
fig.show()
# fig.write_image("images/treemap.svg")

In [None]:
# Display the genome-statistics table again for reference.
df_genome_stats

In [None]:
# Hard-coded genome counts per species epithet, in the order the rows are laid out.
species_row_counts = {
    'hormaechei': 3600,
    'cloacae': 2277,
    'bugandensis': 247,
    'kobei': 372,
    'ludwigii': 227,
    'roggenkampii': 534,
    'asburiae': 592,
    'cancerogenous': 34,
    'mori': 63,
}

# One row per genome with a single 'species' column.
df_genome_stats_test = pd.DataFrame(
    [{'species': epithet} for epithet, n_rows in species_row_counts.items() for _ in range(n_rows)]
)

In [None]:
# Treemap of the hard-coded species counts.
# custom_colors is keyed by the two-word species name (e.g. "Enterobacter hormaechei"), while
# this frame holds only the epithet, so the map is re-keyed by the last word of each name;
# otherwise no tile would ever match an entry. Epithets without an entry keep plotly's default colours.
epithet_colors = {
    str(species_name).split()[-1]: color
    for species_name, color in custom_colors.items()
    if str(species_name).split()
}

fig = px.treemap(df_genome_stats_test, path=["species"], width=1400, height=1400, color='species',
                 color_discrete_map=epithet_colors)
fig.update_traces(textinfo='label+value', textfont_size=50)

fig.update_layout(
    uniformtext=dict(minsize=10),
    margin=dict(t=50, l=25, r=25, b=25)
)
fig.show()
# fig.write_image("images/treemap.svg")