In [56]:
import pandas as pd
import numpy as np
from pyfish import fish_plot, process_data, setup_figure

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('ticks', rc={"axes.facecolor": (0, 0, 0, 0)})
sns.set_context('talk')

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

In [57]:
ec_sra_path = "../../data/ec_filtered_d_sra.tsv"
kp_sra_path = "../../data/kp_filtered_d_sra.tsv"
pa_sra_path = "../../data/pa_filtered_d_sra.tsv"
mhh_path = "../../data/StudyDataGalardini_pseudonymized_v1_SENT_2022-11-17.xlsx"
cph_path = "../../data/metadata_marco_04082023.xlsx"
cph_key_path = "../../data/material_key.xlsx"
bs_path = "../../data/KBS_strains+TC_Stand30052023new.xlsx"

In [90]:
ec_sra = pd.read_csv(ec_sra_path, sep="\t")
kp_sra = pd.read_csv(kp_sra_path, sep="\t")
pa_sra = pd.read_csv(pa_sra_path, sep="\t")
mhh = pd.read_excel(mhh_path)
cph = pd.read_excel(cph_path)
cph_k = pd.read_excel(cph_key_path)
bs = pd.read_excel(bs_path)

In [91]:
# get sra df
# drop columns with missing values or useless
ec_sra.drop(columns=["Month", "BioSampleModel", "geo_loc_name", "Strain", "Library Name", "version", 
                      "LibrarySelection", "Organism", "Bytes", "AvgSpotLen", "Bases", "DATASTORE filetype",
                      "DATASTORE provider", "DATASTORE region", "Experiment", "LibrarySelection", "ReleaseDate",
                      "Sample Name", "create_date", "Date"], inplace=True)
kp_sra.drop(columns=["Month", "Library Name", "version", "LibrarySelection", "Organism", "Bytes", "AvgSpotLen", 
                     "Bases", "DATASTORE filetype", "DATASTORE provider", "DATASTORE region", "Experiment", 
                     "LibrarySelection", "ReleaseDate", "Sample Name", "create_date", "Date"], inplace=True)
pa_sra.drop(columns=["Month", "BioSampleModel", "geo_loc_name", "Library Name", "version", 
                      "LibrarySelection", "Organism", "Bytes", "AvgSpotLen", "Bases", "DATASTORE filetype",
                      "DATASTORE provider", "DATASTORE region", "Experiment", "LibrarySelection", "ReleaseDate",
                      "Sample Name", "create_date", "BioSampleModel", "Date"], inplace=True)

# drop rows with assay type == ['WGA', 'CLONE', 'FINISHING', 'WCS']
assay_type = ['WGA', 'CLONE', 'FINISHING', 'WCS']
ec_sra = ec_sra[~ec_sra['Assay Type'].isin(assay_type)]
kp_sra = kp_sra[~kp_sra['Assay Type'].isin(assay_type)]
pa_sra = pa_sra[~pa_sra['Assay Type'].isin(assay_type)]

# drop rows with platform == OXFORD_NANOPORE
platform = ['OXFORD_NANOPORE']
ec_sra = ec_sra[~ec_sra['Platform'].isin(platform)]
kp_sra = kp_sra[~kp_sra['Platform'].isin(platform)]
pa_sra = pa_sra[~pa_sra['Platform'].isin(platform)]

# drop rows with LibraryLayout == SINGLE
layout=['SINGLE']
ec_sra = ec_sra[~ec_sra['LibraryLayout'].isin(layout)]
kp_sra = kp_sra[~kp_sra['LibraryLayout'].isin(layout)]
pa_sra = pa_sra[~pa_sra['LibraryLayout'].isin(layout)]

# drop rows with LibrarySource == ['OTHER', 'METAGENOMIC', 'GENOMIC SINGLE CELL']
l_source = ['OTHER', 'METAGENOMIC', 'GENOMIC SINGLE CELL']
ec_sra = ec_sra[~ec_sra['LibrarySource'].isin(l_source)]
kp_sra = kp_sra[~kp_sra['LibrarySource'].isin(l_source)]
pa_sra = pa_sra[~pa_sra['LibrarySource'].isin(l_source)]

# look at Isolation_source column - might be not human
human_source = ["stool", "feces", "bile", "urine", "Rectal Swab", "liver", "Femur", "bone marrow", "spleen", "human",
                "Peritoneum", "Pericardia", "sputum", "fecal sample", "Rectal swab", "bloodstream", "Rectal abscess",
                "blood", "Bile Duct Fluid", "Wound culture", "Rectal Specimen", "recto anal mucosal swab", "Patient Blood",
                "Patient Urine", "Patient Sputum", "Patient hydatid fluid", "Patient Drainage", "Pus", "clinical sample",
                "respiratory", "wound", "blood culture", "E. coli K-12 from a stool sample of a diphtheria patient in Palo Alto\, CA in 1922",
                "clinical", "oral", "puncture fluid", "ascitic fluid", "intestine", "throat", "wound swab", "endocervical",
                "cerebrospinal fluid", "Abscess", "Peritoneal fluid", "CSF", "Ascitic fluid", "Abscess fluid", "rectal faeces",
                "rectal swab", "CF Sputum", "rectal swap", "Necrotic tissue", "Blood culture", "Human\, Stool", "Human\, Blood culture",
                "pus", "fluid", "Fluid", "lung", "bladder epithelial biopsy from 60-year-old female patient with recurrent urinary tract infection",
                "bladder epithelial biopsy from 30-year-old female patient with recurrent urinary tract infection",
                "bladder epithelial biopsy from 50-year-old female patient with recurrent urinary tract infection", "tissue",
                "swab", "cecal content", "Skin swab", "Human gut", "pancreas", "ear", "peritoneal fluid", "vagina", "skin",
                "stool sample", "lung swab", "uterine swab", "sinus swab", "urine swab", "soft tissue", "kidney", "joint swab",
                "oral swab", "eye", "endometrial swab", "pharyngeal tissue", "abdomen", "respiratory tract", "Anal gland",
                "Right Ear Swab", "Foot Swab", "Prostrate", "Wound/Abscess", "rectum", "Faecal samples", "Abdominal wound discharge",
                "Secretion", "Catheter", "Human Sample", "Faeces or rectal swab", "Umbilical swab", "Wound Swab", "Pooled: Liver\, Peritoneum\, Abscess",
                "nasal swab", "Uterine swab", "fecal microbiome", "heart swab", "liver swab", "vaginal swab", "uterus", "joint fluid",
                "pericardium", "umbilicus", "lungs", "outbreaks of food poisoning", "diarrhea from outbreaks of food poisoning",
                "Uterus", "semen", "nasal cavity", "rectal feces", "Blood_Blood", "swab (superficial and non-surgical)", "swab (cavity)",
                "unspecified diagnostic sample", "swab (wound)", "punctate", "biopsy", "surgical sample", "PUS", "Pus from Wound",
                "Tissue", "Blood specimen", "Small intestine", "Urine_", "Exudate", "Catheter Tip", "Anal glands", "Skin wound",
                "Abdominal", "Perirectal", "FLUID", "TISSUE", "ICU staff rectal swab", "blood draw same patient as MB9267", "Peritoneal Fluid",
                "Wound Discharge", "Cecal Contents", "Diseased Organ", "Perianal Swab", "Perineum Swab", "Rectal Isolate",
                "Abdomen Fluid", "Abdomen Tissue", "Kidney Swab", "Perineal Swab", "Endotracheal Aspirate", "Perianal", "Catheter Urine",
                "rectal", "Blood Specimen", "Cecum", "Faecal swab", "hand swab (farm workers)", "feces (farm workers)",
                "cecal", "nose swab (farm workers)", "nose swab (abattoir operator households)", "anal swab", "hand swab (abattoir operator households)",
                "feces (abattoir operator households)", "abdominal drainage", "midstream urine", "Feaces", "Perianal Abscess Swab",
                "Mid stream Urine", "Anal Swab", "Aortic Tissue", "Screening Swab", "Ankle Wound Tissue", "Periph.Ins.Central Catheter Tip",
                "Cecal content", "secretion"]
hospital_source = ["hospital sewage", "hospital", "Hospital influent", "Hospital efluent", "Baby Bath Drain Special Care Nursery"]
environmental_source = ["food", "beef trim", "ground beed patty", "Washington", "soil", "untreated waste water", 
                      "potable source water", "pizza dough dry mixture", "Raw Beef", "Apple cider", "flour",
                      "venison", "lettuce", "spinach", "environmental", "romaine lettuce", "leafy green",
                      "waste water primary treatment", "Wastewater\, primary treatment", "environmental reclaimed storage",
                      "kale", "drag swab through chicken house (Gallus gallus domesticus)", "milk",
                      "all-purpose wheat flour", "water", "cloaca", "chicken house back", "chicken house middle",
                      "chicken house front", "Sewage water", "pork", "water sewage", "environment", "environmental sample",
                      "river water", "sediment", "household environment", "surface", "doorknob", "control air", "reservoir water",
                      "Environment", "water reservoir", "Raw wastewater from aircraft arriving at an international German airport",
                      "Raw wastewater from three airplanes arriving from Singapore\, Mombasa and Canada", "cattle farm waste water",
                      "wastewater", "environmental swab lairage", "Environmental sample: soil sample", "cloacal swab",
                      "environmental swab", "Cistern water", "pond water", "water cistern", "fresh water stream surface",
                      "Manawatu river", "cattle farm", "dairy farm field soil", "Storm water drain", "Floor swab", "sold in market",
                      "Domestic Wastewaters", "Barn Environment", "Water from irrigation pond", "water pond", "water (slaughter house)",
                      "water (farm)", "lake water", "Soil/Sediment", "marine sediment"] # including food, wastewatter

ec_sra["isolation_source_categ"] = ["human" if x in human_source
                                    else "hospital" if x in hospital_source
                                    else "evironmental" if x in environmental_source
                                    else "other"
                                    for x in ec_sra["Isolation_source"]]
kp_sra["isolation_source_categ"] = ["human" if x in human_source
                                    else "hospital" if x in hospital_source
                                    else "evironmental" if x in environmental_source
                                    else "other"
                                    for x in kp_sra["Isolation_source"]]
pa_sra["isolation_source_categ"] = ["human" if x in human_source
                                    else "hospital" if x in hospital_source
                                    else "evironmental" if x in environmental_source
                                    else "other"
                                    for x in pa_sra["Isolation_source"]]

# Keep Consent column, to add private in the hospital data

# Collection column as new, to add if its public, mhh, cph, bs
ec_sra["collection"] = ['SRA' if x != 0
                       else 'SRA'
                       for x in ec_sra["Run"]]
kp_sra["collection"] = ['SRA' if x != 0
                       else 'SRA'
                       for x in kp_sra["Run"]]
pa_sra["collection"] = ['SRA' if x != 0
                       else 'SRA'
                       for x in pa_sra["Run"]]

# rename geo_loc_name_country and geo_loc_name_country_continent for country and continent
ec_sra.rename(columns={"geo_loc_name_country":"country", "geo_loc_name_country_continent":"continent"})
kp_sra.rename(columns={"geo_loc_name_country":"country", "geo_loc_name_country_continent":"continent"})
pa_sra.rename(columns={"geo_loc_name_country":"country", "geo_loc_name_country_continent":"continent"})

# add sp column
ec_sra["sp"] = ['ecoli' if x != 0
                       else 'ecoli'
                       for x in ec_sra["Run"]]
kp_sra["sp"] = ['kpneumoniae' if x != 0
                       else 'kpneumoniae'
                       for x in kp_sra["Run"]]
pa_sra["sp"] = ['paeruginosa' if x != 0
                       else 'paeruginosa'
                       for x in pa_sra["Run"]]

sra = pd.concat([ec_sra, kp_sra, pa_sra], ignore_index=True)

TypeError: to_csv() got an unexpected keyword argument 'delimiter'

In [92]:
sra.to_csv("../../data/sra_all.tsv", sep="\t", index=None)

In [93]:
sra

Unnamed: 0,Run,Assay Type,BioProject,BioSample,Center Name,Consent,Instrument,LibraryLayout,LibrarySource,Platform,SRA Study,geo_loc_name_country,geo_loc_name_country_continent,Isolation_source,Collection_Date,Year,isolation_source_categ,collection,sp
0,SRR2910655,WGS,PRJNA298331,SAMN04191544,CFSAN,public,Illumina MiSeq,PAIRED,GENOMIC,ILLUMINA,SRP064902,USA,North America,fecal beef heifer,2015-04-16,2015.0,other,SRA,ecoli
1,SRR2035435,WGS,PRJNA230969,SAMN03581169,CFSAN,public,Illumina MiSeq,PAIRED,GENOMIC,ILLUMINA,SRP058582,USA,North America,food,2015-04-16,2015.0,evironmental,SRA,ecoli
2,SRR2035445,WGS,PRJNA230969,SAMN03581170,CFSAN,public,Illumina MiSeq,PAIRED,GENOMIC,ILLUMINA,SRP058582,USA,North America,food,2015-04-16,2015.0,evironmental,SRA,ecoli
3,SRR2136798,WGS,PRJNA284275,SAMN03892147,CFSAN,public,Illumina MiSeq,PAIRED,GENOMIC,ILLUMINA,SRP061878,USA,North America,beef trim,2015-06-03,2015.0,evironmental,SRA,ecoli
4,SRR2121689,WGS,PRJNA243331,SAMN03580930,CFSAN,public,Illumina MiSeq,PAIRED,GENOMIC,ILLUMINA,SRP042343,USA,North America,ground beef patty,2015-04-14,2015.0,other,SRA,ecoli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42035,SRR23456569,WGS,PRJNA934930,SAMN33288620,CEDARS SINAI MEDICAL CENTER,public,NextSeq 2000,PAIRED,GENOMIC,ILLUMINA,SRP422491,USA,North America,ICU respiratory sample,2017-02-13,2017.0,other,SRA,paeruginosa
42036,SRR23456570,WGS,PRJNA934930,SAMN33288619,CEDARS SINAI MEDICAL CENTER,public,NextSeq 2000,PAIRED,GENOMIC,ILLUMINA,SRP422491,USA,North America,ICU respiratory sample,2017-02-13,2017.0,other,SRA,paeruginosa
42037,SRR23456571,WGS,PRJNA934930,SAMN33288618,CEDARS SINAI MEDICAL CENTER,public,NextSeq 2000,PAIRED,GENOMIC,ILLUMINA,SRP422491,USA,North America,ICU respiratory sample,2018-05-17,2018.0,other,SRA,paeruginosa
42038,SRR23456572,WGS,PRJNA934930,SAMN33288609,CEDARS SINAI MEDICAL CENTER,public,NextSeq 2000,PAIRED,GENOMIC,ILLUMINA,SRP422491,USA,North America,ICU respiratory sample,2016-08-13,2016.0,other,SRA,paeruginosa


In [None]:
# cph

In [94]:
cph

Unnamed: 0,pseudonym,kon,alder,PRV_MAT_TXT,provedato
0,SH001x01,M,74.380822,Urin fra reservoir (Bricker),2022-08-03
1,SH001x02,M,57.660274,Urin fra reservoir (Bricker),2022-08-04
2,SH001x03,M,81.638356,Urin fra KAD,2022-08-04
3,SH001x04,K,39.863014,Blod fra kateter (kolbe),2022-08-05
4,SH001x05,M,61.147945,Ekspektorat,2022-08-04
...,...,...,...,...,...
20154,SH300920x2x95,M,76.167123,Urin (midtstråle),2020-06-22
20155,SH300920x2x96,M,74.947945,Urin (midtstråle),2020-06-23
20156,Sh182x02,K,47.457534,Podning cicatrice,2022-07-01
20157,Sh184x02,K,72.498630,Urin fra KAD,2022-07-10
