# Setup

In [None]:
import os
import numpy as np
import pandas as pd

import plotly.express as px

In [None]:
# Input tables read by this notebook (paths are relative to the notebook's
# working directory).
DOWNLOADED_SUMMARY = (
    '../../data/metadata/'
    'filtered_downloaded_species_summary.csv'
)
DOWNLOADED_METADATA = (
    '../../data/metadata/'
    'filtered_downloaded_species_metadata.csv'
)

In [None]:
# Both tables are loaded the same way: the first CSV column becomes the row
# index and every other column is read with dtype 'object' (no numeric
# type inference).
csv_read_options = dict(index_col=0, dtype='object')

downloaded_summary = pd.read_csv(DOWNLOADED_SUMMARY, **csv_read_options)
downloaded_metadata = pd.read_csv(DOWNLOADED_METADATA, **csv_read_options)

# Quick look at the summary table: its (rows, columns) shape and first rows.
display(downloaded_summary.shape, downloaded_summary.head())

# De-duplicate entries

## Ensure `biosample_accession` is unique & drop duplicates

In [None]:
# Flag every row whose `biosample_accession` already appeared in an earlier
# row, then keep only the first occurrence of each accession.
# NOTE(review): missing accessions appear to compare equal to each other here,
# so only the first row lacking a biosample_accession would survive - confirm
# that this is intended.
repeated_biosample = downloaded_metadata.duplicated(subset=['biosample_accession'])
downloaded_metadata = downloaded_metadata[~repeated_biosample]

# Shape and first rows of the de-duplicated metadata.
display(downloaded_metadata.shape, downloaded_metadata.head())

## Ensure `assembly_accession` is unique

In [None]:
## Skipping this step because many strains do not have an assembly accession

In [None]:
# downloaded_metadata = downloaded_metadata.drop_duplicates(subset=['assembly_accession'])

# display(
#     downloaded_metadata.shape,
#     downloaded_metadata.head()
# )

## (Optional) Go through and remove any empty or mostly empty columns

In [None]:
# Count the missing (NA) entries of every metadata column in one vectorised
# pass. The result is indexed by column name and has a single integer
# `num_NA` column (the previous per-column loop produced the same counts but
# stored them with object dtype).
mostly_empty_cols = downloaded_metadata.isna().sum().to_frame(name='num_NA')

# Distribution of NA counts per column. Columns in which half or more of the
# entries are N/A get dropped by the cutoff applied in the following cell.
px.histogram(mostly_empty_cols, x='num_NA', nbins=100)

In [None]:
# A column is kept only when fewer than half of its entries are missing.
cutoff_cols_NA = downloaded_metadata.shape[0] / 2
cond = mostly_empty_cols.num_NA < cutoff_cols_NA

# Take an explicit copy: the row assignment below would otherwise write into
# a boolean-filtered frame, which pandas reports with SettingWithCopyWarning.
mostly_full_cols = mostly_empty_cols[cond].copy()

# Keep `plasmids` regardless of how sparse it is. The guard avoids a KeyError
# when the metadata has no `plasmids` column at all.
if 'plasmids' in mostly_empty_cols.index:
    mostly_full_cols.loc['plasmids', 'num_NA'] = mostly_empty_cols.loc['plasmids', 'num_NA']

mostly_full_cols

In [None]:
# Restrict the metadata to the columns that survived the NA filter; the
# surviving column names are the index of `mostly_full_cols`.
kept_columns = mostly_full_cols.index
scrubbed_metadata = downloaded_metadata[kept_columns]
scrubbed_metadata

## (Optional) Clean comments, hyphens, etc. in metadata

In [None]:
# List the remaining column names for manual review.
# Go through these columns and make sure each one holds relevant information;
# any column that does not can be dropped.
scrubbed_metadata.columns

### `mlst` column should be removed if it still exists

In [None]:
# We will run MLST separately, so it's worth dropping this column.
# Show its value counts first - but only if the column is still present; it
# may already have been removed by the NA filter, in which case indexing it
# would raise a KeyError.
(
    scrubbed_metadata['mlst'].value_counts()
    if 'mlst' in scrubbed_metadata.columns
    else None
)

In [None]:
# Drop `mlst` if it still exists; errors='ignore' makes this a no-op (instead
# of a KeyError) when the column was already removed earlier.
scrubbed_metadata = scrubbed_metadata.drop(columns=['mlst'], errors='ignore')

# Save cleaned summary & metadata files

In [None]:
# Write the scrubbed metadata into the same directory as the input file.
# os.path.dirname is used instead of splitting the path on the input's base
# name, so the directory stays correct even if the input file is renamed.
filepath = os.path.join(
    os.path.dirname(DOWNLOADED_METADATA),
    'scrubbed_species_metadata.csv',
)

filepath

In [None]:
# Save the scrubbed metadata table as CSV at the path built in the previous cell.
scrubbed_metadata.to_csv(filepath)

In [None]:
# Keep only the summary rows whose index labels remain in the scrubbed
# metadata, so both output tables describe the same set of entries.
scrubbed_summary = downloaded_summary.loc[scrubbed_metadata.index]

# Write the scrubbed summary into the same directory as the input file.
# os.path.dirname is used instead of splitting the path on the input's base
# name, so the directory stays correct even if the input file is renamed.
filepath = os.path.join(
    os.path.dirname(DOWNLOADED_SUMMARY),
    'scrubbed_species_summary.csv',
)

filepath

In [None]:
# Save the scrubbed summary table as CSV at the path built in the previous cell.
scrubbed_summary.to_csv(filepath)

In [None]:
# Number of genomes filtered during de-duplication: row count of the original
# summary minus row count of the scrubbed summary.
len(downloaded_summary) - len(scrubbed_summary)

In [None]:

!ls -l