In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
from pathlib import Path
import re
import nltk
import string

# Plot styling for the whole notebook.
plt.style.use("fivethirtyeight")
sns.set()
sns.set_context("talk")

# Setup - open the local SQLite database that holds the harmonized biosample
# tables and load `harmonized_wide_sel_envs` into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the database before running.
db_path = Path("/Users/ryan/Downloads/biosample_basex_data_good_subset.db")

engine = sqlalchemy.create_engine("sqlite:///{}".format(db_path))
# `connection` and `inspector` are not used in this cell; presumably they are
# kept for later cells - TODO confirm, and close `connection` when done.
connection = engine.connect()
inspector = sqlalchemy.inspect(engine)

# Pull every column of the table; filtering happens in pandas below.
query_name = """
SELECT *
FROM harmonized_wide_sel_envs
"""
harmonized_wide_sel_envs_df = pd.read_sql(sql=query_name, con=engine)

# --- Normalise `rel_to_oxygen` onto a controlled vocabulary -----------------
# Regex patterns that map free-text values onto the canonical terms listed in
# `proper_values`. They are applied with `Series.str.contains`, i.e. as a
# search, so only the patterns starting with `^` are anchored to the start.
# NOTE: `(-){0}` matches the empty string, so aerobe_pattern behaves like
# "^[Aa]erob(ic)*"; it is kept verbatim so results do not change.
aerobe_pattern = "^[Aa]erob(ic)*(-){0}"
anaerobe_pattern = "^(anaero)"
facultative_pattern = "^(facultative)"
microaerophilic_pattern = "microaerophilic" #microaerophilic already looks clean
microanaerobe_pattern = "microanaerobe" #None microanaerobe ?
obligate_aerobe_pattern = "obligate aerobe" #None obligate aerobe ?
obligate_anaerobe_pattern = "^obligate anaerobe" #already looks clean

proper_values = ["aerobe", "anaerobe", "facultative", "microaerophilic", "microanaerobe", "obligate aerobe", "obligate anaerobe"]

# Ordered (pattern, canonical value) rules. Order matters: each rule rewrites
# the column in place, so a value rewritten by an earlier rule is what the
# later rules see.
rel_to_oxygen_rules = [
    (aerobe_pattern, 'aerobe'),
    (anaerobe_pattern, 'anaerobe'),
    (facultative_pattern, 'facultative'),
    (microaerophilic_pattern, 'microaerophilic'),
    (microanaerobe_pattern, 'microanaerobe'),
    (obligate_aerobe_pattern, 'obligate aerobe'),
    (obligate_anaerobe_pattern, 'obligate anaerobe'),
]

# Record which rows are genuinely missing BEFORE stringifying: astype(str)
# turns None into "None" but NaN into "nan", and only the former was
# previously recognised as "no value".
rel_to_oxygen_raw = harmonized_wide_sel_envs_df['rel_to_oxygen']
rel_to_oxygen_is_missing = rel_to_oxygen_raw.isna()
harmonized_wide_sel_envs_df['rel_to_oxygen_rep'] = rel_to_oxygen_raw.astype(str)

# A row counts as "Repaired" when it had a value that was not already one of
# the canonical terms. The literal "None" check is kept for backward
# compatibility with the earlier behaviour.
# NOTE(review): rows whose value matches no rule end up as None below but are
# still labelled "Repaired" - confirm whether that is intended.
rel_to_oxygen_needs_repair = (
    ~harmonized_wide_sel_envs_df['rel_to_oxygen_rep'].isin(proper_values)
    & (harmonized_wide_sel_envs_df['rel_to_oxygen_rep'] != "None")
    & ~rel_to_oxygen_is_missing
)
harmonized_wide_sel_envs_df['rel_to_oxygen_rep_status'] = np.where(
    rel_to_oxygen_needs_repair, "Repaired", "Unchanged"
)

# Apply the rules in order. The column holds only strings at this point, so
# `str.contains` returns a clean boolean mask (no NaN entries).
for rel_to_oxygen_pattern, rel_to_oxygen_canonical in rel_to_oxygen_rules:
    rel_to_oxygen_matches = harmonized_wide_sel_envs_df['rel_to_oxygen_rep'].str.contains(
        rel_to_oxygen_pattern, regex=True
    )
    harmonized_wide_sel_envs_df.loc[rel_to_oxygen_matches, 'rel_to_oxygen_rep'] = rel_to_oxygen_canonical

# Anything that is still not a canonical term (including the stringified
# missing values) is blanked out.
harmonized_wide_sel_envs_df['rel_to_oxygen_rep'] = np.where(
    harmonized_wide_sel_envs_df['rel_to_oxygen_rep'].isin(proper_values),
    harmonized_wide_sel_envs_df['rel_to_oxygen_rep'],
    None,
)

# Give every air_temp value exactly one " degree Celsius" suffix: strip the
# suffix where it is already present, then append it to every value. Missing
# values stay missing. The suffix is a literal string, so regex matching is
# switched off explicitly.
harmonized_wide_sel_envs_df['air_temp'] = (
    harmonized_wide_sel_envs_df['air_temp']
    .str.replace(" degree Celsius", "", regex=False)
    + " degree Celsius"
)

# No further pattern pass over rel_to_oxygen_rep is needed here: that column
# is already reduced to canonical terms or None earlier in this cell, and a
# `.str.contains` mask over a column holding None cannot be used with `.loc`.


In [2]:
# Summarise the cleaned columns: category counts for rel_to_oxygen_rep, then
# the counts and the distinct values for air_temp.
air_temp_col = harmonized_wide_sel_envs_df['air_temp']
column_summaries = [
    ("rel_to_oxygen_rep Value Counts: ", harmonized_wide_sel_envs_df['rel_to_oxygen_rep'].value_counts()),
    ("air_temp Value Counts: ", air_temp_col.value_counts()),
    ("air_temp Unique Values: ", air_temp_col.unique()),
]
for summary_heading, summary_value in column_summaries:
    print(summary_heading)
    display(summary_value)

rel_to_oxygen_rep Value Counts: 


aerobe               3979
obligate anaerobe      66
anaerobe               47
facultative            23
microaerophilic        11
Name: rel_to_oxygen_rep, dtype: int64

air_temp Value Counts: 


-1.9 degree Celsius     12
-1 degree Celsius       12
-3.3 degree Celsius     11
-10.8 degree Celsius    11
-7.1 degree Celsius     11
-7.7 degree Celsius      9
-5.5 degree Celsius      6
-14.9 degree Celsius     6
-11.6 degree Celsius     6
-4.9 degree Celsius      6
-3.9 degree Celsius      6
-6 degree Celsius        6
-4.3 degree Celsius      6
-16 degree Celsius       6
-5.8 degree Celsius      6
-2.8 degree Celsius      6
-1.4 degree Celsius      6
-9 degree Celsius        6
-3.8 degree Celsius      6
-4.5 degree Celsius      6
-8.6 degree Celsius      6
-11.9 degree Celsius     6
-14 degree Celsius       6
-13.1 degree Celsius     6
-12.6 degree Celsius     6
-9.7 degree Celsius      6
-10.5 degree Celsius     6
-6.6 degree Celsius      6
-0.9 degree Celsius      5
-2.4 degree Celsius      5
-10.3 degree Celsius     5
-1.6 degree Celsius      5
-2.1 degree Celsius      5
-1.8 degree Celsius      4
-14.2 degree Celsius     4
-5.1 degree Celsius      4
19.5 degree Celsius      3
-

air_temp Unique Values: 


array([nan, '-10.3 degree Celsius', '-11.6 degree Celsius',
       '-10.5 degree Celsius', '-9.7 degree Celsius',
       '-12.6 degree Celsius', '-13.1 degree Celsius',
       '-14 degree Celsius', '-11.9 degree Celsius',
       '-10.8 degree Celsius', '-8.6 degree Celsius',
       '-7.7 degree Celsius', '-4.5 degree Celsius',
       '-3.8 degree Celsius', '-2.1 degree Celsius',
       '-1.4 degree Celsius', '-3.3 degree Celsius',
       '-5.8 degree Celsius', '-7.1 degree Celsius', '-16 degree Celsius',
       '-14.2 degree Celsius', '-6 degree Celsius', '-3.9 degree Celsius',
       '-1 degree Celsius', '-0.82 degree Celsius', '-4.9 degree Celsius',
       '-2.8 degree Celsius', '-1.6 degree Celsius',
       '-14.9 degree Celsius', '-1.9 degree Celsius',
       '-6.6 degree Celsius', '-5.5 degree Celsius',
       '-2.4 degree Celsius', '-0.9 degree Celsius',
       '-1.2 degree Celsius', '-1.8 degree Celsius',
       '-4.3 degree Celsius', '-5.1 degree Celsius', '-9 degree Celsius',
