# Explore RAST Metadata

I have abstracted this out because we need some new functions, and so I want a clean notebook. Will merge with the RAST notebook later!

In [1]:
import os
import sys

from collections import Counter

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import seaborn as sns
import numpy as np

import math
import re

from PhiSpyAnalysis import theils_u, DateConverter, printmd
from PhiSpyAnalysis import read_phages, read_gtdb, read_checkv, read_base_pp, read_categories, read_metadata, read_gbk_metadata

from scipy.stats import pearsonr, f_oneway, entropy
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, tukeyhsd, MultiComparison
from statsmodels.multivariate.manova import MANOVA
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier


## Read the metadata

The small file (`use_small_data=True`) is intended for development; this run reads the full metadata file (`use_small_data=False`). We then clean up a few known issues.

In [2]:
# Read the metadata table into `metadf`; every later cell works from this frame.
# NOTE(review): the flag name suggests True selects a smaller development file
# and False the complete one — confirm against read_metadata.
use_small_data=False
metadf = read_metadata(use_small_data=use_small_data)

In [3]:
# Peek at the isolation_date column of the metadata table.
metadf.loc[:, 'isolation_date']

0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
             ...     
320171    2015.674880
320172            NaN
320173    2014.675565
320174    2014.675565
320175            NaN
Name: isolation_date, Length: 320176, dtype: float64

# Theil's U

This work comes from [The Search For Categorical Correlation](https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9) and [The Dython Library](https://github.com/shakedzy/dython), but I have abstracted and simplified it here for my own use.

_Note_ I have now abstracted this into a class for use in the code

In [4]:
# Theil's U between the isolation country and the geographic location columns.
# NOTE(review): presumably theils_u(x, y) is U(x|y) — how much of x is explained
# by knowing y, as in dython — confirm against PhiSpyAnalysis.
theils_u(metadf['isolation_country'], metadf['geographic_location'])

0.9721908947957727

In [5]:
# Allow long tables to be shown in full, then list every genome for which both
# the isolation site and the other clinical information are recorded.
pd.set_option('display.max_rows', 1000)
metadf[['isolation_site', 'other_clinical']].dropna()

Unnamed: 0,isolation_site,other_clinical
14557,Respiratory,hospital location:ICU
14558,Urine,hospital location:non-ICU
15424,Urine,hospital location:non-ICU
15426,Urine,hospital location:non-ICU
15430,Urine,hospital location:non-ICU
15431,Urine,hospital location:non-ICU
15432,Tissue,hospital location:non-ICU
15433,Urine,hospital location:non-ICU
15434,Urine,hospital location:non-ICU
15440,Respiratory,hospital location:non-ICU


In [6]:
if False:
    # Disabled: all-vs-all Theil's U across every metadata column.
    n_cols = len(metadf.columns)
    hm = pd.DataFrame(index=metadf.columns, columns=metadf.columns, dtype=float)
    for row_pos in range(n_cols):
        row_series = metadf.iloc[:, row_pos]
        for col_pos in range(n_cols):
            hm.iloc[row_pos, col_pos] = theils_u(row_series, metadf.iloc[:, col_pos])
    sns.heatmap(hm)

In [None]:
# Columns worth comparing pairwise; the accession column is kept first.
acccol = 'assembly_accession'
interesting_cols = [
    acccol,
    'isolation_site', 'geographic_location', 'isolation_country',
    'latitude', 'longitude', 'altitude', 'depth',
    'other_environmental',
    'host_name', 'host_gender', 'host_age', 'host_health',
    'body_sample_site', 'body_sample_subsite', 'other_clinical',
    'gram_stain', 'cell_shape', 'motility', 'sporulation',
    'temperature_range', 'optimal_temperature', 'salinity',
    'oxygen_requirement', 'habitat', 'disease', 'isolation_date',
]

# Square matrix of Theil's U: the row label is the first argument to theils_u
# and the column label is the second.
hmi = pd.DataFrame(index=interesting_cols, columns=interesting_cols, dtype=float)
for row_label in interesting_cols:
    row_series = metadf.loc[:, row_label]
    for col_label in interesting_cols:
        hmi.loc[row_label, col_label] = theils_u(row_series, metadf.loc[:, col_label])

In [None]:
# Draw the pairwise Theil's U matrix as a heat map with thin black cell borders.
# (An alternative colour scheme is cmap='Blues'.)
fig, ax = plt.subplots(figsize=(11, 8))
ax = sns.heatmap(hmi, ax=ax, linewidths=0.01, linecolor='k')
# Assigning the return values keeps the label objects out of the cell output.
t = ax.set_ylabel("How much can we infer about:")
t = ax.set_xlabel("Given that we know:")

In [None]:
# Compare the two directions of Theil's U for a pair of metadata columns and
# report which column is the better predictor of the other.
# NOTE(review): assumes theils_u(x, y) is U(x|y), the share of the uncertainty
# in x that is removed by knowing y (as in dython) — confirm against PhiSpyAnalysis.
col1 = 'other_clinical'
col2 = 'isolation_site'
u1 = theils_u(metadf.loc[:, col1], metadf.loc[:, col2])  # col1 given col2
u2 = theils_u(metadf.loc[:, col2], metadf.loc[:, col1])  # col2 given col1

print(f"{col1} vs {col2} {u1}")
print(f"{col2} vs {col1} {u2}")

# Theil's U is an uncertainty coefficient between 0 and 1, not a probability.
print("Theil's U(X|Y) is the fraction of the uncertainty in X that is removed by knowing Y")
if u1 == u2:
    # No asymmetry to report when both directions score the same.
    print(f"Knowing the {col1} tells us as much about the {col2} as knowing the {col2} tells us about the {col1}")
else:
    # The larger score identifies which column is the more informative one to know.
    known, inferred = (col2, col1) if u1 > u2 else (col1, col2)
    print(f"This means that knowing the {known} tells us more about the {inferred} ")
    print(f"than knowing the {inferred} tells us about the {known}")
     

In [None]:
# For genomes isolated from sputum, count rows per (other_clinical, isolation_site) pair.
metadf.loc[metadf['isolation_site'].eq('Sputum')].groupby(['other_clinical', 'isolation_site']).size()

In [None]:
# Count the rows for each (isolation_site, other_clinical) combination, grouped by site first.
metadf.groupby(['isolation_site', 'other_clinical']).size()

In [None]:
# The same counts with the grouping order reversed: other_clinical first, then isolation_site.
metadf.groupby(['other_clinical', 'isolation_site']).size()

In [None]:
# Show the two location columns side by side for the rows that have both recorded.
col1 = 'geographic_location'
col2 = 'isolation_country'
tmp = metadf.dropna(subset=[col1, col2])

tmp.loc[:, [col1, col2]]

In [None]:
# Compare the two directions of Theil's U for a pair of metadata columns and
# report which column is the better predictor of the other.
# NOTE(review): assumes theils_u(x, y) is U(x|y), the share of the uncertainty
# in x that is removed by knowing y (as in dython) — confirm against PhiSpyAnalysis.
col1 = 'geographic_location'
col2 = 'isolation_country'
u1 = theils_u(metadf.loc[:, col1], metadf.loc[:, col2])  # col1 given col2
u2 = theils_u(metadf.loc[:, col2], metadf.loc[:, col1])  # col2 given col1

print(f"{col1} vs {col2} {u1}")
print(f"{col2} vs {col1} {u2}")

# Theil's U is an uncertainty coefficient between 0 and 1, not a probability.
print("Theil's U(X|Y) is the fraction of the uncertainty in X that is removed by knowing Y")
if u1 == u2:
    # No asymmetry to report when both directions score the same.
    print(f"Knowing the {col1} tells us as much about the {col2} as knowing the {col2} tells us about the {col1}")
else:
    # The larger score identifies which column is the more informative one to know.
    known, inferred = (col2, col1) if u1 > u2 else (col1, col2)
    print(f"This means that knowing the {known} tells us more about the {inferred} ")
    print(f"than knowing the {inferred} tells us about the {known}")
    

In [None]:
# For each geographic_location, how many distinct isolation_country values does
# it occur with (and vice versa)? A value is "single" when it is only ever seen
# with one partner value and "multiple" otherwise.
col1 = 'geographic_location'
col2 = 'isolation_country'

# Only rows where both values are recorded are informative. Missing values must
# not be used as dictionary keys: NaN never compares equal to itself, so it would
# either appear as a bogus extra category or create one key per row.
paired = metadf[[col1, col2]].dropna()

# counts12[a][b] is the number of rows with col1 == a and col2 == b;
# counts21 holds the same tallies keyed the other way round.
# Tallying the pairs in one pass avoids the slow row-by-row DataFrame.iterrows().
counts12 = {}
counts21 = {}
for (gl, ic), n_rows in Counter(zip(paired[col1], paired[col2])).items():
    counts12.setdefault(gl, {})[ic] = n_rows
    counts21.setdefault(ic, {})[gl] = n_rows

# single = exactly one partner value, multiple = everything else
c12s = sum(1 for partners in counts12.values() if len(partners) == 1)
c12m = len(counts12) - c12s
c21s = sum(1 for partners in counts21.values() if len(partners) == 1)
c21m = len(counts21) - c21s

if not counts12:
    # Nothing to summarise, and the percentages below would divide by zero.
    print(f"No rows have both {col1} and {col2} recorded")
else:
    print(f"Given {len(counts12.keys()):,} {col1} you will have {c12s:,} ({c12s/len(counts12.keys())*100:.2f} %) single {col2}")
    print(f"Given {len(counts12.keys()):,} {col1} you will have {c12m:,} ({c12m/len(counts12.keys())*100:.2f} %) multiple {col2}")
    print(f"\nGiven {len(counts21.keys()):,} {col2} you will have {c21s:,} ({c21s/len(counts21.keys())*100:.2f} %) single {col1}")
    print(f"Given {len(counts21.keys()):,} {col2} you will have {c21m:,} ({c21m/len(counts21.keys())*100:.2f} %) multiple {col1}")