In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy import stats
from scipy.stats import ttest_ind

In [2]:
# Load the tab-delimited sample metadata table; the first column is used as
# the row index.
meta = pd.read_csv(
    'T1_SMDS_metadata_ms.txt',
    delimiter='\t',
    index_col=0,
)

## Evaluate differences in saliva flow rate after eating versus not

In [3]:
# Split the metadata on the 'food' column: rows marked 'yes' are the samples
# collected after eating, rows marked 'no' are the samples that were not.
ate = meta.loc[meta['food'].eq('yes')]
didnot = meta.loc[meta['food'].eq('no')]

In [4]:
# Mean and standard error of the mean (SEM) of the saliva flow rate for the
# samples collected after eating.
sfr_ate = ate['saliva_flow_rate_mL_per_min']
print('SFR after eating avg:', sfr_ate.mean())
print('SFR after eating SEM:', sfr_ate.sem())

SFR after eating avg: 0.5372222222222223
SFR after eating SEM: 0.03875670582706708


In [5]:
# Mean and standard error of the mean (SEM) of the saliva flow rate for the
# samples that were not collected after eating.
sfr_didnot = didnot['saliva_flow_rate_mL_per_min']
print('SFR not after eating avg:', sfr_didnot.mean())
print('SFR not after eating SEM:', sfr_didnot.sem())

SFR not after eating avg: 0.43
SFR not after eating SEM: 0.029611625999328754


In [6]:
# Independent two-sample t-test (SciPy defaults) comparing saliva flow rate
# between the after-eating and not-after-eating groups. Left as the final
# expression so the result is displayed as the cell output.
ttest_ind(
    ate['saliva_flow_rate_mL_per_min'],
    didnot['saliva_flow_rate_mL_per_min'],
)

Ttest_indResult(statistic=2.232805418765676, pvalue=0.02683704582229939)

## Evaluate differences in microbial concentration after eating versus not

In [9]:
# Keep only the rows that have a recorded 'FC_avg_cells_5_min' value, so the
# statistical tests on that column are not fed missing values.
atec = ate[ate['FC_avg_cells_5_min'].notna()]
didnotc = didnot[didnot['FC_avg_cells_5_min'].notna()]

In [10]:
# Split each NaN-filtered group (ate / did not eat) by the 'processing'
# column into its 'raw' rows (R prefix) and its 'PMA' rows (P prefix).
Ratec = atec.loc[atec['processing'].eq('raw')]
Patec = atec.loc[atec['processing'].eq('PMA')]
Rdidnotc = didnotc.loc[didnotc['processing'].eq('raw')]
Pdidnotc = didnotc.loc[didnotc['processing'].eq('PMA')]

In [16]:
# Kruskal-Wallis H-test on 'FC_avg_cells_5_min' between PMA samples collected
# after eating and PMA samples not collected after eating. Left as the final
# expression so the result is displayed as the cell output.
pma_cells_ate = Patec['FC_avg_cells_5_min']
pma_cells_didnot = Pdidnotc['FC_avg_cells_5_min']
stats.kruskal(pma_cells_ate, pma_cells_didnot)

KruskalResult(statistic=0.4067142008318001, pvalue=0.5236419762727548)

In [15]:
# Kruskal-Wallis H-test on 'FC_avg_cells_5_min' between raw samples collected
# after eating and raw samples not collected after eating. Left as the final
# expression so the result is displayed as the cell output.
raw_cells_ate = Ratec['FC_avg_cells_5_min']
raw_cells_didnot = Rdidnotc['FC_avg_cells_5_min']
stats.kruskal(raw_cells_ate, raw_cells_didnot)

KruskalResult(statistic=0.0018568033273709261, pvalue=0.9656292799047517)

## Evaluate percentage reads aligning to chloroplasts

In [34]:
# Share of samples in each group that have any reads aligning to chloroplast
# sequences (column value > 0). NOTE: the printed value is a fraction in
# the 0-1 range (positives / group size); it is not multiplied by 100 even
# though the label starts with '%'.
chloro_ate = ate['percent_reads_aligning_to_chloroplast_seqeuneces']
chloro_didnot = didnot['percent_reads_aligning_to_chloroplast_seqeuneces']
print('%samples after eating:', len(ate[chloro_ate > 0]) / len(ate))
print('%samples not after eating:', len(didnot[chloro_didnot > 0]) / len(didnot))


%samples after eating: 0.4861111111111111
%samples not after eating: 0.16346153846153846


In [37]:
# Restrict each group to its chloroplast-positive samples, i.e. rows whose
# chloroplast read percentage is greater than zero.
POSdidnot = didnot.loc[didnot['percent_reads_aligning_to_chloroplast_seqeuneces'].gt(0)]
POSate = ate.loc[ate['percent_reads_aligning_to_chloroplast_seqeuneces'].gt(0)]

In [42]:
# Mean of the chloroplast read percentage column among chloroplast-positive
# samples only. POSate is the after-eating subset and POSdidnot is the
# not-after-eating subset, so each label must be paired with the matching
# frame (they were previously swapped, mislabelling both results).
print('avg %abu chloroplast after eating:', POSate['percent_reads_aligning_to_chloroplast_seqeuneces'].mean())
print('avg %abu chloroplast not after eating:', POSdidnot['percent_reads_aligning_to_chloroplast_seqeuneces'].mean())

avg %abu chloroplast after eating: 0.005973935771428571
avg %abu chloroplast not after eating: 0.0003530346470588235


## Evaluate differences in chloroplast hits in raw vs PMA-tx samples

In [50]:
# Split the full metadata table by the 'processing' column into the
# PMA-treated samples and the raw samples.
PMA = meta.loc[meta['processing'].eq('PMA')]
raw = meta.loc[meta['processing'].eq('raw')]

In [51]:
# Count the samples in each processing group that have any reads aligning to
# chloroplast sequences (column value > 0).
raw_hits = raw[raw['percent_reads_aligning_to_chloroplast_seqeuneces'] > 0]
pma_hits = PMA[PMA['percent_reads_aligning_to_chloroplast_seqeuneces'] > 0]
print('# raw samples with chloroplast hits:', len(raw_hits))
print('# PMA samples with chloroplast hits:', len(pma_hits))

# raw samples with chloroplast hits: 42
# PMA samples with chloroplast hits: 10
