# Использование индексов биологического разнообразия для анализа микробиоты

Задание:

- Описать подходы к подсчету разнообразия микробиоты разными критериями
- Рассчитать каждый индекс на наших данных

In [30]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, Normalizer

In [4]:
import warnings
# Install a global "ignore" filter: every warning raised after this point is
# suppressed. NOTE(review): this also hides deprecation warnings coming from
# pandas / scikit-learn in the cells below — presumably done only to keep the
# notebook output clean; confirm it is still wanted.
warnings.filterwarnings('ignore')

In [5]:
# Project root: two directory levels above the current working directory.
_working_dir = os.getcwd()
_working_dir_parent = os.path.dirname(_working_dir)
BASE_DIR = os.path.dirname(_working_dir_parent)

In [6]:
# All reference workbooks live in <BASE_DIR>/data/docs, one file per
# taxonomic rank plus a combined "full" workbook.
_DOCS_DIR = os.path.join(BASE_DIR, 'data', 'docs')

DOCS_DOMAIN_PATH = os.path.join(_DOCS_DIR, 'domain.xlsx')
DOCS_PHYLUM_PATH = os.path.join(_DOCS_DIR, 'phylum.xlsx')
DOCS_CLASS_PATH = os.path.join(_DOCS_DIR, 'class.xlsx')
DOCS_ORDER_PATH = os.path.join(_DOCS_DIR, 'order.xlsx')
DOCS_FAMILY_PATH = os.path.join(_DOCS_DIR, 'family.xlsx')
DOCS_SPECIES_PATH = os.path.join(_DOCS_DIR, 'species.xlsx')

DOCS_FULL_PATH = os.path.join(_DOCS_DIR, 'full.xlsx')

In [7]:
# Read the per-rank reference workbooks, in the same order as before
# (domain, phylum, class, order, family, species).
df_domain, df_phylum, df_class, df_order, df_family, df_species = (
    pd.read_excel(rank_path)
    for rank_path in (
        DOCS_DOMAIN_PATH,
        DOCS_PHYLUM_PATH,
        DOCS_CLASS_PATH,
        DOCS_ORDER_PATH,
        DOCS_FAMILY_PATH,
        DOCS_SPECIES_PATH,
    )
)

# Combined reference workbook ('full.xlsx').
df_docs = pd.read_excel(DOCS_FULL_PATH)

Загружаем данные для анализа

In [8]:
# Cleaned measurement workbooks live in <BASE_DIR>/data/clean.
_CLEAN_DIR = os.path.join(BASE_DIR, 'data', 'clean')
DATA_CLEAN_PATH = os.path.join(_CLEAN_DIR, 'data.xlsx')
DATA_CLEAN_UNWRAP_PATH = os.path.join(_CLEAN_DIR, 'data_unwrap.xlsx')

In [9]:
# Read both cleaned workbooks: 'data.xlsx' first, then 'data_unwrap.xlsx'.
df_clean, df_clean_unwrap = (
    pd.read_excel(clean_path)
    for clean_path in (DATA_CLEAN_PATH, DATA_CLEAN_UNWRAP_PATH)
)

In [10]:
# Preview the first five rows of the cleaned table.
df_clean.head(n=5)

Unnamed: 0,group,number,Arthrobacter,Galactobacter,Corynebacterium,Adlercreutzia,Gordonibacter,Paraeggerthella,Rubneribacter,Slackia,...,Turicimonas,Vampirovibrio,Enterobacter,Escherichia/Shigella,Klebsiella,unclassified_Enterobacteriaceae,Acinetobacter,Stenotrophomonas,unclassified_Bacteria,unclassified_Dothideomycetes
0,ПР,1,0,0,0,5,0,0,77,0,...,3,23,487,0,5,0,3,3,3,0
1,ПР,2,0,0,5,10,0,0,95,3,...,0,8,100,0,0,0,0,3,4,0
2,ПДР,1,0,0,3,0,0,0,22,0,...,0,0,56,6,0,0,0,4,0,0
3,ПДР,2,10,0,0,0,0,0,23,0,...,9,21,316,11,5,0,0,26,3,0
4,Целлюлоза,1,0,4,3,0,0,0,19,0,...,8,0,323,8,6,0,0,0,0,0


In [16]:
# Left-join the reference table onto the unwrapped data by species id, so
# every row of df_clean_unwrap is kept.
df_clean_unwrap_docs = pd.merge(
    df_clean_unwrap,
    df_docs,
    how='left',
    on=['id_species'],
)
df_clean_unwrap_docs.head(n=5)

Unnamed: 0,group,number,id_species,colonies,species,id_family,family,id_order,order,id_class,class,id_phylum,phylum,id_domain,domain
0,ПР,1,1,0,Arthrobacter,1,Micrococcaceae,1,Micrococcales,1,Actinobacteria,1,Actinobacteria,1,Bacteria
1,ПР,1,2,0,Galactobacter,1,Micrococcaceae,1,Micrococcales,1,Actinobacteria,1,Actinobacteria,1,Bacteria
2,ПР,1,3,0,Corynebacterium,2,Corynebacteriaceae,2,Mycobacteriales,1,Actinobacteria,1,Actinobacteria,1,Bacteria
3,ПР,1,4,5,Adlercreutzia,3,Eggerthellaceae,3,Eggerthellales,2,Coriobacteriia,1,Actinobacteria,1,Bacteria
4,ПР,1,5,0,Gordonibacter,3,Eggerthellaceae,3,Eggerthellales,2,Coriobacteriia,1,Actinobacteria,1,Bacteria


Нормализованное представление по филумам

In [39]:
# Sum 'colonies' per phylum for every (group, number) pair, then transpose so
# that each row is one (group, number) sample and each column one phylum.
df_clean_by_phylum = pd.pivot_table(
    df_clean_unwrap_docs,
    values='colonies',
    index=['phylum'],
    columns=['group', 'number'],
    aggfunc='sum',
).T

# L1-normalise every row so its values sum to 1 (per-sample shares).
# A new float frame is built instead of assigning into a copy of the count
# table through ``.iloc[:, :] = ...``: that assignment relied on implicit
# dtype upcasting (deprecated in pandas) and left all-zero columns with a
# different dtype than the rest of the table.
df_clean_by_phylum_normalize = pd.DataFrame(
    Normalizer(norm='l1').fit_transform(df_clean_by_phylum),
    index=df_clean_by_phylum.index,
    columns=df_clean_by_phylum.columns,
)
df_clean_by_phylum_normalize.head(5)

Unnamed: 0_level_0,phylum,Actinobacteria,Ascomycota,Bacteroidetes,Firmicutes,Proteobacteria,unclassified_Bacteria
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fe,1,0.000715,0.0,0.619878,0.374672,0.004564,0.00017
Fe,2,0.000973,0.0,0.445806,0.533467,0.019468,0.000286
АУ,1,0.000605,0.0,0.54265,0.455303,0.001443,0.0
АУ,2,0.000954,0.0,0.679295,0.312494,0.007257,0.0
Биф,1,0.001645,0.0,0.430967,0.533662,0.033726,0.0


Нормализованное представление по классам

In [38]:
# Sum 'colonies' per class for every (group, number) pair, then transpose so
# that each row is one (group, number) sample and each column one class.
df_clean_by_class = pd.pivot_table(
    df_clean_unwrap_docs,
    values='colonies',
    index=['class'],
    columns=['group', 'number'],
    aggfunc='sum',
).T

# L1-normalise every row so its values sum to 1 (per-sample shares).
# A new float frame is built instead of assigning into a copy of the count
# table through ``.iloc[:, :] = ...``: that assignment relied on implicit
# dtype upcasting (deprecated in pandas) and left all-zero columns with a
# different dtype than the rest of the table.
df_clean_by_class_normalize = pd.DataFrame(
    Normalizer(norm='l1').fit_transform(df_clean_by_class),
    index=df_clean_by_class.index,
    columns=df_clean_by_class.columns,
)
df_clean_by_class_normalize.head(5)

Unnamed: 0_level_0,class,Actinobacteria,Alphaproteobacteria,Bacilli,Bacteroidia,Betaproteobacteria,Clostridia,Coriobacteriia,Deltaproteobacteria,Dothideomycetes,Erysipelotrichia,Gammaproteobacteria,unclassified_Bacteria,unclassified_Firmicutes
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Fe,1,0.0,0.000102,0.178071,0.619878,0.000272,0.179809,0.000715,0.002316,0.0,0.013761,0.001873,0.00017,0.003031
Fe,2,0.000286,0.0,0.134555,0.445806,0.000229,0.39164,0.000687,0.000172,0.0,0.006127,0.019067,0.000286,0.001145
АУ,1,0.000233,0.0,0.061148,0.54265,0.0,0.393271,0.000372,0.000279,0.0,0.000372,0.001163,0.0,0.000512
АУ,2,0.000341,0.000273,0.068584,0.679295,0.001192,0.234132,0.000613,0.000715,0.0,0.008654,0.005076,0.0,0.001124
Биф,1,0.000316,0.00019,0.358707,0.430967,0.0,0.168438,0.001329,0.001708,0.0,0.005189,0.031827,0.0,0.001329


Нормализованное представление по порядкам

In [40]:
# Sum 'colonies' per order for every (group, number) pair, then transpose so
# that each row is one (group, number) sample and each column one order.
df_clean_by_order = pd.pivot_table(
    df_clean_unwrap_docs,
    values='colonies',
    index=['order'],
    columns=['group', 'number'],
    aggfunc='sum',
).T

# L1-normalise every row so its values sum to 1 (per-sample shares).
# A new float frame is built instead of assigning into a copy of the count
# table through ``.iloc[:, :] = ...``: that assignment relied on implicit
# dtype upcasting (deprecated in pandas) and left all-zero columns with a
# different dtype than the rest of the table.
df_clean_by_order_normalize = pd.DataFrame(
    Normalizer(norm='l1').fit_transform(df_clean_by_order),
    index=df_clean_by_order.index,
    columns=df_clean_by_order.columns,
)
df_clean_by_order_normalize.head(5)

Unnamed: 0_level_0,order,Bacillales,Bacteroidales,Bdellovibrionales,Burkholderiales,Clostridiales,Eggerthellales,Enterobacteriales,Erysipelotrichales,Kiloniellales,Lactobacillales,Micrococcales,Mycobacteriales,Pseudomonadales,Rhizobiales,Xanthomonadales,unclassified_Bacteria,unclassified_Dothideomycetes,unclassified_Firmicutes
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Fe,1,0.000204,0.619878,0.002316,0.000272,0.179809,0.000715,0.001703,0.013761,0.000102,0.177867,0.0,0.0,0.0,0.0,0.00017,0.00017,0.0,0.003031
Fe,2,0.009734,0.445806,0.000172,0.000229,0.39164,0.000687,0.019067,0.006127,0.0,0.124821,0.0,0.000286,0.0,0.0,0.0,0.000286,0.0,0.001145
АУ,1,0.000279,0.54265,0.000279,0.0,0.393271,0.000372,0.001163,0.000372,0.0,0.060868,0.0,0.000233,0.0,0.0,0.0,0.0,0.0,0.000512
АУ,2,0.0,0.679295,0.000715,0.001192,0.234132,0.000613,0.005076,0.008654,0.000273,0.068584,0.000341,0.0,0.0,0.0,0.0,0.0,0.0,0.001124
Биф,1,0.005378,0.430967,0.001708,0.0,0.168438,0.001329,0.031005,0.005189,0.0,0.353328,0.0,0.000316,0.0,0.00019,0.000823,0.0,0.0,0.001329


Нормализованное представление по семействам

In [41]:
# Sum 'colonies' per family for every (group, number) pair, then transpose so
# that each row is one (group, number) sample and each column one family.
df_clean_by_family = pd.pivot_table(
    df_clean_unwrap_docs,
    values='colonies',
    index=['family'],
    columns=['group', 'number'],
    aggfunc='sum',
).T

# L1-normalise every row so its values sum to 1 (per-sample shares).
# A new float frame is built instead of assigning into a copy of the count
# table through ``.iloc[:, :] = ...``: that assignment relied on implicit
# dtype upcasting (deprecated in pandas) and left all-zero columns with a
# different dtype than the rest of the table (visible as ``0`` next to
# ``0.0`` in the rendered output).
df_clean_by_family_normalize = pd.DataFrame(
    Normalizer(norm='l1').fit_transform(df_clean_by_family),
    index=df_clean_by_family.index,
    columns=df_clean_by_family.columns,
)
df_clean_by_family_normalize.head(5)

Unnamed: 0_level_0,family,Bacillaceae 1,Bacteroidaceae,Barnesiellaceae,Bdellovibrionaceae,Brucellaceae,Carnobacteriaceae,Catabacteriaceae,Christensenellaceae,Clostridiaceae 1,Clostridiales_Incertae Sedis XIII,...,Streptococcaceae,Sutterellaceae,Xanthomonadaceae,"unclassified_""Bacteroidales""",unclassified_Bacillales,unclassified_Bacteria,unclassified_Clostridiales,unclassified_Dothideomycetes,unclassified_Firmicutes,unclassified_Lactobacillales
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fe,1,0.000204,0.456249,0.027692,0.002316,0.0,0.0,0.000238,0.000102,0,0.000681,...,0.000272,0.000272,0.00017,0.0,0.0,0.00017,0.005756,0.0,0.003031,0.0
Fe,2,0.00939,0.414257,0.000744,0.000172,0.0,0.000573,0.064357,0.0,0,0.001202,...,0.001889,0.000229,0.0,0.0,0.000344,0.000286,0.01231,0.0,0.001145,0.0
АУ,1,0.000279,0.370096,0.000791,0.000279,0.0,0.0,0.000326,0.0,0,0.001117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002559,0.0,0.000512,0.0
АУ,2,0.0,0.639126,0.001158,0.000715,0.0,0.0,0.007632,0.0,0,0.001192,...,0.0,0.001192,0.0,0.0,0.0,0.0,0.018568,0.0,0.001124,0.0
Биф,1,0.004556,0.153632,0.002847,0.001708,0.00019,0.000506,0.006581,0.0,0,0.0,...,0.006011,0.0,0.000823,0.0,0.000823,0.0,0.005062,0.0,0.001329,0.000253


Нормализация по всем таксонам

In [46]:
# Index the cleaned table by (group, number) and sort it, so every remaining
# column is one taxon and every row one sample.
df_clean_temp = df_clean.set_index(['group', 'number']).sort_index()

# L1-normalise every row so its values sum to 1 (per-sample shares).
# The result is placed in a new float frame rather than written back with
# ``.iloc[:, :] = ...``, which relied on implicit dtype upcasting (deprecated
# in pandas) and left all-zero columns with a different dtype.
# ``df_clean_temp`` is rebound to the normalised frame so that, as before,
# both names refer to the same normalised table.
df_clean_temp = pd.DataFrame(
    Normalizer(norm='l1').fit_transform(df_clean_temp),
    index=df_clean_temp.index,
    columns=df_clean_temp.columns,
)
df_clean_normalize = df_clean_temp
df_clean_normalize.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Arthrobacter,Galactobacter,Corynebacterium,Adlercreutzia,Gordonibacter,Paraeggerthella,Rubneribacter,Slackia,unclassified_Eggerthellaceae,Bacteroides,...,Turicimonas,Vampirovibrio,Enterobacter,Escherichia/Shigella,Klebsiella,unclassified_Enterobacteriaceae,Acinetobacter,Stenotrophomonas,unclassified_Bacteria,unclassified_Dothideomycetes
group,number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fe,1,0.0,0.0,0.0,0.0,0.0,0,0.000715,0.0,0.0,0.449504,...,0.000272,0.002316,0.001703,0.0,0.0,0.0,0.0,0.00017,0.00017,0.0
Fe,2,0.0,0.0,0.000286,0.0,0.000229,0,0.000458,0.0,0.0,0.408646,...,0.000229,0.000172,0.018551,0.0,0.000515,0.0,0.0,0.0,0.000286,0.0
АУ,1,0.0,0.0,0.000233,0.0,0.0,0,0.000372,0.0,0.0,0.339243,...,0.0,0.000279,0.001163,0.0,0.0,0.0,0.0,0.0,0.0,0.0
АУ,2,0.000341,0.0,0.0,0.0,0.0,0,0.000613,0.0,0.0,0.633334,...,0.001192,0.000715,0.005076,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Биф,1,0.0,0.0,0.000316,0.000443,0.00038,0,0.000506,0.0,0.0,0.145533,...,0.0,0.001708,0.030246,0.0,0.000759,0.0,0.0,0.000823,0.0,0.0


## 1 Альфа-разнообразие

$\alpha$-разнообразие описывает разнообразие внутри сообщества. Существует несколько различных индексов, используемых для расчета $\alpha$-разнообразия, поскольку разные индексы отражают разные аспекты разнообразия и имеют разную чувствительность к разным факторам. Эти индексы были разработаны для решения конкретных исследовательских вопросов, учета различных экологических или популяционных характеристик или выделения определенных аспектов разнообразия.

<img src="../img/alphadiversity_metrics.png" width=600>

### 1.1 Индекс разнообразия Шеннона

**Индекс разнообразия Шеннона** (Shannon, 1948) рассчитывается по формуле:

$$ H = - \sum_{i=1}^{S} p_i\log_{2}p_i $$

где $p_i$ – встречаемость вида $i$ относительно общего содержания всех особей всех видов, $S$ – количество изучаемых видов.

Индекс Шеннона суммирует количество информации о численности и видовом составе организмов, учитывая число видов и степень их доминирования. Индекс независим от биоценотического сходства сравниваемых сообществ и может быть вычислен для каждого ценоза в отдельности, что в медицинских исследованиях означает возможность его вычисления для отдельно взятого пациента. Индекс Шеннона отображает сложность структуры, основываясь на численности представленных видов: чем больше разнообразие, тем больше индекс.

In [13]:
def shennon_diversity(fracs: list):
    """Return the Shannon diversity index of relative abundances.

    Computes ``H = -sum(p_i * log2(p_i))`` over the given fractions, i.e. the
    base-2 formula stated in this notebook.

    Args:
        fracs: Relative abundances ``p_i`` of each taxon in one sample. Any
            sequence accepted by ``np.asarray`` works (list, array, Series).
            The values are used as given; they are not re-normalised.

    Returns:
        The index as a float. ``0.0`` for an empty input or when a single
        taxon holds the whole sample.

    Raises:
        ValueError: If any fraction is negative.
    """
    shares = np.asarray(fracs, dtype=float)
    if np.any(shares < 0):
        raise ValueError("fractions must be non-negative")

    # Absent taxa contribute nothing: p * log(p) -> 0 as p -> 0, and dropping
    # them avoids evaluating log2(0).
    present = shares[shares > 0]
    if present.size == 0:
        return 0.0

    entropy = -np.sum(present * np.log2(present))
    # Adding 0.0 turns a possible -0.0 (single-taxon sample) into 0.0.
    return float(entropy) + 0.0

### 1.2 Индекс Симпсона

**Индекс разнообразия Симпсона** имеет вид:

$$ C = \sum_{i=1}^{S} p_i^2 $$

где $p_i$ – встречаемость вида $i$ относительно общего содержания всех особей всех видов, $S$ – количество изучаемых видов.

Поскольку при возведении в квадрат малых отношений $p_i$ получаются очень малые величины, индекс Симпсона тем больше, чем сильнее доминирование одного или нескольких видов. Индекс Симпсона служит мерой связи числа степеней свободы внутривидовых и межвидовых взаимодействий. Его значение варьируется от 0 до 1, что значит бесконечное разнообразие или его отсутствие соответственно.

In [14]:
def simpson_diversity(fracs: list):
    """Return the Simpson index of relative abundances.

    Computes ``C = sum(p_i ** 2)`` over the given fractions, i.e. the formula
    stated in this notebook. In this form a larger value means stronger
    dominance: 1 corresponds to a single taxon holding the whole sample.

    Args:
        fracs: Relative abundances ``p_i`` of each taxon in one sample. Any
            sequence accepted by ``np.asarray`` works (list, array, Series).
            The values are used as given; they are not re-normalised.

    Returns:
        The index as a float; ``0.0`` for an empty input.

    Raises:
        ValueError: If any fraction is negative.
    """
    shares = np.asarray(fracs, dtype=float)
    if np.any(shares < 0):
        raise ValueError("fractions must be non-negative")
    return float(np.sum(np.square(shares)))

### 1.3 Индекс Чао

In [15]:
def chao_diversity(fracs: list) -> int:
    """Placeholder for the Chao index; not implemented yet.

    The argument is currently ignored and the function always returns 0.
    """
    # TODO: implement the index.
    # NOTE(review): Chao-type richness estimators are usually defined on raw
    # counts (numbers of singletons / doubletons), so relative fractions may
    # not be sufficient input — confirm the intended formula and input.
    return 0

## 2 Бета-разнообразие

$\beta$-разнообразие измеряет расстояние между двумя или более отдельными объектами. Таким образом, оно описывает разницу между двумя сообществами или экосистемами.

Для расчёта $\beta$-разнообразия используются несколько индексов, поскольку разные индексы подчёркивают различные аспекты композиционного сходства или различия между сообществами или участками.

Эти индексы были разработаны для решения конкретных исследовательских задач, работы с различными типами данных или для получения информации о различных аспектах $\beta$-разнообразия.

## Ресурсы:

- [Пример](https://7universum.com/ru/med/archive/item/13192)
- [Пример 2](https://training.galaxyproject.org/training-material/topics/microbiome/tutorials/diversity/tutorial.html)
- [Статья с анализом](https://vavilovj-icg.ru/download/12_Druzhinin.pdf)