# Imports

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report
from os import walk

# Preprocessing

In [66]:
def file_get_contents(filename): #Получаем контент файла
    with open(filename) as f:
        return f.read()

In [67]:
def getBacteriesData(content, bacteries): #Парсинг файла, сбор множества бактерий
	rows = content.split("\n")
	result = []
	skip = False
	for i in range(0, len(rows)):
		rows[i] = rows[i].split("\t")
		for j in range(0, len(rows[i])):
			rows[i][j] = rows[i][j].strip()
		if(len(rows[i]) < 4 or rows[i][3] not in ['D', 'P', 'C']):
			continue
		if(float(rows[i][0]) < 3.0):
			continue
		if skip and (rows[i][3] != 'D'):
			continue
		if(rows[i][3] == 'D'):
			if(rows[i][5] not in ['Bacteria', 'Archaea']):
				skip = True
			else:
				skip = False
		result.append({'name': rows[i][5], 'percentage': rows[i][0], 'count': rows[i][1]})
		bacteries.add(rows[i][5])
	return {'result' : result, 'bacteries' : bacteries}

In [68]:
def preproccess(info, key, bacteries): #Получение значений бактерий (key - либо count, либо percentage)
	result = []
	exist_bacteries = []
	for values in info:
		exist_bacteries.append(values['name'])

	for bactery in bacteries:
		if bactery not in exist_bacteries:
			result.append(0)
			continue
		for values in info:
			if values['name'] == bactery:
				result.append(float(values[key]))
				continue
				
	return result

In [69]:
def get_df(file_path, analyses_path, target=0):
	# Загружаем spreadsheet в объект pandas
	xl = pd.ExcelFile(file_path)

	#Преобразуем в DataFrame
	df = xl.parse('Sheet1')

	#Получаем масссив экспериментов
	s_runs = df["Run"].to_numpy()

	df.set_index('Run', inplace=True)

	df['target'] = pd.Series([target] * len(df['Age']), index=df.index)

	runs = next(walk(analyses_path), (None, None, []))[2]  # [] if no file

	bacteries = set() #Список бактерий

	runs_dict = []
	runs_id = []

	for run in s_runs:
		if run in runs:
			result = getBacteriesData(file_get_contents(analyses_path + run), bacteries)
			bacteries = result['bacteries']
			runs_dict.append(result['result'])
			runs_id.append(run)

	bacteries = list(bacteries)
	bacteries.sort()

	#for percentage
	df_bacteries = []

	for run_item in runs_dict:
		df_bacteries.append(preproccess(run_item, 'percentage', bacteries))

	df_cols = []

	for bactery in bacteries:
		df_cols.append(bactery + '_percentage')

	df_bacteries = pd.DataFrame(df_bacteries, columns = df_cols, index=runs_id)
	cols = df_bacteries.columns

	for column in cols:
		df[column] = df_bacteries[column]

	if 'Unnamed: 0' in df.columns:
		df.drop('Unnamed: 0', axis=1, inplace=True)

	return df

# Obesity

In [70]:
file_path = 'obesity.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\obesity\\' #Путь до исследований
df_obesity = get_df(file_path, analyses_path, 0)

In [71]:
file_path = 'health.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\health\\' #Путь до исследований
df_health = get_df(file_path, analyses_path, 1)

In [72]:
df_common = df_health.append(df_obesity).fillna(0)

  df_common = df_health.append(df_obesity).fillna(0)


## Tree classifier

In [73]:
feature_cols = list(df_common.columns)
feature_cols = [col for col in feature_cols if col != 'BioSample' and col != 'target' and col != 'Region']
X = df_common[feature_cols] # Features
y = df_common.target # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [74]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [75]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       125
           1       1.00      1.00      1.00        73

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198



## Data Groups

In [76]:
print('Возраст от 18 до 30, BMI от 25 до 29.9')

df_obesity_18_30 = df_obesity.query("Age >= 18 and Age <= 30 and BMI >= 25 and BMI <= 29.9")
print(df_obesity_18_30.describe())

Возраст от 18 до 30, BMI от 25 до 29.9
             Age        BMI  target  Actinobacteria_percentage  \
count  53.000000  53.000000    53.0                  53.000000   
mean   26.056604  26.913208     0.0                   6.342453   
std     3.078555   1.258515     0.0                   9.492084   
min    18.000000  25.000000     0.0                   0.000000   
25%    24.000000  26.000000     0.0                   0.000000   
50%    27.000000  26.400000     0.0                   0.000000   
75%    29.000000  27.400000     0.0                   9.050000   
max    30.000000  29.800000     0.0                  35.630000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    53.000000                            53.0   
mean                      8.633962                             0.0   
std                      10.428504                             0.0   
min                       0.000000                             0.0   
25%             

In [77]:
print('Возраст от 18 до 30, BMI от 30 до 39.9')

df_obesity_18_30 = df_obesity.query("Age >= 18 and Age <= 30 and BMI >= 30 and BMI <= 39.9")
print(df_obesity_18_30.describe())

Возраст от 18 до 30, BMI от 30 до 39.9
             Age        BMI  target  Actinobacteria_percentage  \
count  13.000000  13.000000    13.0                   13.00000   
mean   28.230769  33.103077     0.0                    1.18000   
std     1.363442   2.349796     0.0                    3.22993   
min    25.000000  30.690000     0.0                    0.00000   
25%    28.000000  31.500000     0.0                    0.00000   
50%    28.000000  32.400000     0.0                    0.00000   
75%    29.000000  33.550000     0.0                    0.00000   
max    30.000000  38.960000     0.0                   11.25000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    13.000000                            13.0   
mean                      3.089231                             0.0   
std                       7.132495                             0.0   
min                       0.000000                             0.0   
25%             

In [78]:
print('Возраст от 18 до 30, BMI от 40')

df_obesity_18_30 = df_obesity.query("Age >= 18 and Age <= 30 and BMI >= 40")
print(df_obesity_18_30.describe())

Возраст от 18 до 30, BMI от 40
             Age        BMI  target  Actinobacteria_percentage  \
count   4.000000   4.000000     4.0                        4.0   
mean   27.500000  47.887500     0.0                        0.0   
std     2.081666   5.348653     0.0                        0.0   
min    25.000000  41.370000     0.0                        0.0   
25%    26.500000  45.817500     0.0                        0.0   
50%    27.500000  47.880000     0.0                        0.0   
75%    28.500000  49.950000     0.0                        0.0   
max    30.000000  54.420000     0.0                        0.0   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                          4.0                             4.0   
mean                           0.0                             0.0   
std                            0.0                             0.0   
min                            0.0                             0.0   
25%                     

In [79]:
print('Возраст от 31 до 45, BMI от 25 до 29.9')

df_obesity_18_30 = df_obesity.query("Age >= 31 and Age <= 45 and BMI >= 25 and BMI <= 29.9")
print(df_obesity_18_30.describe())

Возраст от 31 до 45, BMI от 25 до 29.9
              Age         BMI  target  Actinobacteria_percentage  \
count  127.000000  127.000000   127.0                 127.000000   
mean    38.267717   26.740630     0.0                   6.024882   
std      4.667430    1.285127     0.0                   7.892459   
min     31.000000   25.000000     0.0                   0.000000   
25%     34.000000   26.000000     0.0                   0.000000   
50%     38.000000   26.000000     0.0                   3.340000   
75%     43.000000   27.650000     0.0                   9.800000   
max     45.000000   29.900000     0.0                  33.280000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                   127.000000                      127.000000   
mean                      8.704173                        0.031181   
std                       9.794606                        0.351393   
min                       0.000000                        0.000000  

In [80]:
print('Возраст от 31 до 45, BMI от 30 до 39.9')

df_obesity_18_30 = df_obesity.query("Age >= 31 and Age <= 45 and BMI >= 30 and BMI <= 39.9")
print(df_obesity_18_30.describe())

Возраст от 31 до 45, BMI от 30 до 39.9
             Age        BMI  target  Actinobacteria_percentage  \
count  70.000000  70.000000    70.0                  70.000000   
mean   38.428571  33.493429     0.0                   2.001286   
std     4.732164   2.745087     0.0                   6.191326   
min    31.000000  30.000000     0.0                   0.000000   
25%    34.000000  31.147500     0.0                   0.000000   
50%    39.000000  32.980000     0.0                   0.000000   
75%    43.000000  35.392500     0.0                   0.000000   
max    45.000000  39.900000     0.0                  44.530000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    70.000000                            70.0   
mean                      4.397143                             0.0   
std                       8.486619                             0.0   
min                       0.000000                             0.0   
25%             

In [81]:
print('Возраст от 31 до 45, BMI от 40')

df_obesity_18_30 = df_obesity.query("Age >= 31 and Age <= 45 and BMI >= 40")
print(df_obesity_18_30.describe())

Возраст от 31 до 45, BMI от 40
             Age        BMI  target  Actinobacteria_percentage  \
count  30.000000  30.000000    30.0                  30.000000   
mean   36.500000  46.155000     0.0                   0.608667   
std     4.216143   5.405695     0.0                   1.934703   
min    31.000000  40.150000     0.0                   0.000000   
25%    32.000000  42.112500     0.0                   0.000000   
50%    35.000000  45.050000     0.0                   0.000000   
75%    40.000000  48.662500     0.0                   0.000000   
max    44.000000  64.500000     0.0                   8.030000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    30.000000                            30.0   
mean                      0.680667                             0.0   
std                       2.175470                             0.0   
min                       0.000000                             0.0   
25%                     

In [82]:
print('Возраст от 46 до 65, BMI от 25 до 29.9')

df_obesity_18_30 = df_obesity.query("Age >= 46 and Age <= 65 and BMI >= 25 and BMI <= 29.9")
print(df_obesity_18_30.describe())

Возраст от 46 до 65, BMI от 25 до 29.9
             Age        BMI  target  Actinobacteria_percentage  \
count  46.000000  46.000000    46.0                   46.00000   
mean   52.630435  26.811522     0.0                    3.52913   
std     4.458992   1.224843     0.0                    5.38833   
min    46.000000  25.100000     0.0                    0.00000   
25%    49.000000  26.000000     0.0                    0.00000   
50%    53.500000  26.300000     0.0                    0.00000   
75%    56.000000  27.675000     0.0                    6.31250   
max    62.000000  29.700000     0.0                   21.07000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    46.000000                       46.000000   
mean                      6.224348                        0.143478   
std                       6.728965                        0.683150   
min                       0.000000                        0.000000   
25%             

In [83]:
print('Возраст от 46 до 65, BMI от 30 до 39.9')

df_obesity_18_30 = df_obesity.query("Age >= 46 and Age <= 65 and BMI >= 30 and BMI <= 39.9")
print(df_obesity_18_30.describe())


Возраст от 46 до 65, BMI от 30 до 39.9
             Age        BMI  target  Actinobacteria_percentage  \
count  44.000000  44.000000    44.0                  44.000000   
mean   53.136364  33.061364     0.0                   1.838636   
std     4.618421   2.109691     0.0                   3.809872   
min    46.000000  30.200000     0.0                   0.000000   
25%    49.750000  31.475000     0.0                   0.000000   
50%    53.000000  32.900000     0.0                   0.000000   
75%    56.000000  33.975000     0.0                   3.147500   
max    64.000000  38.200000     0.0                  16.900000   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                    44.000000                            44.0   
mean                      6.029773                             0.0   
std                       8.349475                             0.0   
min                       0.000000                             0.0   
25%             

In [84]:
print('Возраст от 46 до 65, BMI от 40')

df_obesity_18_30 = df_obesity.query("Age >= 46 and Age <= 65 and BMI >= 40")
print(df_obesity_18_30.describe())

Возраст от 46 до 65, BMI от 40
        Age   BMI  target  Actinobacteria_percentage  \
count   1.0   1.0     1.0                        1.0   
mean   50.0  47.2     0.0                        0.0   
std     NaN   NaN     NaN                        NaN   
min    50.0  47.2     0.0                        0.0   
25%    50.0  47.2     0.0                        0.0   
50%    50.0  47.2     0.0                        0.0   
75%    50.0  47.2     0.0                        0.0   
max    50.0  47.2     0.0                        0.0   

       Actinobacteriota_percentage  Alphaproteobacteria_percentage  \
count                          1.0                             1.0   
mean                           0.0                             0.0   
std                            NaN                             NaN   
min                            0.0                             0.0   
25%                            0.0                             0.0   
50%                            0.0          

# Diabetes

In [85]:
file_path = 'diabetes.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\diabetes\\' #Путь до исследований
df_diabetes = get_df(file_path, analyses_path, 0)

In [86]:
file_path = 'health.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\health\\' #Путь до исследований
df_health = get_df(file_path, analyses_path, 1)

In [87]:
df_common = df_health.append(df_diabetes).fillna(0)

  df_common = df_health.append(df_diabetes).fillna(0)


## Tree classifier

In [88]:
feature_cols = list(df_common.columns)
feature_cols = [col for col in feature_cols if col != 'BioSample' and col != 'target' and col != 'Region']
X = df_common[feature_cols] # Features
y = df_common.target # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9007633587786259
              precision    recall  f1-score   support

           0       0.93      0.80      0.86        51
           1       0.89      0.96      0.92        80

    accuracy                           0.90       131
   macro avg       0.91      0.88      0.89       131
weighted avg       0.90      0.90      0.90       131



# All classes

In [89]:
file_path = 'diabetes.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\diabetes\\' #Путь до исследований
df_diabetes = get_df(file_path, analyses_path, 0)

file_path = 'health.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\health\\' #Путь до исследований
df_health = get_df(file_path, analyses_path, 1)

file_path = 'obesity.xlsx'
analyses_path = 'C:\\Users\\admin1\\Documents\\Микробиом\\obesity\\' #Путь до исследований
df_obesity = get_df(file_path, analyses_path, 2)

df_common = df_diabetes.append(df_obesity).append(df_health).fillna(0)

  df_common = df_diabetes.append(df_obesity).append(df_health).fillna(0)
  df_common = df_diabetes.append(df_obesity).append(df_health).fillna(0)


## Tree classifier

In [90]:
feature_cols = list(df_common.columns)
feature_cols = [col for col in feature_cols if col != 'BioSample' and col != 'target' and col != 'Region']
X = df_common[feature_cols] # Features
y = df_common.target # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8467741935483871
              precision    recall  f1-score   support

           0       0.72      0.44      0.55        52
           1       0.91      0.94      0.92        82
           2       0.84      0.96      0.90       114

    accuracy                           0.85       248
   macro avg       0.82      0.78      0.79       248
weighted avg       0.84      0.85      0.83       248



## Tree classifier by Region

List of regions

In [91]:
regions = np.unique(df_common['Region'].to_numpy()).tolist()
regions

['Africa',
 'America',
 'Asia',
 'Australia',
 'Europe',
 'South America',
 'Unknown']

loop over regions

In [92]:
def classifyByRegion(df_common):
    feature_cols = list(df_common.columns)
    feature_cols = [col for col in feature_cols if col != 'BioSample' and col != 'target' and col != 'Region']
    X = df_common[feature_cols] # Features
    y = df_common.target # Target variable
    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
    # Create Decision Tree classifer object
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

    # Train Decision Tree Classifer
    clf = clf.fit(X_train,y_train)

    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [93]:
for region in regions:
    df_region = df_common.query('Region == @region')
    if(df_region.shape[0] < 10):
        continue
    print(region)
    classifyByRegion(df_region)

Africa
Accuracy: 1.0
              precision    recall  f1-score   support

           2       1.00      1.00      1.00         7

    accuracy                           1.00         7
   macro avg       1.00      1.00      1.00         7
weighted avg       1.00      1.00      1.00         7

America
Accuracy: 0.8809523809523809
              precision    recall  f1-score   support

           0       1.00      0.81      0.90        27
           2       0.75      1.00      0.86        15

    accuracy                           0.88        42
   macro avg       0.88      0.91      0.88        42
weighted avg       0.91      0.88      0.88        42

Asia
Accuracy: 0.9090909090909091
              precision    recall  f1-score   support

           0       0.93      0.65      0.76        20
           1       0.84      0.96      0.90        27
           2       0.95      1.00      0.98        41

    accuracy                           0.91        88
   macro avg       0.91      0.87   