In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from biom import load_table
from sklearn.preprocessing import LabelEncoder

In [53]:
# Step 1: Data Preprocessing
biom = load_table('../snakemake/data/Combined_12datasets/12datasets_1258/all_1258.biom')
biom_table = biom.to_dataframe()
biom_table = biom_table.T
metadata = pd.read_csv('../snakemake/data/Combined_12datasets/12datasets_1258/age_sorted/6groups/metadata_1258_age_sorted_6G.txt',
                      sep = '\t')
# Extract feature matrix from biom table
features = biom_table.iloc[:, :].values

# Encode disease classes
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(metadata['disease'])

# Step 2: Feature Engineering (if necessary)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 4: XGBoost Classifier Training
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
# Step 5: Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.4166666666666667
                   precision    recall  f1-score   support

               AD       0.12      0.08      0.10        13
              ASD       0.54      0.58      0.56        26
               CD       0.50      0.15      0.24        13
    HealthyAdults       0.50      0.75      0.60        24
  HealthyChildren       0.65      0.46      0.54        24
HealthyMiddleAged       0.47      0.47      0.47        30
   HealthySeniors       0.45      0.64      0.53        28
  HealthyToddlers       0.59      0.76      0.67        21
     HealthyYouth       0.29      0.40      0.33        15
               MS       0.00      0.00      0.00         5
          Obesity       0.00      0.00      0.00         6
               PD       0.00      0.00      0.00         3
    Schizophrenia       0.14      0.06      0.08        17
              T1D       0.75      0.38      0.50         8
              T2D       0.00      0.00      0.00        11
               UC       0.

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Data Preprocessing
biom = load_table('../snakemake/data/Combined_12datasets/12datasets_1258/all_1258.biom')
biom_table = biom.to_dataframe()
biom_table = biom_table.T
metadata = pd.read_csv('../snakemake/data/Combined_12datasets/12datasets_1258/age_sorted/6groups/metadata_1258_age_sorted_6G.txt',
                      sep = '\t')
# Extract feature matrix from biom table
features = biom_table.iloc[:, :].values

# Encode disease classes
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(metadata['disease'])

# Step 2: Feature Engineering (if necessary)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 3: Gradient Boosting Classifier Training
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

# Step 4: Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.29365079365079366
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.41      0.46      0.44        26
           2       0.12      0.08      0.10        13
           3       0.41      0.50      0.45        24
           4       0.50      0.42      0.45        24
           5       0.33      0.30      0.32        30
           6       0.26      0.36      0.30        28
           7       0.56      0.43      0.49        21
           8       0.55      0.40      0.46        15
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         6
          11       0.00      0.00      0.00         3
          12       0.08      0.06      0.07        17
          13       0.44      0.50      0.47         8
          14       0.00      0.00      0.00        11
          15       0.00      0.00      0.00         8

    accuracy                           0.29       