# Machine Learning with the CF data.

Can we predict FEV1 directly or, more likely, the FEV1 class?

In [1]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.colors import ListedColormap
import matplotlib.dates as mdates
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress


# sklearn's StandardScaler emits a FutureWarning that is really annoying. This silences it.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

try:
  import google.colab
  IN_COLAB = True
  !pip install adjustText
  from google.colab import drive
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  IN_COLAB = False
  datadir = '..'

from adjustText import adjust_text

In [2]:
import cf_analysis_lib
# Analysis configuration: which sequencing data set and taxonomic rank to load.
sequence_type = "MGI"
tax = 'genus'
# NOTE: `datadir` is intentionally NOT reassigned here. It is chosen by the
# Colab/local detection in the first cell; overwriting it with '..' made the
# notebook ignore the Google Drive location when running on Colab.
pathogens = cf_analysis_lib.pathogens

metadata = cf_analysis_lib.read_metadata(datadir, sequence_type)

# Transpose the taxonomy table and name its index 'NAME' so that it can be
# joined to the metadata on the index.
# NOTE(review): presumably rows are samples after the transpose - confirm.
df = cf_analysis_lib.read_taxonomy(datadir, sequence_type, tax)
df = df.T
df = df.rename_axis('NAME')
df

In [3]:
# prompt: I have df with the counts of different bacteria, and metadata with the different data. How do I build a machine learning model to identify which different bacteria from df are best at predicting the values in FEV1_RATIO_SCORE in metadata? Both my data frames have a column called Pseudomonas, and I want to call the one from metadata "Culture Pseudomonas"

# Random forest regression of FEV1_RATIO_SCORE on every bacterial column.
#
# Only samples that actually have a FEV1_RATIO_SCORE are kept. Filling a missing
# target with the mean would invent labels and put them in both the training
# and the test set, which distorts the model and the reported error.
merged_df = df.join(metadata[['FEV1_RATIO_SCORE']].dropna(), how='inner')

# Features: all bacterial columns. Target: the FEV1 ratio score.
X = merged_df.drop('FEV1_RATIO_SCORE', axis=1)
y = merged_df['FEV1_RATIO_SCORE']

# Handle missing feature values (if any) - replace with the column mean for simplicity
X = X.fillna(X.mean())

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForestRegressor model
model = RandomForestRegressor(random_state=42, n_estimators=100)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Make predictions on the held-out test set and evaluate them
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Impurity-based feature importance, most important first.
# `feature_importances_sorted` is consumed by the plotting cell that follows.
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances_sorted = feature_importances.sort_values(ascending=False)
print("\nFeature Importance:")
feature_importances_sorted

In [4]:
# Lollipop chart of the 20 highest-ranked features from the previous cell.
# The selection is reversed so that the least important of the 20 is drawn first.
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
top_20_features = feature_importances_sorted.head(20)[::-1]

# Shared drawing styles: a blue dot at the importance value and a light blue
# dotted stem running from zero to that dot.
dot_style = dict(linestyle='dotted', marker='o', markersize=5, c='blue')
stem_style = dict(linestyle='dotted', marker='None', markersize=5, c='lightblue')

for name, score in zip(top_20_features.index, top_20_features.to_numpy()):
    plt.plot([score], [name], **dot_style)
    plt.plot([0, score], [name, name], **stem_style)

plt.xlabel("Importance")
plt.ylabel(f"Bacterial {tax}")
plt.title(f"Top 20 Bacterial {tax} that predict FEV1 Ratio Score")
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()

## Predict FEV1 Ratio Score Using Pathogenic Bacteria

In [5]:
# Random forest regression of FEV1_RATIO_SCORE on the pathogen columns only,
# followed by a lollipop chart of the impurity-based feature importances.
#
# Only samples that actually have a FEV1_RATIO_SCORE are kept; mean-imputing the
# target would fabricate labels for unmeasured samples.
merged_df = df.join(metadata[['FEV1_RATIO_SCORE']].dropna(), how='inner')

# this data set models just the pathogens
X = merged_df[list(cf_analysis_lib.pathogens)]  # Features (bacteria counts)
y = merged_df['FEV1_RATIO_SCORE']  # Target variable

# Handle missing feature values (if any) - replace with the column mean for simplicity
X = X.fillna(X.mean())

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForestRegressor model
model = RandomForestRegressor(random_state=42, n_estimators=100)  # You can adjust hyperparameters
model.fit(X_train, y_train)

# Make predictions on the test set and evaluate them
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importance, most important first
feature_importances = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['importance'])
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

plt.figure(figsize=(10, 6))  # Adjust figure size as needed
# One dot per pathogen with a dotted stem from zero; iterating in reverse draws
# the least important feature first.
for feature in feature_importances_sorted.index[::-1]:
    importance = feature_importances_sorted.loc[feature, 'importance']
    plt.plot([importance], [feature], linestyle='dotted', marker='o', markersize=5, c='blue')
    plt.plot([0, importance], [feature, feature], linestyle='dotted', marker='None', markersize=5, c='lightblue')

plt.xlabel("Importance")
plt.ylabel(f"Bacterial {tax}")
# Every pathogen is plotted (there is no top-20 cut-off), so the title says so.
plt.title(f"Pathogenic {tax} that predict FEV1 Ratio Score")
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()

# Predict FEV1/best FEV1 using all bacteria and using pathogenic bacteria

In [6]:
# Predict 'FEV1/best FEV1' with a random forest regressor twice - once from all
# bacterial columns and once from the pathogen columns only - and show the
# impurity-based feature importances of both models side by side.
intcol = 'FEV1/best FEV1'  # interesting column!

# Keep only samples where the target was measured. Mean-imputing the target
# would fabricate labels and leak them into the test set.
merged_df = df.join(metadata[[intcol]].dropna(), how='inner')
y = merged_df[intcol]  # Target variable (shared by both models)

# (message label, feature table, number of features to plot or None for all, panel title)
feature_sets = [
    ("all bacteria", merged_df.drop(intcol, axis=1), 20,
     f"Top 20 Bacteria (all) that predict {intcol}"),
    ("pathogenic bacteria", merged_df[list(pathogens)], None,
     f"Pathogenic {tax} that predict {intcol}"),
]

fig, axes = plt.subplots(figsize=(10, 6), nrows=1, ncols=2)
# The same estimator object is refit for each feature set.
model = RandomForestRegressor(random_state=42, n_estimators=1000)

for ax, (label, X, top_n, title) in zip(axes, feature_sets):
    # Handle missing feature values (if any) - replace with the column mean for simplicity
    X = X.fillna(X.mean())

    # Split the data into training and testing sets (80% / 20%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {label}: {mse}")

    # Feature importance, most important first
    feature_importances = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['importance'])
    feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)
    shown = feature_importances_sorted if top_n is None else feature_importances_sorted.head(top_n)

    # One dot per feature with a dotted stem from zero; iterating in reverse
    # draws the least important feature first.
    for feature in shown.index[::-1]:
        importance = shown.loc[feature, 'importance']
        ax.plot([importance], [feature], linestyle='dotted', marker='o', markersize=5, c='blue')
        ax.plot([0, importance], [feature, feature], linestyle='dotted', marker='None', markersize=5, c='lightblue')

    ax.set_xlabel("Importance")
    ax.set_ylabel(f"Bacterial {tax}")
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [7]:
# Show every metadata column name in a single string, separated by "', '",
# which is convenient for pasting into a Python list literal.
"', '".join(metadata.columns.tolist())

In [8]:
# For every metadata column of interest, fit a random forest on the pathogen
# columns and plot the permutation importance of each pathogen in its own panel.
#
# The estimator and the permutation-importance scorer are chosen per column:
#   * text (object dtype) columns are label-encoded and modelled with a
#     RandomForestClassifier, scored by accuracy;
#   * all other columns are modelled with a RandomForestRegressor, scored by
#     negative mean squared error.
# Scoring a regressor with 'accuracy' does not work: the accuracy metric
# rejects continuous predictions.
allintcols = ['IP vs OP', 'Hospital', 'Room', 'Age', 'Age groups', 'Paediatric vs Adult', 'Gender',
              'H2_Corrected', 'CH4_Corrected', 'CH4/H2 ratio_corrected', 'NTM', 'Pseudomonas Culture',
              'IgE', 'Spec IgE', 'Spec IgG', 'Precipitins', 'FVC', 'FEV1', 'Best FEV1', 'FEV1/best FEV1',
              'FEV1_RATIO_SCORE', 'FEV1_Obstruction_Rank', 'Cystic Fibrosis related diabetes (CFRD)',
              'Pancreatic insufficiency (PI)']

# Four panels per row, with as many rows as the column list needs.
n_panel_cols = 4
n_panel_rows = -(-len(allintcols) // n_panel_cols)  # ceiling division
fig, axes = plt.subplots(figsize=(20, 40), nrows=n_panel_rows, ncols=n_panel_cols, squeeze=False)
panels = axes.ravel()

for ax, intcol in zip(panels, allintcols):
  # Drop samples without a value BEFORE encoding, so a missing value is never
  # turned into a class of its own.
  target = metadata[intcol].dropna()
  is_categorical = target.dtype == 'object'

  if is_categorical:
    # astype(str) lets the encoder cope with object columns holding mixed types
    encoded = LabelEncoder().fit_transform(target.astype(str))
    target = pd.Series(encoded, index=target.index, name=intcol)
    model = RandomForestClassifier(random_state=42, n_estimators=1000)
    scoring = 'accuracy'
  else:
    model = RandomForestRegressor(random_state=42, n_estimators=1000)
    scoring = 'neg_mean_squared_error'

  merged_df = df.join(target, how='inner')

  # this data set models just the pathogens
  X = merged_df[list(pathogens)]  # Features (bacteria counts)
  y = merged_df[intcol]  # Target variable

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  # model.score() is mean accuracy for a classifier but R^2 for a regressor,
  # so it is reported under the matching name.
  score = model.score(X_test, y_test)
  if is_categorical:
    print(f"Model Accuracy for using pathogenic bacteria to predict {intcol}: {score:.2f}")
    panel_metric = f"accuracy: {score:.3f}"
  else:
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model R^2 for using pathogenic bacteria to predict {intcol}: {score:.2f}")
    print(f"Mean Squared Error for using pathogenic bacteria to predict {intcol}: {mse:.2f}")
    panel_metric = f"MSE: {mse:.3f}"

  # Permutation importance on the held-out data (seeded so reruns agree)
  pim = permutation_importance(model, X_test, y_test, scoring=scoring, random_state=42)
  feature_importances = pd.DataFrame(pim.importances_mean, index=X.columns, columns=['importance'])
  feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

  # Create dotted lines and circles for (at most) the 20 highest-ranked features
  for feature in feature_importances_sorted.index[:20][::-1]:
    importance = feature_importances_sorted.loc[feature, 'importance']
    ax.plot([importance], [feature], linestyle='dotted', marker='o', markersize=5, c='blue')
    ax.plot([0, importance], [feature, feature], linestyle='dotted', marker='None', markersize=5, c='lightblue')

  ax.set_xlabel(f"Importance ({scoring})")
  ax.set_ylabel(f"Bacterial {tax}")
  ax.set_title(f"{intcol} ({panel_metric})")

# Hide any panels left over when the grid has more cells than columns to plot
for ax in panels[len(allintcols):]:
  ax.set_visible(False)

plt.tight_layout()
plt.show()


## Testing permutation importance using _Pseudomonas_

In [None]:
# Compare three feature-importance measures for a RandomForestRegressor that
# predicts the category code of 'Pseudomonas Culture' from the pathogen columns:
# the model's built-in importances, permutation importance scored by negative
# MSE, and permutation importance scored by d2_absolute_error_score.
intcol = 'Pseudomonas Culture'  # interesting column!

# Inner join: samples without a culture result are excluded. With a left join
# their missing category would become code -1 and be modelled as a real value.
merged_df = df.join(metadata[[intcol]].astype('category').dropna(), how='inner')

X = merged_df[list(pathogens)]
y = merged_df[intcol].cat.codes

# Handle missing feature values (if any) - replace with the column mean for simplicity
X = X.fillna(X.mean())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(random_state=42, n_estimators=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for pathogenic bacteria: {mse}")

# For a regressor, score() is the coefficient of determination, not accuracy
r2 = model.score(X_test, y_test)
print(f"Model R^2 for using pathogenic bacteria to predict {intcol}: {r2:.2f}")

# Permutation importances on the held-out data (seeded so reruns agree)
pim_mse = permutation_importance(model, X_test, y_test, scoring='neg_mean_squared_error', random_state=42)
pim = permutation_importance(model, X_test, y_test, scoring='d2_absolute_error_score', random_state=42)

# (panel title, importance values, whether the stems start at the smallest value)
importance_panels = [
    ("model.feature_importances_", model.feature_importances_, False),
    ("PI: neg mse", pim_mse.importances_mean, True),
    ("PI: d2_absolute_error_score", pim.importances_mean, False),
]

fig, axes = plt.subplots(figsize=(10, 6), nrows=1, ncols=3)

for ax, (title, values, stem_from_min) in zip(axes, importance_panels):
    feature_importances = pd.DataFrame(values, index=X.columns, columns=['importance'])
    feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)
    # Select the column by label; Series.min()[0] relied on positional
    # indexing of a label-indexed Series, which pandas has deprecated.
    stem_start = feature_importances_sorted['importance'].min() if stem_from_min else 0

    # Create dotted lines and circles for each feature
    for feature in feature_importances_sorted.index[::-1]:
        importance = feature_importances_sorted.loc[feature, 'importance']
        ax.plot([importance], [feature], linestyle='dotted', marker='o', markersize=5, c='blue')
        ax.plot([stem_start, importance], [feature, feature], linestyle='dotted', marker='None', markersize=5, c='lightblue')

    ax.set_xlabel("Importance")
    ax.set_ylabel(f"Bacterial {tax}")
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Random Forest Classifier

The analyses above use a _REGRESSOR_, not a classifier. Here the _Pseudomonas_ test is repeated with a `RandomForestClassifier`.

In [9]:
# Compare three feature-importance measures for a RandomForestClassifier that
# predicts 'Pseudomonas Culture' from the pathogen columns: the model's
# built-in importances, permutation importance scored by negative MSE, and
# permutation importance scored by accuracy.
intcol = 'Pseudomonas Culture'  # interesting column!

# Inner join: samples without a culture result are excluded, so no label has to
# be imputed. (Filling a categorical column with its mean raises a TypeError.)
merged_df = df.join(metadata[[intcol]].astype('category').dropna(), how='inner')

X = merged_df[list(pathogens)]
# Integer category codes are used as class labels so that the MSE-based
# measures below are computable whatever the original label type is.
y = merged_df[intcol].cat.codes.astype(int)

# Handle missing feature values (if any) - replace with the column mean for simplicity
X = X.fillna(X.mean())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42, n_estimators=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model (the MSE is computed on the integer class codes)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error for pathogenic bacteria: {mse}")

accuracy = model.score(X_test, y_test)
print(f"Model Accuracy for using pathogenic bacteria to predict {intcol}: {accuracy:.2f}")

# Permutation importances on the held-out data (seeded so reruns agree)
pim_mse = permutation_importance(model, X_test, y_test, scoring='neg_mean_squared_error', random_state=42)
pim = permutation_importance(model, X_test, y_test, scoring='accuracy', random_state=42)

# (panel title, importance values, whether the stems start at the smallest value)
importance_panels = [
    ("model.feature_importances_", model.feature_importances_, False),
    ("PI: neg mse", pim_mse.importances_mean, True),
    ("PI: accuracy", pim.importances_mean, False),
]

fig, axes = plt.subplots(figsize=(10, 6), nrows=1, ncols=3)

for ax, (title, values, stem_from_min) in zip(axes, importance_panels):
    feature_importances = pd.DataFrame(values, index=X.columns, columns=['importance'])
    feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)
    # Select the column by label; Series.min()[0] relied on positional
    # indexing of a label-indexed Series, which pandas has deprecated.
    stem_start = feature_importances_sorted['importance'].min() if stem_from_min else 0

    # Create dotted lines and circles for each feature
    for feature in feature_importances_sorted.index[::-1]:
        importance = feature_importances_sorted.loc[feature, 'importance']
        ax.plot([importance], [feature], linestyle='dotted', marker='o', markersize=5, c='blue')
        ax.plot([stem_start, importance], [feature, feature], linestyle='dotted', marker='None', markersize=5, c='lightblue')

    ax.set_xlabel("Importance")
    ax.set_ylabel(f"Bacterial {tax}")
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Debugging aid: print the distinct values of the target, then its dtype.
for summary in (y.unique(), y.dtypes):
    print(summary)

In [None]:
# Debugging aid: display the current target `y` as the cell output.
y

In [None]:
# Debugging aid: display the most recently computed `feature_importances`.
feature_importances