In [None]:
%reset

import pandas as pd

# Load the three input tables; the first column of each file becomes the row index.
# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`
# (any run of whitespace separates fields).
# NOTE(review): the OTU and taxonomy tables are read from an absolute path while
# the clinical table uses a path relative to the working directory -- confirm
# that both resolve to the same data directory on the machine running this.
otu_table = pd.read_csv("/data/namlhs/omics-data-learners/data/metsim"
                        "/01_raw/clinical_data/formatted/OTUS.txt",
                        sep=r"\s+",
                        index_col=0)

tax_table = pd.read_csv("/data/namlhs/omics-data-learners/data/metsim"
                        "/01_raw/clinical_data/formatted/TAXTABLE.txt",
                        sep=r"\s+",
                        index_col=0)

pre_df = pd.read_csv('data/metsim/01_raw/clinical_data/formatted/FINAL_MICROBIOME_DATASET.csv',
                     index_col=0)

In [None]:
# Cases: every row with DMType == 2 and dm == 0 gets keep = 1, all others keep = 0.
# The mask is applied by position, so rows that share an index label are judged
# individually (a label-based `.loc[index, "keep"] = 1` would flag all of them).
is_case = (pre_df["DMType"] == 2) & (pre_df["dm"] == 0)
pre_df["keep"] = is_case.astype("int64")

# Keep only rows where "keep" is equal to 1
cases_df = pre_df.loc[pre_df["keep"] == 1]
patient = cases_df['METSIM_ID']

# Controls: rows whose METSIM_ID never appears among the cases.
control_pool = pre_df.loc[~pre_df['METSIM_ID'].isin(patient)]

# One row per METSIM_ID: the row with the lowest Time_Point.
# The result is indexed by METSIM_ID.
control_df = control_pool.groupby("METSIM_ID").apply(
    lambda visits: visits.sort_values("Time_Point").iloc[0]
)

# Discard controls whose selected row has DMType == 2 and dm == 1;
# every remaining control row carries keep = 1, like the cases.
is_excluded = (control_df["DMType"] == 2) & (control_df["dm"] == 1)
control_df = control_df.loc[~is_excluded].assign(keep=1)

# Cases first, then controls.
df = pd.concat([cases_df, control_df])

In [None]:
# Genus names matched against the 'Genus' column of the taxonomy table.
# Commented-out entries are currently disabled.
markers_genus = [
    # 'Akkermansia',
    # 'Oscillospira',
    'Bacteroides',
    # 'Clostridium',
    # 'Eggerthella',
    # 'Escherichia',
    # Spelling corrected from 'Faecalibaterium', which could never match a
    # correctly spelled genus entry.
    'Faecalibacterium',
    # NOTE(review): Firmicutes is a phylum name; it will only match if the
    # taxonomy table stores it in the 'Genus' column -- confirm.
    'Firmicutes',
    # 'Veillonella'
    ]
# Species names matched against the 'Species' column; all entries are disabled.
markers_species = [
    # 'vulgatus',
    # 'ovatus',
    # 'muciniphila',
    # 'prausnitzii',
    # 'salivarius',
    # 'coli'
    ]

In [None]:
import numpy as np

# Relative abundance: divide every row of the count table by that row's total.
otu_rel_table = otu_table.div(otu_table.sum(axis=1), axis=0)
# Row sums of the scaled table. The value is computed but not shown, because
# this is not the last expression of the cell.
otu_rel_table.sum(axis=1)

def create_array(start, end, prefix):
    """Build an array of labels made of a prefix followed by a running number.

    Parameters
    ----------
    start : int
        First number to append.
    end : int
        Last number to append (inclusive).
    prefix : str
        Text placed in front of each number.

    Returns
    -------
    numpy.ndarray
        ``prefix + str(i)`` for every ``i`` from ``start`` to ``end``.
    """
    labels = []
    for number in range(start, end + 1):
        labels.append(prefix + str(number))
    return np.array(labels)

# ASV labels used by the selection below. They are built unconditionally: the
# selection always needs them, so they must not depend on a `__main__` guard.
# NOTE(review): presumably ASVs are numbered by decreasing abundance, making
# ASV1..ASV15 the most abundant ones -- confirm against the OTU table.
top_abund = create_array(1, 15, "ASV")
lim_abund = create_array(1, 200, "ASV")
print(top_abund)
print(lim_abund)

# Select ASVs that are a marker species, a marker genus, or one of the first 15
# labels -- and in every case only if they are among the first 200 labels.
is_marker = (tax_table['Species'].isin(markers_species) |
             tax_table['Genus'].isin(markers_genus))
is_top = tax_table.index.isin(top_abund)
within_limit = tax_table.index.isin(lim_abund)
pick_otu = tax_table[(is_marker | is_top) & within_limit].index

# Relative abundances of the selected ASVs only.
otu_fil = otu_rel_table[pick_otu]
# Replace '_' with '.' in the row labels; regex=False makes it a literal replacement.
otu_fil.index = otu_fil.index.str.replace('_', '.', regex=False)

In [None]:
# Fraction of missing values in each column of df
sparse = df.isna().sum() / len(df)
display(sparse)

# Columns in which fewer than 20% of the values are missing
sparse_filtered = sparse.loc[sparse < 0.2]

# Display the filtered Series
print(sparse_filtered)

# Keep only those columns and use SampleID as the row index
df_filtered = df.loc[:, sparse_filtered.index].set_index('SampleID')
display(df_filtered)

In [None]:
# Columns carried forward: the DMType label followed by the selected covariates.
alt_columns = [
    'DMType',
    'Age', 'WHR', 'fmass',
    'BMI', 'Freq_veg', 'Freq_fruit',
    'Freq_leanfish',
    'Freq_strongwine', 'Freq_blend',
    'Freq_wine', 'Freq_alclt3', 'Freq_alclt6', 'Freq_alcge6', 'Freq_liqueur',
    'Milk',
    'Spread_no', 'Spread_marg',
    'Cookfat_sat', 'Cookfat_no', 'Cookfat_marg', 'Cookfat_oils',
    'Redmeat_gwk',
    'Cheese_freq', 'Cheese_g', 'Cheese_other',
    'Cereal_24_serv_wholegrain', 'Cereal_24_serv_pastry',
]
# Independent copy so later changes do not touch df_filtered.
alt_df = df_filtered.loc[:, alt_columns].copy()


In [None]:
# Merge the covariates with the selected ASV abundances on their row index
# (pd.merge defaults to an inner join, so only shared labels survive).
match_df = pd.merge(alt_df, otu_fil, left_index=True, right_index=True)

# meta_df = match_df.drop(columns=['dm', 'METSIM_ID', 'Time_Point'])
meta_df = match_df

# Pairwise Kendall correlation, flattened to one value per (column, column) pair.
df_cor = meta_df.corr(method='kendall')
df_pairs = df_cor.unstack()

# print(df_pairs)
sorted_pairs = df_pairs.sort_values(kind='quicksort')

# Strongly correlated pairs (|tau| >= 0.5). A column paired with itself is
# excluded by label rather than by testing the value against 1: that keeps
# distinct columns that are perfectly correlated, and still removes a diagonal
# entry that is not exactly 1.0.
first_col = sorted_pairs.index.get_level_values(0)
second_col = sorted_pairs.index.get_level_values(1)
is_self_pair = first_col == second_col
remove_pairs = sorted_pairs[(sorted_pairs.abs() >= 0.5) & ~is_self_pair].reset_index()

display(remove_pairs)

#Check the NaN values
print(meta_df.isnull().sum())

# Get the list of columns to drop
# Each pair appears in both orders, so this list holds both members of every pair.
drop_cols = list(remove_pairs['level_0'])

# NOTE(review): no column is actually dropped here (empty list), so chosen_df is
# a copy of meta_df and drop_cols is unused -- confirm this is intended.
chosen_df = meta_df.drop(columns=[])

#for combine data
alt_data = chosen_df.values

In [None]:
from sklearn.impute import KNNImputer

# Fill missing values from the 2 nearest rows, weighted by inverse distance.
# NOTE(review): alt_data still includes the DMType column and all rows, so the
# label takes part in the neighbour search and the imputer is fitted before the
# train/test split -- confirm this is acceptable.
imputer = KNNImputer(n_neighbors=2, weights="distance", metric='nan_euclidean')

array_imputed = imputer.fit_transform(alt_data)
#print(alt_df.columns)
df_imputed = pd.DataFrame(array_imputed, columns=chosen_df.columns)

#Check the NaN values
print(df_imputed.isnull().sum())

# Binarise the label: every positive DMType becomes 1.
# Done with a single .loc call; the chained form df['DMType'].loc[mask] = 1
# assigns through an intermediate object and is not guaranteed to update df_imputed.
df_imputed.loc[df_imputed['DMType'] > 0, 'DMType'] = 1
display(df_imputed)
# Shown explicitly; only the last expression of a cell is displayed automatically.
display(df_imputed['DMType'].value_counts())
df_imputed.dtypes

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

# Target: the DMType column; features: every other column.
y = df_imputed['DMType']
X = df_imputed.drop(columns=['DMType'])

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=777
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Scale
# Fit the min-max scaler on the training rows only, then apply it everywhere.
# NOTE(review): X_train and X_test are overwritten with their scaled versions,
# so re-running this cell refits the scaler on already-scaled data.
scaler = MinMaxScaler().fit(X_train)

X_scale = scaler.transform(X)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# One weight per training row: 0.2 where the label is 0, 0.8 where it is 1,
# and 0.0 for any other value.
# NOTE(review): these weights are not passed to any fit call in the visible cells.
sample_weights = np.where(y_train == 1, 0.8, np.where(y_train == 0, 0.2, 0.0))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import confusion_matrix, classification_report

# # creating a RF classifier
# clf = RandomForestClassifier(n_estimators = 1000,
#                              class_weight='balanced',
#                              max_depth=4,
#                              max_features=None,
#                              random_state=777)  
  
# # Training the model on the training dataset
# # fit function is used to train the model using the training sets as parameters
# clf.fit(X_train, y_train)
  
# # performing predictions on the test dataset
# y_pred = clf.predict(X_test)
  
# # metrics are used to find accuracy or error
# from sklearn import metrics

# # using metrics module for accuracy calculation
# print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

# print("Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Learning rates to compare
lr_list = [0.005, 0.01, 0.02, 0.05, 0.1, 0.25, 0.5, 1]

# Train one small gradient-boosting model per learning rate and report its accuracy.
# The score labelled "validation" is computed on X_test / y_test.
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=200,
        max_depth=2,
        random_state=0,
    ).fit(X_train, y_train)
    train_accuracy = gb_clf.score(X_train, y_train)
    test_accuracy = gb_clf.score(X_test, y_test)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(test_accuracy))

In [None]:
# import the metrics class
from sklearn.metrics import confusion_matrix, classification_report

# Final model: 200 boosting stages at learning rate 0.5, fitted on the training
# set and used to predict the held-out rows.
gb_clf2 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.5,
    max_features=200,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)
y_pred = gb_clf2.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Display names for label 0 and label 1, in that order.
target_names = ['without diabetes', 'with diabetes']
print("Classification Report")
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
from sklearn import metrics
# confusion matrix of the held-out predictions
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# Visualize
# import required modules
import numpy as np
import seaborn as sns

class_names = [0, 1]  # name of classes

# The figure background is read from rcParams when the figure is created, so it
# is set first; set afterwards it would only affect figures drawn later.
plt.rcParams['figure.facecolor'] = '#f2f2f2'
fig, ax = plt.subplots()

# create heatmap on the explicit axes; the heatmap sets its own tick positions
# and labels from the DataFrame's row and column labels.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Adjust the layout last so the title and axis labels are taken into account.
fig.tight_layout()