In [3]:
import pandas as pd
import os
import csv
import math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
%run feature_reductions.ipynb
%run join_and_normalize.ipynb
%run modeling.ipynb

In [15]:
# Label name is 'Group' for the HIV dataset

def run_entire_model(osu_files,
                     tax_files,
                     meta_file,
                     norm_type,
                     ncomp,
                     levelup,
                     groups,
                     feat_reduction,
                     test_frac,
                     model,
                     plot_pca,
                     plot_lc,
                     plot_ab_comp,
                     cutoff,
                     scorer):
    """Run the pipeline end to end: load, reduce features, split, fit, plot.

    The helpers called here (``join_osus``, ``SVD_truncate``, ``opt_log_reg``,
    ``PCA``, ``train_test_split``, ...) are not defined in this cell; they are
    presumably brought into the namespace by the ``%run`` notebooks -- TODO
    confirm.

    Parameters
    ----------
    osu_files : passed to ``join_osus`` together with ``norm_type``.
    tax_files : passed to ``join_taxonomy``.
    meta_file : passed to ``get_labels``.
    norm_type : forwarded to the joining, level-up, feature-reduction and
        abundance-plot helpers.
    ncomp : forwarded to the feature-reduction helpers.
    levelup : when not ``None``, the selected table is re-aggregated with
        ``make_df_up_level(pair_df, tax_df, levelup, norm_type)``.
    groups : passed to ``select_groups``.
    feat_reduction : one of ``'svd'``, ``'zscore'``, ``'corr'``, ``'diff'``
        or ``None`` (no reduction, uses ``make_dataset``).
    test_frac : ``test_size`` for the train/test split and for the
        learning-curve ``ShuffleSplit``.
    model : ``'lg'``, ``'rf'``, ``'xg'`` or ``'all'``. Any other value skips
        the model-fitting step without raising.
    plot_pca : when truthy, show a 2-D scatter of the samples (first two SVD
        columns for ``'svd'``, otherwise a 2-component PCA of ``X``).
    plot_lc : when truthy, show a logistic-regression learning curve.
    plot_ab_comp : when truthy, call ``abundance_comparison`` on the selected
        features (only when features were returned and there are at most 20).
    cutoff : forwarded to the feature-reduction helpers.
    scorer : forwarded to the ``opt_*`` model helpers.

    Returns
    -------
    The group-selected (and optionally levelled-up) table that the features
    were built from.

    Raises
    ------
    ValueError
        If ``feat_reduction`` is not one of the supported values.
    """
    # Fail fast: previously an unknown value fell through every branch and
    # surfaced as an UnboundLocalError only after all the data was loaded.
    supported_reductions = ('svd', 'zscore', 'corr', 'diff', None)
    if feat_reduction not in supported_reductions:
        raise ValueError(
            "Unknown feat_reduction %r; expected one of %r"
            % (feat_reduction, supported_reductions)
        )

    osu_df = join_osus(osu_files, norm_type)
    meta_df = get_labels(meta_file)
    osu_df = join_osu_with_labels(osu_df, meta_df)
    tax_df = join_taxonomy(tax_files)

    pair_df = select_groups(osu_df, groups)
    if levelup is not None:
        pair_df = make_df_up_level(pair_df, tax_df, levelup, norm_type)

    an_df = pair_df

    # SVD does not report selected features, so `feats` stays None there.
    feats = None
    if feat_reduction == 'svd':
        X, Y, labels = SVD_truncate(pair_df, ncomp, cutoff)
    elif feat_reduction == 'zscore':
        plot_cutoff = 0.3
        X, Y, labels, feats = make_dataset_zscore(pair_df, ncomp, cutoff,
                                                  plot_cutoff, norm_type)
    elif feat_reduction == 'corr':
        X, Y, labels, feats = feature_from_correlation(pair_df, ncomp, cutoff,
                                                       norm_type)
    elif feat_reduction == 'diff':
        X, Y, labels, feats = make_dataset_osu_diff(pair_df, ncomp, cutoff)
    else:
        X, Y, labels, feats = make_dataset(pair_df, cutoff)

    print("Comparing the following groups:", labels)

    # Identity checks on None: `feats != None` is evaluated element-wise for
    # array-like objects, which makes the `if` itself raise.
    if feats is not None and len(feats) < 50:
        print("Top features:", feats)

    if plot_pca:
        if feat_reduction == 'svd':
            X1 = X[:, 0]
            X2 = X[:, 1]
        else:
            X_PCA = PCA(n_components=2, random_state=42).fit_transform(np.array(X))
            X1 = X_PCA[:, 0]
            X2 = X_PCA[:, 1]
        # Label 0 is drawn in red, every other label in black.
        colors = np.where(np.array(Y) == 0, 'r', 'k')
        plt.figure(figsize=(6, 6))
        plt.scatter(X1, X2, c=colors)
        plt.show()

    seed = 20
    X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                        test_size=test_frac,
                                                        random_state=seed)

    # 'all' runs the three models in a fixed order. Helper names are looked
    # up only for the models actually selected, so a missing optional helper
    # does not break the other choices.
    selected_models = ['lg', 'rf', 'xg'] if model == 'all' else [model]
    for model_key in selected_models:
        if model_key == 'lg':
            model_title, run_model = 'Logistic Regression', opt_log_reg
        elif model_key == 'rf':
            model_title, run_model = 'Random Forest', opt_random_forest
        elif model_key == 'xg':
            model_title, run_model = 'XG Boost', opt_xgboost
        else:
            continue
        print('-'*50)
        print(model_title)
        run_model(X_train, y_train, X_test, y_test, labels, scorer)

    if plot_ab_comp:
        if feats is None:
            print("Plotting abundance only applicable when zscore, diff, corr feature reduction used.")
        elif len(feats) > 20:
            print("Too many features to plot effectively.")
        else:
            abundance_comparison(an_df, feats, norm_type)

    if plot_lc:
        title = "Learning Curve for Logistic Regression"
        # Cross validation with 100 iterations to get smoother mean test and
        # train score curves, each time holding out `test_frac` of the data
        # as a validation set.
        cv = ShuffleSplit(n_splits=100, test_size=test_frac, random_state=0)

        estimator = LogisticRegression(random_state=42, solver='liblinear')
        plot_learning_curve(estimator, title, X, Y, ylim=(0.0, 1.01), cv=cv, n_jobs=10)

        plt.show()

    return an_df