In [1]:
# No warnings
import warnings
warnings.filterwarnings('ignore') # Filter out warnings

# data analysis and wrangling
import pandas as pd
import numpy as np



Spot Check Algorithm - helps with finding appropriate models

In [12]:
# binary classification spot check script
import warnings
from numpy import mean
from numpy import std
from matplotlib import pyplot
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
 
 
 # create a dict of standard models to evaluate {name:object}
def define_models(models=dict()):
	# linear models
	models['logistic'] = LogisticRegression()
	alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
	for a in alpha:
		models['ridge-'+str(a)] = RidgeClassifier(alpha=a)
	models['sgd'] = SGDClassifier(max_iter=1000, tol=1e-3)
	models['pa'] = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3)
	# non-linear models
	n_neighbors = range(1, 10)
	for k in n_neighbors:
		models['knn-'+str(k)] = KNeighborsClassifier(n_neighbors=k)
	models['cart'] = DecisionTreeClassifier()
	models['extra'] = ExtraTreeClassifier()
	models['svml'] = SVC(kernel='linear')
	models['svmp'] = SVC(kernel='poly')
	c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100]
	for c in c_values:
		models['svmr'+str(c)] = SVC(C=c)
	models['bayes'] = GaussianNB()
	# ensemble models
	n_trees = 100
	models['ada'] = AdaBoostClassifier(n_estimators=n_trees)
	models['bag'] = BaggingClassifier(n_estimators=n_trees)
	models['rf'] = RandomForestClassifier(n_estimators=n_trees)
	models['et'] = ExtraTreesClassifier(n_estimators=n_trees)
	models['gbm'] = GradientBoostingClassifier(n_estimators=n_trees)
	print('Defined %d models' % len(models))
	return models
 
# no transforms pipeline
def pipeline_none(model):
	return model
 
# standardize transform pipeline
def pipeline_standardize(model):
	steps = list()
	# standardization
	steps.append(('standardize', StandardScaler()))
	# the model
	steps.append(('model', model))
	# create pipeline
	pipeline = Pipeline(steps=steps)
	return pipeline
 
# normalize transform pipeline
def pipeline_normalize(model):
	steps = list()
	# normalization
	steps.append(('normalize', MinMaxScaler()))
	# the model
	steps.append(('model', model))
	# create pipeline
	pipeline = Pipeline(steps=steps)
	return pipeline
 
# standardize and normalize pipeline
def pipeline_std_norm(model):
	steps = list()
	# standardization
	steps.append(('standardize', StandardScaler()))
	# normalization
	steps.append(('normalize', MinMaxScaler()))
	# the model
	steps.append(('model', model))
	# create pipeline
	pipeline = Pipeline(steps=steps)
	return pipeline
 
# evaluate a single model
def evaluate_model(X_train, Y_train, model, folds, metric, pipe_func):
	# create the pipeline
	pipeline = pipe_func(model)
	# evaluate model
	scores = cross_val_score(pipeline, X_train, Y_train, scoring=metric, cv=folds, n_jobs=-1)
	return scores
 
# evaluate a model and try to trap errors and and hide warnings
def robust_evaluate_model(X_train, Y_train, model, folds, metric, pipe_func):
	scores = None
	try:
		with warnings.catch_warnings():
			warnings.filterwarnings("ignore")
			scores = evaluate_model(X_train, Y_train, model, folds, metric, pipe_func)
	except:
		scores = None
	return scores
 
# evaluate a dict of models {name:object}, returns {name:score}
def evaluate_models(X_train, Y_train, models, pipe_funcs, folds=4, metric='accuracy'):
	results = dict()
	for name, model in models.items():
		# evaluate model under each preparation function
		for i in range(len(pipe_funcs)):
			# evaluate the model
			scores = robust_evaluate_model(X_train, Y_train, model, folds, metric, pipe_funcs[i])
			# update name
			run_name = str(i) + name
			# show process
			if scores is not None:
				# store a result
				results[run_name] = scores
				mean_score, std_score = mean(scores), std(scores)
				print('>%s: %.3f (+/-%.3f)' % (run_name, mean_score, std_score))
			else:
				print('>%s: error' % run_name)
	return results
 
# print and plot the top n results
def summarize_results(results, maximize=True, top_n=20):
	# check for no results
	if len(results) == 0:
		print('no results')
		return
	# determine how many results to summarize
	n = min(top_n, len(results))
	# create a list of (name, mean(scores)) tuples
	mean_scores = [(k,mean(v)) for k,v in results.items()]
	# sort tuples by mean score
	mean_scores = sorted(mean_scores, key=lambda x: x[1])
	# reverse for descending order (e.g. for accuracy)
	if maximize:
		mean_scores = list(reversed(mean_scores))
	# retrieve the top n for summarization
	names = [x[0] for x in mean_scores[:n]]
	scores = [results[x[0]] for x in mean_scores[:n]]
	# print the top n
	print()
	for i in range(n):
		name = names[i]
		mean_score, std_score = mean(results[name]), std(results[name])
		print('Rank=%d, Name=%s, Score=%.3f (+/- %.3f)' % (i+1, name, mean_score, std_score))
	# boxplot for the top n
	pyplot.boxplot(scores, labels=names)
	_, labels = pyplot.xticks()
	pyplot.setp(labels, rotation=90)
	pyplot.savefig('spotcheck.png')
 

# get model list
models = define_models()
# define transform pipelines
pipelines = [pipeline_none, pipeline_standardize, pipeline_normalize, pipeline_std_norm]
# evaluate models
results = evaluate_models(X_train, Y_train, models, pipelines)
# summarize results
summarize_results(results)

Defined 60 models
>0logistic: 0.709 (+/-0.047)
>1logistic: 0.755 (+/-0.042)
>2logistic: 0.690 (+/-0.025)
>3logistic: 0.690 (+/-0.025)
>0ridge-0.1: 0.755 (+/-0.042)
>1ridge-0.1: 0.755 (+/-0.042)
>2ridge-0.1: 0.755 (+/-0.042)
>3ridge-0.1: 0.755 (+/-0.042)
>0ridge-0.2: 0.755 (+/-0.042)
>1ridge-0.2: 0.755 (+/-0.042)
>2ridge-0.2: 0.732 (+/-0.021)
>3ridge-0.2: 0.732 (+/-0.021)
>0ridge-0.3: 0.755 (+/-0.042)
>1ridge-0.3: 0.755 (+/-0.042)
>2ridge-0.3: 0.732 (+/-0.021)
>3ridge-0.3: 0.732 (+/-0.021)
>0ridge-0.4: 0.755 (+/-0.042)
>1ridge-0.4: 0.755 (+/-0.042)
>2ridge-0.4: 0.732 (+/-0.021)
>3ridge-0.4: 0.732 (+/-0.021)
>0ridge-0.5: 0.755 (+/-0.042)
>1ridge-0.5: 0.755 (+/-0.042)
>2ridge-0.5: 0.732 (+/-0.021)
>3ridge-0.5: 0.732 (+/-0.021)
>0ridge-0.6: 0.755 (+/-0.042)
>1ridge-0.6: 0.755 (+/-0.042)
>2ridge-0.6: 0.732 (+/-0.021)
>3ridge-0.6: 0.732 (+/-0.021)
>0ridge-0.7: 0.755 (+/-0.042)
>1ridge-0.7: 0.755 (+/-0.042)
>2ridge-0.7: 0.732 (+/-0.021)
>3ridge-0.7: 0.732 (+/-0.021)
>0ridge-0.8: 0.755 (+/-0.0

Import files below:

1. data1 = stepcount + demographic data in discrete categories
2. d1 = demographic data in discrete categories
3. data2 = stepcount + demographic data as continuous variables
4. d2 = demographic data as a continuous variable

In [14]:
file1 = 'no_band_steps.xlsx'
data1 = pd.read_excel(file1)
data1 = data1.set_index('studyID')

d1 = data1.drop(columns = 'Steps')

file2 = 'band_steps.xlsx'
data2 = pd.read_excel(file2)
data2 = data2.set_index('studyID')

d2 = data2.drop(columns = 'Steps_band')


Demographic: No Band

In [18]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
d1_train_acc = []
d1_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(d1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    d1_train_acc.append(svm_train_acc)
    d1_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(d1_train_acc)) 
print('Mean Test Accuracy:', mean(d1_test_acc)) 

Random State:  100
Training Accuracy: 0.8
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.5833333333333334
Random State:  200
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.3333333333333333
Random State:  300
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.75
Random State:  368
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.5
Random State:  400
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.5
Random State:  500
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.5
Random State:  600
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.75
Random State:  700
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.5833333333333334
Random State:  22
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.75
Mean Training Accuracy: 0.7666666666666666
Mean Test Accuracy: 0.5916666666666666


Demographic: Band

In [17]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
d2_train_acc = []
d2_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(d2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    d2_train_acc.append(svm_train_acc)
    d2_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(d2_train_acc)) 
print('Mean Test Accuracy:', mean(d2_test_acc)) 

Random State:  100
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.8
Test Accuracy: 0.75
Random State:  200
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.5833333333333334
Random State:  300
Training Accuracy: 0.8
Test Accuracy: 0.6666666666666666
Random State:  368
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.8333333333333334
Random State:  400
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.6666666666666666
Random State:  500
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.5
Random State:  600
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.8333333333333334
Random State:  700
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.5833333333333334
Random State:  22
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.75
Mean Training Accuracy: 0.7977777777777778
Mean Test Accuracy: 0.6833333333333333


Steps + Demographic: No Band

In [19]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
data1_train_acc = []
data1_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(data1, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    data1_train_acc.append(svm_train_acc)
    data1_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(data1_train_acc)) 
print('Mean Test Accuracy:', mean(data1_test_acc)) 

Random State:  100
Training Accuracy: 0.6666666666666666
Test Accuracy: 0.5833333333333334
Random State:  122
Training Accuracy: 0.6444444444444445
Test Accuracy: 0.6666666666666666
Random State:  200
Training Accuracy: 0.6222222222222222
Test Accuracy: 0.5
Random State:  300
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.5
Random State:  368
Training Accuracy: 0.6888888888888889
Test Accuracy: 0.4166666666666667
Random State:  400
Training Accuracy: 0.5555555555555556
Test Accuracy: 0.5833333333333334
Random State:  500
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.5833333333333334
Random State:  600
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.6666666666666666
Random State:  700
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.8333333333333334
Random State:  22
Training Accuracy: 0.5777777777777777
Test Accuracy: 0.5833333333333334
Mean Training Accuracy: 0.6755555555555556
Mean Test Accuracy: 0.5916666666666666


Steps + Demographic: Band

In [20]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
data2_train_acc = []
data2_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(data2, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    data2_train_acc.append(svm_train_acc)
    data2_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(data2_train_acc)) 
print('Mean Test Accuracy:', mean(data2_test_acc)) 

Random State:  100
Training Accuracy: 0.8
Test Accuracy: 0.75
Random State:  122
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.75
Random State:  200
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.5833333333333334
Random State:  300
Training Accuracy: 0.8
Test Accuracy: 0.75
Random State:  368
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.5
Random State:  400
Training Accuracy: 0.8
Test Accuracy: 0.8333333333333334
Random State:  500
Training Accuracy: 0.8666666666666667
Test Accuracy: 0.5833333333333334
Random State:  600
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.75
Random State:  700
Training Accuracy: 0.8
Test Accuracy: 0.5833333333333334
Random State:  22
Training Accuracy: 0.8
Test Accuracy: 0.9166666666666666
Mean Training Accuracy: 0.8155555555555555
Mean Test Accuracy: 0.7


Important files below:

1. no_band_final = KEGG + Demographic + stepcount data as continuous variables
2. norm_no_band_final = KEGG + Demographic + stepcount data as continuous variables
3. band_final = KEGG + Demographic + stepcount data as discrete variables
4. norm_band_final = KEGG + Demographic + stepcount data as discrete variables

In [21]:
no_band_final = pd.read_excel('final_no_band.xlsx')
no_band_final = no_band_final.set_index('studyID')

norm_no_band_final = pd.read_excel('norm_final_no_band.xlsx')
norm_no_band_final = norm_no_band_final.set_index('studyID')

band_final = pd.read_excel('final_band.xlsx')
band_final = band_final.set_index('studyID')

norm_band_final = pd.read_excel('norm_final_band.xlsx')
norm_band_final = norm_band_final.set_index('studyID')

KEGG + Demographic: No Band

In [23]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
no_band_train_acc = []
no_band_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    no_band_train_acc.append(svm_train_acc)
    no_band_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(no_band_train_acc)) 
print('Mean Test Accuracy:', mean(no_band_test_acc)) 

Random State:  100
Training Accuracy: 1.0
Test Accuracy: 0.75
Random State:  122
Training Accuracy: 1.0
Test Accuracy: 0.9166666666666666
Random State:  200
Training Accuracy: 1.0
Test Accuracy: 0.75
Random State:  300
Training Accuracy: 1.0
Test Accuracy: 0.75
Random State:  368
Training Accuracy: 1.0
Test Accuracy: 0.4166666666666667
Random State:  400
Training Accuracy: 1.0
Test Accuracy: 0.75
Random State:  500
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Random State:  600
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  700
Training Accuracy: 1.0
Test Accuracy: 0.9166666666666666
Random State:  22
Training Accuracy: 1.0
Test Accuracy: 0.75
Mean Training Accuracy: 1.0
Mean Test Accuracy: 0.7250000000000001


KEGG + Demographic: No Band,
Normalized

In [24]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_no_band_train_acc = []
norm_no_band_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    norm_no_band_train_acc.append(svm_train_acc)
    norm_no_band_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(norm_no_band_train_acc)) 
print('Mean Test Accuracy:', mean(norm_no_band_test_acc)) 

Random State:  100
Training Accuracy: 0.75
Test Accuracy: 0.5
Random State:  122
Training Accuracy: 0.7727272727272727
Test Accuracy: 0.75
Random State:  200
Training Accuracy: 0.8409090909090909
Test Accuracy: 0.5
Random State:  300
Training Accuracy: 0.7045454545454546
Test Accuracy: 0.5833333333333334
Random State:  368
Training Accuracy: 0.7727272727272727
Test Accuracy: 0.8333333333333334
Random State:  400
Training Accuracy: 0.75
Test Accuracy: 0.5
Random State:  500
Training Accuracy: 0.8181818181818182
Test Accuracy: 0.4166666666666667
Random State:  600
Training Accuracy: 0.7954545454545454
Test Accuracy: 0.6666666666666666
Random State:  700
Training Accuracy: 0.7727272727272727
Test Accuracy: 0.75
Random State:  22
Training Accuracy: 0.6363636363636364
Test Accuracy: 0.75
Mean Training Accuracy: 0.7613636363636365
Mean Test Accuracy: 0.625


KEGG + Demographic: Band

In [25]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
band_train_acc = []
band_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    band_train_acc.append(svm_train_acc)
    band_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(band_train_acc)) 
print('Mean Test Accuracy:', mean(band_test_acc)) 

Random State:  100
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Random State:  200
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  300
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Random State:  368
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  400
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  500
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Random State:  600
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  700
Training Accuracy: 1.0
Test Accuracy: 0.5833333333333334
Random State:  22
Training Accuracy: 1.0
Test Accuracy: 0.6666666666666666
Mean Training Accuracy: 1.0
Mean Test Accuracy: 0.625


KEGG + Demographic: Band, 
Normalized

In [26]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_band_train_acc = []
norm_band_test_acc = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_final, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    norm_band_train_acc.append(svm_train_acc)
    norm_band_test_acc.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(norm_band_train_acc)) 
print('Mean Test Accuracy:', mean(norm_band_test_acc)) 

Random State:  100
Training Accuracy: 0.75
Test Accuracy: 0.5
Random State:  122
Training Accuracy: 0.7954545454545454
Test Accuracy: 0.6666666666666666
Random State:  200
Training Accuracy: 0.8181818181818182
Test Accuracy: 0.5
Random State:  300
Training Accuracy: 0.7045454545454546
Test Accuracy: 0.5833333333333334
Random State:  368
Training Accuracy: 0.7272727272727273
Test Accuracy: 0.8333333333333334
Random State:  400
Training Accuracy: 0.8181818181818182
Test Accuracy: 0.5
Random State:  500
Training Accuracy: 0.8181818181818182
Test Accuracy: 0.5
Random State:  600
Training Accuracy: 0.7954545454545454
Test Accuracy: 0.6666666666666666
Random State:  700
Training Accuracy: 0.7727272727272727
Test Accuracy: 0.75
Random State:  22
Training Accuracy: 0.6363636363636364
Test Accuracy: 0.75
Mean Training Accuracy: 0.7636363636363637
Mean Test Accuracy: 0.625


Import PCA dataset and retain only first 6 PC's which account for 80% variance

In [27]:
PCA = pd.read_csv('PCA_final.csv')

In [28]:
PCA.index = data1.index

In [29]:
PCA = PCA.iloc[:, 1:7]
PCA

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6
studyID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001,-469.941805,0.049335,0.010597,-0.025762,0.020042,-0.004671
2002,8.061646,-5.761836,-4.841193,-2.171323,-2.788218,2.032909
2003,8.430937,6.946562,1.794457,-0.015105,-3.024961,-0.200291
2004,8.256683,8.288072,2.570019,-0.302641,-3.554665,4.007124
2006,8.376866,-0.916554,-6.212482,1.182002,-0.32864,-1.1624
2008,8.258802,0.870977,4.219331,0.654543,-3.580225,1.235964
2010,8.150815,-2.103172,-4.73828,5.019986,1.739006,-0.470654
2012,8.529156,-1.075416,1.555243,0.975567,2.216324,-2.077344
2013,8.691007,1.723237,-5.09574,-5.872935,1.253795,5.000286
2014,8.33059,-3.492446,-2.57333,-3.440519,0.589523,-0.811196


Merge PCA dataframe with demographic and step data

Important files below:

1. no_band_PCA = PCA + Demographic + stepcount as continuous variables
2. norm_no_band_PCA = PCA + Demographic + stepcount, Normalized as continuous variables
3. band_PCA = PCA + Demographic + stepcount as discrete variables
4. norm_band_PCA = PCA + Demographic + stepcount, Normalized as discrete variables


In [30]:
no_band_PCA = pd.concat([PCA, data1], axis = 1, join = 'outer')
band_PCA = pd.concat([PCA, data2], axis = 1, join = 'outer')


In [31]:
from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()

norm_no_band_PCA = pd.DataFrame(norm.fit_transform(no_band_PCA), index = no_band_PCA.index, columns = no_band_PCA.columns.values)
norm_band_PCA = pd.DataFrame(norm.fit_transform(band_PCA), index = band_PCA.index, columns = band_PCA.columns.values)




PCA + Demographic: No Band

In [32]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
no_band_train_PCA = []
no_band_test_PCA = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    no_band_train_PCA.append(svm_train_acc)
    no_band_test_PCA.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(no_band_train_PCA)) 
print('Mean Test Accuracy:', mean(no_band_test_PCA)) 

Random State:  100
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.6
Test Accuracy: 0.8333333333333334
Random State:  200
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.75
Random State:  300
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.5
Random State:  368
Training Accuracy: 0.6888888888888889
Test Accuracy: 0.5
Random State:  400
Training Accuracy: 0.6222222222222222
Test Accuracy: 0.6666666666666666
Random State:  500
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.75
Random State:  600
Training Accuracy: 0.8
Test Accuracy: 0.5833333333333334
Random State:  700
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.6666666666666666
Random State:  22
Training Accuracy: 0.6888888888888889
Test Accuracy: 0.5833333333333334
Mean Training Accuracy: 0.7088888888888889
Mean Test Accuracy: 0.65


PCA + Demographic: No Band,
Normalized

In [37]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_no_band_train_PCA = []
norm_no_band_test_PCA = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    norm_no_band_train_PCA.append(svm_train_acc)
    norm_no_band_test_PCA.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(norm_no_band_train_PCA)) 
print('Mean Test Accuracy:', mean(norm_no_band_test_PCA)) 

Random State:  100
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.75
Random State:  200
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.75
Random State:  300
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.6666666666666666
Random State:  368
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.6666666666666666
Random State:  400
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.6666666666666666
Random State:  500
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.5
Random State:  600
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.9166666666666666
Random State:  700
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.5
Random State:  22
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.75
Mean Training Accuracy: 0.8044444444444444
Mean Test Accuracy: 0.6833333333333333


PCA + Demographic: Band

In [134]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_no_band_train_PCA = []
norm_no_band_test_PCA = []

for i in random_state :
    
    train_df, test_df = train_test_split(no_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    norm_no_band_train_PCA.append(svm_train_acc)
    norm_no_band_test_PCA.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(norm_no_band_train_PCA)) 
print('Mean Test Accuracy:', mean(norm_no_band_test_PCA)) 

Random State:  100
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.6
Test Accuracy: 0.8333333333333334
Random State:  200
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.75
Random State:  300
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.5
Random State:  368
Training Accuracy: 0.6888888888888889
Test Accuracy: 0.5
Random State:  400
Training Accuracy: 0.6222222222222222
Test Accuracy: 0.6666666666666666
Random State:  500
Training Accuracy: 0.7555555555555555
Test Accuracy: 0.75
Random State:  600
Training Accuracy: 0.8
Test Accuracy: 0.5833333333333334
Random State:  700
Training Accuracy: 0.7111111111111111
Test Accuracy: 0.6666666666666666
Random State:  22
Training Accuracy: 0.6888888888888889
Test Accuracy: 0.5833333333333334
Mean Training Accuracy: 0.7088888888888889
Mean Test Accuracy: 0.65


PCA + Demographic: Band,
Normalized

In [39]:
from sklearn.model_selection import train_test_split

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]
norm_band_train_PCA = []
norm_band_test_PCA = []

for i in random_state :
    
    train_df, test_df = train_test_split(norm_band_PCA, test_size=0.2, random_state= i)
    
    X_train = train_df.drop('Weight_loss_band', axis=1)
    Y_train = train_df['Weight_loss_band']  
    
    X_test  = test_df.drop('Weight_loss_band', axis=1)
    Y_test = test_df['Weight_loss_band']  
    
    clf = svm.SVC(kernel = 'poly', gamma = 'scale')
    clf.fit(X_train, Y_train)  

    svm_train_acc = clf.score(X_train, Y_train)
    svm_test_acc = clf.score(X_test, Y_test)
    
    norm_band_train_PCA.append(svm_train_acc)
    norm_band_test_PCA.append(svm_test_acc)
    
    print('Random State: ', i)
    
    print('Training Accuracy:', svm_train_acc)
    
    print('Test Accuracy:', svm_test_acc)

print('Mean Training Accuracy:', mean(norm_band_train_PCA)) 
print('Mean Test Accuracy:', mean(norm_band_test_PCA)) 

Random State:  100
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.6666666666666666
Random State:  122
Training Accuracy: 0.7777777777777778
Test Accuracy: 0.6666666666666666
Random State:  200
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.6666666666666666
Random State:  300
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.5833333333333334
Random State:  368
Training Accuracy: 0.8666666666666667
Test Accuracy: 0.6666666666666666
Random State:  400
Training Accuracy: 0.8222222222222222
Test Accuracy: 0.75
Random State:  500
Training Accuracy: 0.8
Test Accuracy: 0.5
Random State:  600
Training Accuracy: 0.8
Test Accuracy: 0.9166666666666666
Random State:  700
Training Accuracy: 0.8444444444444444
Test Accuracy: 0.5833333333333334
Random State:  22
Training Accuracy: 0.7333333333333333
Test Accuracy: 0.75
Mean Training Accuracy: 0.8088888888888889
Mean Test Accuracy: 0.6749999999999999


The following code creates the model results dataframe

In [96]:
results_train = [d1_train_acc, d1_test_acc,
                 d2_train_acc, d2_test_acc,
                 data1_train_acc, data1_test_acc,
                 data2_train_acc, data2_test_acc,
                 no_band_train_acc, no_band_test_acc,
                 norm_no_band_train_acc, norm_no_band_test_acc,
                 band_train_acc, band_test_acc,
                 norm_band_train_acc, norm_band_test_acc,
                 no_band_train_PCA, no_band_test_PCA,
                 norm_no_band_train_PCA, norm_no_band_test_PCA,
                 band_train_PCA, band_test_PCA,
                 norm_band_train_PCA, norm_band_test_PCA]


results_train_mean = [mean(d1_train_acc), mean(d1_test_acc),
                 mean(d2_train_acc), mean(d2_test_acc),
                 mean(data1_train_acc), mean(data1_test_acc),
                 mean(data2_train_acc), mean(data2_test_acc),
                 mean(no_band_train_acc), mean(no_band_test_acc),
                 mean(norm_no_band_train_acc), mean(norm_no_band_test_acc),
                 mean(band_train_acc), mean(band_test_acc),
                 mean(norm_band_train_acc), mean(norm_band_test_acc),
                 mean(no_band_train_PCA), mean(no_band_test_PCA),
                 mean(norm_no_band_train_PCA), mean(norm_no_band_test_PCA),
                 mean(band_train_PCA), mean(band_test_PCA),
                 mean(norm_band_train_PCA), mean(norm_band_test_PCA)]


index = ['Dem: No Band', '* Dem: No Band',
         'Dem: Band', '* Dem: Band', 
         'Dem + Step: No Band', '* Dem + Step: No Band',
         'Dem + Step: Band', '* Dem + Step: Band',
         'Dem + Step + KEGG: No Band', '* Dem + Step + KEGG: No Band',
         'Dem + Step + KEGG: No Band, Normalized', '* Dem + Step + KEGG: No Band, Normalized',
         'Dem + Step + KEGG: Band', '* Dem + Step + KEGG: Band',
         'Dem + Step + KEGG: Band, Normalized', '* Dem + Step + KEGG: Band, Normalized',
         'Dem + Step + PCA: No Band', '* Dem + Step + PCA: No Band',
         'Dem + Step + PCA: No Band, Normalized', '* Dem + Step + PCA: No Band, Normalized',
         'Dem + Step + PCA: Band', '* Dem + Step + PCA: Band',
         'Dem + Step + PCA: Band, Normalized', '* Dem + Step + PCA: Band, Normalized', ]

random_state = [100, 122, 200, 300, 368, 400, 500, 600, 700, 22]

svm_table = pd.DataFrame(results_train, columns = random_state, index = index)
svm_table['Average Model Performance'] = results_train_mean

In [110]:
svm_table = svm_table.T

svm_table shows the results for all models, the code below selects the top from each category

In [123]:
pd.set_option('display.max_columns', 24)
svm_table = svm_table.copy()
result = svm_table.loc[:,['Dem: Band', '* Dem: Band',
                'Dem + Step: Band', '* Dem + Step: Band', 
                 'Dem + Step + KEGG: No Band', '* Dem + Step + KEGG: No Band',
                'Dem + Step + PCA: No Band, Normalized', '* Dem + Step + PCA: No Band, Normalized']]

In [124]:
result

Unnamed: 0,Dem: Band,* Dem: Band,Dem + Step: Band,* Dem + Step: Band,Dem + Step + KEGG: No Band,* Dem + Step + KEGG: No Band,"Dem + Step + PCA: No Band, Normalized","* Dem + Step + PCA: No Band, Normalized"
100,0.822222,0.666667,0.8,0.75,1.0,0.75,0.777778,0.666667
122,0.8,0.75,0.777778,0.75,1.0,0.916667,0.777778,0.75
200,0.844444,0.583333,0.844444,0.583333,1.0,0.75,0.844444,0.75
300,0.8,0.666667,0.8,0.75,1.0,0.75,0.844444,0.666667
368,0.755556,0.833333,0.844444,0.5,1.0,0.416667,0.822222,0.666667
400,0.777778,0.666667,0.8,0.833333,1.0,0.75,0.822222,0.666667
500,0.844444,0.5,0.866667,0.583333,1.0,0.666667,0.822222,0.5
600,0.755556,0.833333,0.822222,0.75,1.0,0.583333,0.777778,0.916667
700,0.822222,0.583333,0.8,0.583333,1.0,0.916667,0.822222,0.5
22,0.755556,0.75,0.8,0.916667,1.0,0.75,0.733333,0.75


In [101]:
result.to_excel("svm_model_results.xlsx")