In [1]:
# example of a super learner model for binary classification
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
#create a list of base models
def get_models():
  models = []
  models.append(SVC(kernel='rbf',probability=True,random_state=0))
  models.append(XGBClassifier(max_depth=9, eta = 0.4, gamma = 3, n_rounds = 100))
  return models

In [3]:
# collect out of fold predictions form k-fold cross validation
def get_out_of_fold_predictions(X, y, models):
	meta_X, meta_y = list(), list()
	# define split of data
	kfold = KFold(n_splits=10, shuffle=True)
	# enumerate splits
	for train_ix, test_ix in kfold.split(X):
		fold_yhats = list()
		# get data
		train_X, test_X = X[train_ix], X[test_ix]
		train_y, test_y = y[train_ix], y[test_ix]
		meta_y.extend(test_y)
		# fit and make predictions with each sub-model
		for model in models:
			model.fit(train_X, train_y)
			yhat = model.predict_proba(test_X)
			# store columns
			fold_yhats.append(yhat)
		# store fold yhats as columns
		meta_X.append(hstack(fold_yhats))
	return vstack(meta_X), asarray(meta_y)

In [4]:
# fit all base models on the training dataset
def fit_base_models(X, y, models):
	for model in models:
		model.fit(X, y)

In [5]:
# fit a meta model
def fit_meta_model(X, y):
	model = LogisticRegression(solver='liblinear')
	model.fit(X, y)
	return model

In [6]:
# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
	meta_X = list()
	for model in models:
		yhat = model.predict_proba(X)
		meta_X.append(yhat)
	meta_X = hstack(meta_X)
	# predict
	return meta_model.predict_proba(meta_X)

In [7]:
import io
import pandas as pd
df = pd.read_csv('X_trainData_column_modified_INH.csv') #Enter the file name here.
# Dataset is now stored in a Pandas Dataframe

In [8]:
df

Unnamed: 0,DEL_CF_410280_d918T_307_iniB,DEL_CF_4408101_d102C_34_gid,DEL_F_4408101_d101C_gid_G34G,INS_CF_4242820_i2957G_986_embC,INS_F_409772_i410ATCT_iniB_G137G,INS_F_4247020_i506CC_embB_G169G,INS_F_4247970_i1456GT_embB_G486G,INS_N_4243642_i409GTCCCGGGGCGCCAC_embA_S137S,INS_P_3074519_G.117_thyA,INS_P_3074521_G.115_thyA,...,ahpC_NC_snp,ahpC_C_snp,gid_C_snp,iniA_C_snp,gyrB_F_indel,embC_C_snp,eis_NC_snp,rpsA_C_snp,rrl-rrs_NC_snp,INH
0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,1,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,1,1,1,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
X = df.iloc[:,:-1].values
X.shape

(3356, 219)

In [10]:
y = df.iloc[:,-1].values
y.shape

(3356,)

In [11]:
# get models
models = get_models()
# get out of fold predictions
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)

Parameters: { n_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

In [12]:
# fit base models
fit_base_models(X, y, models)

Parameters: { n_rounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [13]:
# fit the meta model
meta_model = fit_meta_model(meta_X, meta_y)

In [14]:
X_test = pd.read_csv('final_X_testData_column_modified_INH.csv') #Enter the file name here.
# Dataset is now stored in a Pandas Dataframe

In [15]:
import numpy as np

In [16]:
X_test

Unnamed: 0,DEL_CF_410280_d918T_307_iniB,DEL_CF_4408101_d102C_34_gid,DEL_F_4408101_d101C_gid_G34G,INS_CF_4242820_i2957G_986_embC,INS_F_409772_i410ATCT_iniB_G137G,INS_F_4247020_i506CC_embB_G169G,INS_F_4247970_i1456GT_embB_G486G,INS_N_4243642_i409GTCCCGGGGCGCCAC_embA_S137S,INS_P_3074519_G.117_thyA,INS_P_3074521_G.115_thyA,...,ahpC_NC_snp,ahpC_C_snp,gid_C_snp,iniA_C_snp,gyrB_F_indel,embC_C_snp,eis_NC_snp,rpsA_C_snp,rrl-rrs_NC_snp,INH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?
919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,?
920,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,?


In [17]:
X_val = X_test.iloc[:,:-1].values
X_val.shape

(922, 219)

In [18]:
yhat = super_learner_predictions(X_val, models, meta_model)


In [19]:
yhat

array([[0.98383834, 0.01616166],
       [0.98440676, 0.01559324],
       [0.98455269, 0.01544731],
       ...,
       [0.03024972, 0.96975028],
       [0.02870852, 0.97129148],
       [0.0303236 , 0.9696764 ]])

In [20]:
submission_data = pd.read_csv('Y_testData_1_nolabels_INH.csv')

In [21]:
ids=submission_data.iloc[:,:1].values

In [22]:
result=list()

for i in range(len(ids)):
    result.append([ ids[i,0], yhat[i,1]])

result.insert(0, ['ID','INH'])
output = pd.DataFrame(result)
output.to_csv('INH_results.csv',header = False, index = False)