In [None]:
import pandas as pd
import numpy as np
import statistics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import tree,svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt
# Seed NumPy's legacy global random number generator with a fixed value.
np.random.seed(31415)

In [None]:
# Hand-selected gene panels, one pair per pairwise comparison.
# NOTE(review): "_Union" / "_Int" presumably denote the union and the
# intersection of several gene selections -- confirm with the analysis
# that produced these lists.
lumABasal_Union = [
    "ANP32B", "ATG16L1", "BIRC5", "BST2", "C1R", "CCND3", "CD46", "CLU",
    "FLT3LG", "GAS6", "GIMAP7", "HAPLN3", "HIF1A", "ICAM2", "IFITM1", "IGF2R",
    "IKBKG", "IL15RA", "IL32", "IL3RA", "ITCH", "LAMP2", "LY6E", "MAP2K2",
    "MAPK8", "MAVS", "NFATC4", "PLEKHO1", "PYCARD", "RAPGEF6", "SLC25A28",
    "TYMS", "VAMP5",
]
lumABasal_Int = [
    "BIRC5", "BST2", "CCND3", "FLT3LG", "GAS6", "ICAM2", "IFITM1", "IGF2R",
    "IL32", "ITCH", "LAMP2", "LY6E", "MAP2K2", "PLEKHO1", "PYCARD", "TYMS",
    "VAMP5",
]

lumALumB_Union = [
    "AXL", "BCL2", "BIRC5", "CD40", "CDK1", "CEACAM1", "CSF1", "CX3CL1",
    "DEFB1", "GATA3", "IL6R", "ITGB4", "MKI67", "RRM2", "SMAD3",
]
lumALumB_Int = [
    "AXL", "BCL2", "CD40", "CSF1", "CX3CL1", "GATA3", "ITGB4", "MKI67",
    "RRM2", "SMAD3",
]

lumBBasal_Union = [
    "ALCAM", "AXL", "CD40", "CD59", "CFB", "CX3CL1", "FLT3LG", "GAS6",
    "GBP2", "IL34", "ITGB4", "NFATC4", "NT5E", "SERPING1", "SMAD3", "SSPN",
    "UBA7", "VAMP5",
]
lumBBasal_Int = [
    "ALCAM", "AXL", "CD40", "CX3CL1", "CD59", "FLT3LG", "GAS6", "GBP2",
    "ITGB4", "NFATC4", "SMAD3", "UBA7", "VAMP5",
]

In [None]:
# --- Run configuration ---
# Input CSV. The code below reads its 'Class' and 'Sample' columns and treats
# every other column as a feature.
filename = "LumALumBAllDESorted.csv"
# False -> keep the leading `pcnt` percent of feature columns;
# True  -> keep only the columns named in `genelist`.
useHandSelected = False
# NOTE(review): the panel name refers to lumB/Basal while the file name refers
# to LumA/LumB -- confirm the pairing before switching useHandSelected on.
genelist = lumBBasal_Union
pcnt = 100

# --- Load the table and standardise the feature columns ---
df = pd.read_csv(filename, header=0)
is_feature_col = ~df.columns.isin(['Class', 'Sample'])
df_genes = df.loc[:, is_feature_col]
# NOTE(review): the scaling statistics are computed over all rows, i.e. before
# the cross-validation split performed later in this cell.
df_scaled = pd.DataFrame(preprocessing.scale(df_genes), columns=df_genes.columns)

# --- Labels: binarise the 'Class' column and flatten to a 1-D array ---
sampleNames = df["Sample"]
lb = preprocessing.LabelBinarizer()
y = lb.fit_transform(df["Class"]).ravel()

# Choose the feature subset: either the leading columns of the scaled table
# or the hand-selected gene panel.
X = df_scaled
if not useHandSelected:
    # Keep the first `pcnt` percent of columns. This relies on the CSV columns
    # already being ordered by rank -- presumably what "DESorted" in the file
    # name means; TODO confirm.
    # Builtin int() replaces np.int, which was deprecated in NumPy 1.20 and
    # removed in NumPy 1.24 (it was only ever an alias for the builtin).
    pcntGenesUsed = int(np.round((pcnt * X.shape[1]) / 100))
    print("Genes used: {}".format(pcntGenesUsed))
    X_selected = X.iloc[:, :pcntGenesUsed]
else:
    # Keep only the hand-selected genes; pandas raises KeyError if any of
    # them is not a column of the table.
    X_selected = X.loc[:, genelist]

print("Data set size: {0}".format(X_selected.shape))
print("Label size: {0}\n".format(y.shape))


#model and scoring
# Linear-kernel SVM, scored on four metrics with 10-fold cross-validation.
model = svm.SVC(kernel='linear')
scoring = ['accuracy', 'precision', 'recall', 'f1']
# Evaluate on X_selected rather than the full X; passing X here silently
# ignored the percent-based / hand-selected gene selection made above.
# Despite its name, ypred holds cross_validate's dict of per-fold scores
# (keys such as 'test_accuracy'), not predictions.
ypred = model_selection.cross_validate(model, X_selected, y, cv=10, scoring=scoring, return_train_score=False)

# Per-fold scores expressed as percentages.
fold_acc = ypred['test_accuracy'] * 100
fold_prec = ypred['test_precision'] * 100
fold_rec = ypred['test_recall'] * 100
fold_f1 = ypred['test_f1'] * 100

# Mean and sample standard deviation across folds for each metric.
acc, acc_std = statistics.mean(fold_acc), statistics.stdev(fold_acc)
prec, prec_std = statistics.mean(fold_prec), statistics.stdev(fold_prec)
rec, rec_std = statistics.mean(fold_rec), statistics.stdev(fold_rec)
f1, f1_std = statistics.mean(fold_f1), statistics.stdev(fold_f1)

# Report each metric as "mean +/- std" with two decimals.
for template, mean_val, std_val in (
    ("Mean acc: {0:.2f} +/- {1:.2f}", acc, acc_std),
    ("Mean precision: {0:.2f} +/- {1:.2f}", prec, prec_std),
    ("Mean recall: {0:.2f} +/- {1:.2f}", rec, rec_std),
    ("Mean fscore: {0:.2f} +/- {1:.2f}", f1, f1_std),
):
    print(template.format(mean_val, std_val))