# Supervised Learning Hands On

## 1. Download data of today's example (preprocessed Table):

**Gene expression data for cancer samples from the TCGA database**: loaded here from a precomputed table (the xenaPython module is only needed later, in the regression task)

In [1]:
import numpy as np
import pandas as pd

# Load the precalculated table "TCGA-cancer-DF.zip": a zipped CSV whose first
# column (the sample IDs) becomes the row index.
tcga_table_path = "/cluster/courses/ml4h/data_for_users/data/TCGA-cancer-DF.zip"
df_noNA = pd.read_csv(tcga_table_path, compression="zip", index_col=0)
# Preview of the first rows (rendered as the cell output).
df_noNA.head()

Unnamed: 0,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,?|317712,...,ZYG11B,ZYX,ZZEF1,ZZZ3,stage,age,gender,status,time,type
TCGA-61-1910-01,-0.04126,4.786,5.04,-6.683,-0.5449,-0.000557,3.334,2.583,-0.0893,-0.001292,...,-0.1832,1.616,0.4057,0.5824,Stage IIC,-20779,FEMALE,LIVING,1127,OV
TCGA-61-1728-01,-0.04126,2.064,2.149,-2.271,-0.6134,-0.000557,0.3176,-0.5444,-0.0893,-0.001292,...,-0.3968,1.307,-1.404,0.2832,Stage IV,-21582,FEMALE,LIVING,848,OV
TCGA-09-1666-01,-0.04126,1.772,2.805,-2.959,0.4053,-0.000557,0.5211,1.162,-0.0893,-0.001292,...,0.673,1.392,-0.9993,0.6738,Stage IIIC,-21066,FEMALE,LIVING,1752,OV
TCGA-24-1469-01,-0.04126,2.102,2.402,-1.069,0.4796,-0.000557,0.1432,-0.08483,0.7185,-0.001292,...,-0.5974,0.7653,-0.4181,-0.07337,Stage IIIC,-25937,FEMALE,LIVING,277,OV
TCGA-61-1917-01,-0.04126,0.5883,2.38,-1.513,0.3045,-0.000557,-0.6307,-0.5444,-0.0893,-0.001292,...,-0.4541,1.299,-1.16,0.03223,Stage IIIB,-21972,FEMALE,DECEASED,1321,OV


## 2. Check the characteristics of the input data set:

In [2]:
import ydata_profiling  # renamed successor of pandas-profiling, e.g.: pip install ydata-profiling

# Profile only the clinical annotation columns; the gene-expression columns are left out.
clinical_columns = ['stage', 'age', 'gender', 'status', 'time', 'type']
profile = ydata_profiling.ProfileReport(
    df_noNA[clinical_columns],
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
    minimal=True,
)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Classification task

**k Nearest Neighbors, Logistic regression, Support Vector Machines, Naive Bayes, RandomForest, AdaBoost, Gradient Tree Boosting (gradient boosting machine), multi-layer perceptron (MLP)**

## 3. Create X and y for the classification task

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
sns.set()

# Features: every column except the clinical annotations.
clinical_columns = ['stage', 'age', 'gender', 'status', 'time','type']
X = df_noNA.drop(columns=clinical_columns)

# Label to predict: the cancer type of each sample.
y = df_noNA['type']

print("We will use {} patients".format(len(y)))

# One colour per cancer type: three seaborn palettes are chained into one longer
# list, of which the first `num_cancer_type` entries are kept.
num_cancer_type = len(set(y))
current_palette = []
for palette_name in ("colorblind", "dark", "deep"):
    current_palette.extend(sns.color_palette(palette_name))
colors = current_palette[:num_cancer_type]
#visualize colors if needed:
#sns.palplot(colors)

We will use 4540 patients


## 4. Apply different classification techniques and calculate classification accuracy

In [None]:
import time
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

#will be splitting the dataset into train:test as 7:3
# (test_size is the fraction of samples held out for testing)
test_size=0.3
# wall-clock start of the whole cell; used for the "Overall time" line printed at its end
start_time_snippet = time.time()

def printAndSaveInfo(name, my_time, accuracy_score, my_F1, scoreDict, timeDict, F1Dict, title):
    """Store the results of one classifier under `name` and print them.

    `accuracy_score`, `my_time` and `my_F1` are written into `scoreDict`,
    `timeDict` and `F1Dict` respectively (the dicts are modified in place).
    `title` only prefixes the printed summary line. Returns None.
    """
    for results, value in ((scoreDict, accuracy_score), (timeDict, my_time), (F1Dict, my_F1)):
        results[name] = value
    timing_line = "---{}: {} seconds ---".format(name, my_time)
    print(timing_line)
    summary_line = '{}, {} accuracy score: {}, F1-score: {}'.format(title, name, accuracy_score, my_F1)
    print(summary_line)

def getF1(y_test, y_predicted, average='micro', pos_label='DECEASED'):
    """Return the F1-score of `y_predicted` measured against the true labels `y_test`.

    Parameters
    ----------
    y_test : array-like of true labels.
    y_predicted : array-like of predicted labels, same length as `y_test`.
    average : averaging mode handed to sklearn's `f1_score` when `y_test`
        holds more than two distinct labels (default 'micro', the previous
        hard-coded value).
    pos_label : label treated as the positive class when `y_test` holds at
        most two distinct labels (default 'DECEASED', the previous
        hard-coded value).

    Note: for a single-label multiclass problem the micro-averaged F1-score is
    equal to the accuracy; pass average='macro' or 'weighted' to get a number
    that is sensitive to per-class performance.
    """
    if len(np.unique(y_test)) > 2:
        return f1_score(y_test, y_predicted, average=average)
    return f1_score(y_test, y_predicted, pos_label=pos_label)
  

from sklearn.metrics import accuracy_score  # same value as clf.score(), but reuses one predict() call

# Result containers: one dict per task is appended to each list at the end of the cell.
scores=[]
F1=[]
times=[]

# Split the dataset in two for training and testing
# (stratify=y keeps the class proportions equal in both parts):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
scoreDict={}
timeDict={}
F1Dict={}

title="Classification of Cancer Types"


def fitAndEvaluate(name, clf):
    """Fit `clf` on the train split and record its fit time, test accuracy and test F1 under `name`.

    Only the call to fit() is timed. The test set is predicted once and the
    same predictions feed both the accuracy and the F1-score. Results are
    stored in scoreDict / timeDict / F1Dict via printAndSaveInfo.
    Returns the fitted classifier.
    """
    start_time = time.time()
    clf.fit(X_train, y_train)
    fit_time = time.time() - start_time
    y_predicted = clf.predict(X_test)
    printAndSaveInfo(name, fit_time, accuracy_score(y_test, y_predicted),
                     getF1(y_test, y_predicted), scoreDict, timeDict, F1Dict, title)
    return clf


# run k-NN:
n_neighbors=15
clf = fitAndEvaluate("kNN", KNeighborsClassifier(n_neighbors, weights='distance')) #default 'uniform'

# run logistic regression (no CV). No penalty is passed, so the default L2 (ridge)
# penalty is used; the lbfgs solver does not support an L1 (lasso) penalty.
# NOTE: with the default max_iter, lbfgs stops at the iteration limit on this data
# (see the warning in the cell output); raise max_iter or scale X to let it converge.
clf = fitAndEvaluate("Log. reg. (lbfgs)",
                     LogisticRegression(solver='lbfgs', random_state=42,tol=0.001, n_jobs=4))

# logistic regression with L1+L2 penalty (no CV):
# will not run here as it takes 1279.4 seconds
#clf = fitAndEvaluate("Log. reg. (elasticnet)",
#                     LogisticRegression(solver='saga', penalty="elasticnet", l1_ratio=0.5, random_state=42, tol=0.001, n_jobs=4))

# run Support Vector Machines:
clf = fitAndEvaluate("SVM", svm.SVC(kernel='linear', C=1,random_state=42))

#Naive Bayes (Gaussian):
clf = fitAndEvaluate("GaussianNB", GaussianNB())

#RandomForest:
clf = fitAndEvaluate("RandomForest", RandomForestClassifier(random_state=42))

# #AdaBoost - 848.9 seconds to run on X,y:
#clf = fitAndEvaluate("AdaBoost", AdaBoostClassifier(random_state=42))

# #Gradient Tree Boosting (gradient boosting machine), will not run here as it takes 15848.93 seconds:
#clf = fitAndEvaluate("GBM", GradientBoostingClassifier(random_state=42))

#multi-layer perceptron (MLP):
clf = fitAndEvaluate("MLP", MLPClassifier(random_state=42,solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(5, 2)))

scores.append(scoreDict)
times.append(timeDict)
F1.append(F1Dict)

print("---Overall time: {} seconds ---".format (time.time()-start_time_snippet))


---kNN: 0.6331815719604492 seconds ---
Classification of Cancer Types, kNN accuracy score: 0.9544787077826725, F1-score: 0.9544787077826725


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


---Log. reg. (lbfgs): 42.824615716934204 seconds ---
Classification of Cancer Types, Log. reg. (lbfgs) accuracy score: 0.9860499265785609, F1-score: 0.9860499265785609
---SVM: 34.46199440956116 seconds ---
Classification of Cancer Types, SVM accuracy score: 0.9845814977973568, F1-score: 0.9845814977973568
---GaussianNB: 1.9179329872131348 seconds ---
Classification of Cancer Types, GaussianNB accuracy score: 0.7701908957415565, F1-score: 0.7701908957415565
---RandomForest: 22.39856195449829 seconds ---
Classification of Cancer Types, RandomForest accuracy score: 0.9765051395007343, F1-score: 0.9765051395007343


## 5. Visualize the output
**Find the best solution: Check execution time and accuracy of the methods tested**

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Row labels of the result tables, keyed by the position at which a task was appended.
classification_task_labels = {0:'Cancer type classification',1:'Prog. pred. with clin.',2:'Prog. pred. w/o clin.'}

# One row per task, one column per classifier.
time_df = pd.DataFrame(times).rename(index=classification_task_labels)
score_df = pd.DataFrame(scores).rename(index=classification_task_labels)
F1_df = pd.DataFrame(F1).rename(index=classification_task_labels)

for caption, table in (("Time:", time_df), ("Accuracy scores:", score_df), ("F1-scores:", F1_df)):
    print(caption)
    print(table)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(18, 7)

# Fit times are shown on a log10 scale; accuracy and F1 are shown as they are.
heatmap_panels = (
    (ax1, np.log10(time_df), "Time to train the model (log10 seconds)"),
    (ax2, score_df, "Model accuracy on the test set"),
    (ax3, F1_df, "F1-score on the test set"),
)
for panel_ax, table, panel_title in heatmap_panels:
    sns.heatmap(table, annot=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()


## 6. Visualize the best solution:

In [None]:
#best solution for tumor classification (logistic regression) and kNN:

def plotTrueVsPredicted(X_embedded, y_true, y_predicted, predicted_title):
    """Draw two scatter plots of the same 2-D embedding, side by side.

    The left panel colours the points by their true label, the right panel by
    the predicted label; one colour from the global `colors` list is used per
    distinct value of `y_true`.

    Parameters
    ----------
    X_embedded : 2-D array, one row per test sample; columns 0 and 1 are plotted.
    y_true, y_predicted : labels of those samples, in the same row order.
    predicted_title : title of the right-hand panel.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    target_names = np.unique(y_true)
    panels = ((ax1, np.asarray(y_true), "True values, tumor classification task"),
              (ax2, np.asarray(y_predicted), predicted_title))
    for ax, labels, panel_title in panels:
        for color, target_name in zip(colors, target_names):
            selected = labels == target_name
            ax.scatter(X_embedded[selected, 0], X_embedded[selected, 1],
                       color=color, lw=2, label=target_name, alpha=0.5)
        ax.set_title(panel_title)
        ax.set_xlabel('tSNE 1')
        ax.set_ylabel('tSNE 2')

    # the legend is built once, after all classes were drawn (both panels share the colours)
    ax2.legend(loc="best", shadow=False, scatterpoints=1)
    plt.show()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

# 2-D view of the test samples: PCA down to 30 components, then t-SNE.
# PCA gets a fixed random_state as well: with svd_solver='auto' it may pick a
# randomized solver, which would make the embedding differ between runs.
X_embedded = TSNE(n_components=2,random_state=42).fit_transform(
    PCA(n_components=30, random_state=42).fit_transform(X_test))

clf = LogisticRegression(solver='lbfgs', random_state=42,tol=0.001, n_jobs=4).fit(X_train, y_train)
y_predicted=clf.predict(X_test)
plotTrueVsPredicted(X_embedded, y_test, y_predicted,
                    "Best solution for tumor classification: Logistic Regression")

clf = KNeighborsClassifier(n_neighbors, weights='distance').fit(X_train, y_train) #default 'uniform'
y_predicted=clf.predict(X_test)
plotTrueVsPredicted(X_embedded, y_test, y_predicted,
                    "Solution for tumor classification with kNN")

# Regression task

**Linear models (Ridge, Lasso, Elastic Net), RandomForest, AdaBoost, Gradient Tree Boosting (gradient boosting machine), multi-layer perceptron (MLP)**

## 1. Create X and y for the regression task:

In [None]:
import xenaPython as xena #pip install xenaPython

#drop clinical information from X and select only breast cancer samples
X = df_noNA.loc[df_noNA['type']=="BRCA"].drop(columns=['status', 'time','type','stage','gender', 'age'])

# Xena hub and dataset from which the protein measurements (HER2, P53) are fetched further down
hub = "https://tcga.xenahubs.net"
dataset='TCGA.BRCA.sampleMap/RPPA_RBN'
# sample IDs available in that dataset
# NOTE(review): the third argument (None) is presumably "no limit" - confirm against the xenaPython docs
samples=xena.dataset_samples(hub, dataset, None)

def intersection(lst1, lst2):
    """Return the elements of `lst1` that also occur in `lst2`, as a list.

    The order of `lst1` is preserved and duplicates in `lst1` are kept.
    Membership is tested against a set built from `lst2`, so the cost is
    linear instead of len(lst1) * len(lst2). If `lst2` is a str/bytes
    (where `in` means "is a substring") or holds unhashable elements, the
    plain `in lst2` test is used instead, so the result is unchanged.
    """
    if not isinstance(lst2, (str, bytes)):
        try:
            lookup = set(lst2)
            return [value for value in lst1 if value in lookup]
        except TypeError:
            # unhashable element in lst1 or lst2: fall through to the linear scan
            pass
    return [value for value in lst1 if value in lst2]

# Samples present both in X (gene expression) and in the Xena dataset.
common_samples = intersection(X.index.values, samples)
# Fetch the two protein probes for those samples; one list of values per probe, in request order.
prot_exp_list = xena.dataset_fetch(hub, dataset, common_samples, ["HER2","P53"])
X = X.loc[common_samples]

y1, y2 = prot_exp_list[0], prot_exp_list[1]  # y1: HER2 (coded by ERBB2), y2: P53 (coded by TP53)

from scipy.stats import pearsonr,spearmanr
# Sanity check: rank correlation between each protein and the expression of its coding gene.
for protein_values, coding_gene in ((y1, "ERBB2"), (y2, "TP53")):
    print(spearmanr(protein_values, X[coding_gene]))

print("Will use {} patient data points". format(len(y1)))

## 2. Run regression task on train set and calculate prediction accuracy on the test set

In [None]:
#Linear models (Ridge, Lasso, Elastic Net), RandomForest, 
#AdaBoost, Gradient Tree Boosting (gradient boosting machine), multi-layer perceptron (MLP)

import time
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr,spearmanr

#will be splitting the dataset into train:test as 7:3
# (fraction of samples held out for testing; passed to train_test_split in the loop below)
test_size=0.3

# NOTE(review): commented-out stub without a body; nothing visible in this notebook calls c_index3.
#def c_index3(y_pred, events, times):
    #from https://codereview.stackexchange.com/questions/202140/concordance-index-calculation
    
def printAndSaveInfoReg(name, my_time, y_test, y_predicted, 
                    reg_score, scoreDict, corrDict, rhoDict, timeDict):
    """Store the results of one regression model under `name` and print them.

    Writes `my_time` into `timeDict`, `reg_score` into `scoreDict`, and the
    Pearson and Spearman correlation coefficients between `y_test` and
    `y_predicted` into `corrDict` and `rhoDict` (the dicts are modified in
    place). Returns None.
    """
    timeDict[name] = my_time
    # index 0 of each result is the correlation coefficient (index 1 would be the p-value)
    pearson_r = pearsonr(y_test, y_predicted)[0]
    spearman_rho = spearmanr(y_test, y_predicted)[0]
    for results, value in ((scoreDict, reg_score), (corrDict, pearson_r), (rhoDict, spearman_rho)):
        results[name] = value
    print("---{}: {} seconds ---".format(name, my_time))
    print('{} accuracy scores: {} {} {}'.format(name, reg_score, pearson_r, spearman_rho))


    
from sklearn.metrics import r2_score  # same value as reg.score(), but reuses one predict() call

# Result containers: one dict per prediction task is appended to each list.
scores=[]
PearsonCorr=[]
SpearmanCorr=[]
times=[]

# (label, factory) pairs; factories are used so that every task gets a fresh, unfitted model.
regression_models = [
    #Ordinary Least Squares:
    ("Ordinary LS", lambda: linear_model.LinearRegression()),
    #Lasso (L1 penalty on model coefficients), lambda(here alpha)==0.1
    ("Lasso 0.1", lambda: linear_model.Lasso(alpha=0.1)),
    #Ridge (L2 penalty on model coefficients), lambda(here alpha)==0.5
    ("Ridge 0.5", lambda: linear_model.Ridge(alpha=.5)),
    #Elastic Net (L1 and L2 penalty on model coefficients): the penalty strength (alpha) is chosen
    #by 5-fold cross-validation; the L1/L2 mix (l1_ratio) is not passed, so it stays at its default
    ("Elastic Net CV", lambda: linear_model.ElasticNetCV(cv=5, random_state=42)),
    #RandomForest:
    ("RandomForest", lambda: RandomForestRegressor(random_state=42)),
    #AdaBoost:
    ("AdaBoost", lambda: AdaBoostRegressor(random_state=42)),
    #Gradient Tree Boosting (gradient boosting machine):
    ("GBM", lambda: GradientBoostingRegressor(random_state=42)),
    #multi-layer perceptron (MLP):
    ("MLP", lambda: MLPRegressor(random_state=42, hidden_layer_sizes=(5, 2), learning_rate_init=0.01, early_stopping=True)),
]

# X is the same for both tasks; only the target changes. The loop variable is not
# called X or y, so the notebook-level X and y are not rebound here.
for y_task, title in [(y1, "Prediction of HER2 protein expression"),
                      (y2, "Prediction of p53 protein expression")]:
    # header line, so that the two blocks of printed results can be told apart
    print(title)

    # Split the dataset in two for training and testing:
    X_train, X_test, y_train, y_test = train_test_split(X, y_task, test_size=test_size, random_state=42)
    scoreDict={}
    timeDict={}
    corrDict={}
    rhoDict={}

    for name, make_model in regression_models:
        # only the call to fit() is timed
        start_time = time.time()
        reg = make_model().fit(X_train, y_train)
        fit_time = time.time()-start_time
        # one prediction feeds R^2 and both correlation coefficients
        y_predicted = reg.predict(X_test)
        printAndSaveInfoReg(name, fit_time, y_test, y_predicted,
                            r2_score(y_test, y_predicted), scoreDict, corrDict, rhoDict, timeDict)

    scores.append(scoreDict)
    times.append(timeDict)
    PearsonCorr.append(corrDict)
    SpearmanCorr.append(rhoDict)

## 3. Choose the best model:

In [None]:
# Row labels of the result tables, keyed by the position at which a task was appended.
regression_task_labels = {0:'Prediction of HER2 amount',1:'Prediction of p53 amount'}

# One row per prediction task, one column per regression model.
time_df = pd.DataFrame(times).rename(index=regression_task_labels)
score_df = pd.DataFrame(scores).rename(index=regression_task_labels)
pearson_df = pd.DataFrame(PearsonCorr).rename(index=regression_task_labels)
spearman_df = pd.DataFrame(SpearmanCorr).rename(index=regression_task_labels)

for table in (time_df, score_df, pearson_df, spearman_df):
    print(table)

fig, axs = plt.subplots(2, 2)
fig.set_size_inches(18, 18)

# (grid position, table, title); fit times are shown on a log10 scale.
heatmap_panels = (
    ((0, 0), np.log10(time_df), "Time to train the model (log10 seconds)"),
    ((0, 1), score_df, "Model accuracy on the test set (score)"),
    ((1, 0), pearson_df, "Model accuracy on the test set (Pearson corr)"),
    ((1, 1), spearman_df, "Model accuracy on the test set (Spearman corr)"),
)
for position, table, panel_title in heatmap_panels:
    sns.heatmap(table, annot=True, ax=axs[position])
    axs[position].set_title(panel_title)

plt.show()

## 4. Visualize the best result

In [None]:
#RF:
# Refit a random forest for each protein on the same 7:3 split and plot predicted vs. true values.

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

fitted_forests = []
for panel_ax, target, x_label, y_label in ((ax1, y1, 'True HER2', 'Predicted HER2'),
                                           (ax2, y2, 'True p53', 'Predicted p53')):
    X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=test_size, random_state=42)
    forest = RandomForestRegressor(random_state=42).fit(X_train, y_train)
    test_predictions = forest.predict(X_test)
    print(spearmanr(y_test, test_predictions))
    panel_ax.scatter(y_test, test_predictions)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    fitted_forests.append(forest)

# keep both fitted models under their own names; the feature-importance cell reads them
regHer2, regP53 = fitted_forests

plt.show()

## 5. Which features were the most important for the prediction?

In [None]:
def plotTopFeatures(ax, model, feature_names, n=10):
    """Print and bar-plot the `n` most important features of a fitted model.

    Parameters
    ----------
    ax : matplotlib Axes to draw the bars on.
    model : fitted estimator exposing `feature_importances_`.
    feature_names : names of the features, in the column order the model was fitted on.
    n : number of top features to show (fewer bars are drawn if fewer features exist).

    Returns the list of (importance, feature name) pairs that were plotted,
    most important first.
    """
    # sort descending by importance (ties: by name, descending) and keep the first n
    ranked = sorted(zip(model.feature_importances_, feature_names), reverse=True)[:n]
    for importance, feature in ranked:
        print('%s, importance: %.5f' % (feature, importance))

    positions = list(range(len(ranked)))
    ax.bar(x=positions, height=[importance for importance, _ in ranked])
    ax.set_xticks(positions)
    ax.set_xticklabels([feature for _, feature in ranked])
    ax.set_ylabel('Feature importance')
    return ranked


fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

# The helper keeps its intermediate lists local, so the notebook-level `scores`
# list (the regression results) is no longer overwritten by this cell.
print("Important features for prediction of HER2 protein expression:")
plotTopFeatures(ax1, regHer2, X.columns)
ax1.set_title("HER2 protein expression: top features")

print("Important features for prediction of p53 protein expression:")
plotTopFeatures(ax2, regP53, X.columns)
ax2.set_title("p53 protein expression: top features")

plt.show()