## Description
In this part of the project, we will use supervised learning algorithms, namely, Decision Trees/Random Forests, SVM, and Logistic Regression algorithms to predict disease type given gene expression data.

## Import Packages

In [115]:
import pandas as pd
import numpy as np
import math
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.tree import plot_tree
# from sklearn.tree.export import export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [26]:
#Settings
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Load in Data (Run this cell only once - large file)

In [91]:
data = pd.read_csv("combinedFPKM.csv")
data

Unnamed: 0,5ce879cb-fe9c-5cb9-9b2f-54779d6ad6c4,Cancer,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001460.16,ENSG00000001461.15,ENSG00000001497.15,ENSG00000001561.6,ENSG00000001617.10,ENSG00000001626.13,ENSG00000001629.8,ENSG00000001630.14,ENSG00000001631.13,ENSG00000002016.15,ENSG00000002079.11,ENSG00000002330.12,ENSG00000002549.11,ENSG00000002586.16,ENSG00000002587.8,ENSG00000002726.18,ENSG00000002745.11,ENSG00000002746.13,ENSG00000002822.14,ENSG00000002834.16,ENSG00000002919.13,ENSG00000002933.6,ENSG00000003056.6,ENSG00000003096.12,ENSG00000003137.7,ENSG00000003147.16,ENSG00000003249.12,ENSG00000003393.13,ENSG00000003400.13,ENSG00000003402.18,ENSG00000003436.13,ENSG00000003509.14,ENSG00000003756.15,ENSG00000003987.12,ENSG00000003989.15,ENSG00000004059.9,ENSG00000004139.12,ENSG00000004142.10,...,ENSGR0000124333.13,ENSGR0000124334.15,ENSGR0000167393.15,ENSGR0000168939.9,ENSGR0000169084.11,ENSGR0000169093.13,ENSGR0000169100.11,ENSGR0000178605.11,ENSGR0000182162.8,ENSGR0000182378.11,ENSGR0000182484.13,ENSGR0000185203.10,ENSGR0000185291.9,ENSGR0000185960.11,ENSGR0000196433.10,ENSGR0000197976.9,ENSGR0000198223.13,ENSGR0000205755.9,ENSGR0000214717.8,ENSGR0000223274.4,ENSGR0000223484.5,ENSGR0000223511.4,ENSGR0000223571.4,ENSGR0000223773.5,ENSGR0000225661.5,ENSGR0000226179.4,ENSGR0000227159.6,ENSGR0000228410.4,ENSGR0000228572.5,ENSGR0000229232.4,ENSGR0000230542.4,ENSGR0000234622.4,ENSGR0000234958.4,ENSGR0000236017.6,ENSGR0000236871.5,ENSGR0000237040.4,ENSGR0000237531.4,ENSGR0000237801.4,ENSGR0000263835.4,ENSGR0000263980.4,ENSGR0000264510.4,ENSGR0000264819.4,ENSGR0000265658.4,ENSGR0000270726.4,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,id
0,,Brease,9.342228,0.828756,19.959239,3.713461,1.577598,1.557466,3.895561,25.113650,8.062482,19.397789,0.745033,5.288839,16.220930,5.519227,37.908338,0.004312,23.510775,2.397773,8.385777,1.551339,0.030864,6.536317,26.057553,69.563461,0.361272,0.470571,0.018769,0.023667,2.868146,59.892201,6.735070,1.713565,26.245626,0.499608,0.456454,11.350259,5.723320,3.616736,0.667002,2.787415,0.825559,2.139119,8.430035,0.144091,32.211317,79.499496,1.969681,63.630315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,,Brease,19.658110,0.000000,34.302484,4.674391,3.941016,1.505171,2.325878,47.765837,2.832710,13.435799,1.060891,2.509758,15.209583,12.459083,24.168524,0.077162,17.797365,4.039856,8.870517,3.164233,0.022513,38.594390,32.461384,73.974563,1.686561,0.209221,0.052573,0.006138,7.263930,213.316385,11.291551,2.882817,20.562074,0.053194,0.290579,23.350756,23.366987,2.859008,0.819826,3.557586,0.348815,3.224407,8.113482,0.115459,0.271096,173.976522,0.856925,60.986580,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,,Brease,1.028690,0.098894,129.116200,3.580859,1.491591,0.316237,1.213933,16.241104,4.633766,14.292499,1.556456,8.963247,10.047671,14.436147,57.131994,0.003739,10.763451,2.438696,8.974658,2.409180,0.030108,26.185322,22.156006,24.105826,0.776011,0.008327,0.004883,0.092350,3.957344,80.251992,8.963226,4.407232,25.157986,0.269051,1.755692,11.995929,24.007427,4.043333,0.389769,1.951982,0.440386,4.949691,13.137692,2.601430,75.858994,68.104071,1.572601,52.344739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,,Brease,14.286209,0.000000,26.467142,9.394148,2.610714,1.062695,4.011221,13.538960,8.880911,20.320788,2.266629,7.531598,9.140765,12.365308,17.293903,0.082684,10.475410,1.309518,6.511355,1.981199,0.043716,12.544067,35.716858,46.994581,0.496520,0.066966,0.032720,0.042787,2.762677,52.855634,7.634076,1.435337,22.611148,0.326990,0.599985,24.031937,5.037661,3.958076,0.508361,4.957747,0.586601,3.653093,11.871970,0.284932,0.348241,52.716755,0.770283,57.218317,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,,Brease,26.223014,0.435370,32.506914,4.689478,2.860832,3.011771,7.098766,14.828362,8.059247,15.627057,1.870120,4.607655,11.168068,0.931820,7.327714,0.027099,11.345134,1.833837,5.206854,1.436246,0.044451,5.945644,29.370734,47.147380,0.375799,0.093886,0.026213,0.028769,1.986284,106.762451,4.532444,7.742199,15.242666,0.306033,0.584647,11.403894,15.647456,3.077947,1.044045,4.608902,4.919873,3.806966,5.071321,0.111131,1.064054,52.421735,1.065348,110.404077,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6724,,Kidney,16.863419,1.190972,25.036156,1.546225,0.530593,0.380801,0.198917,21.910257,7.111496,3.217752,0.504725,5.999998,11.247115,16.586646,46.095228,116.247551,7.390337,6.633006,8.301737,1.294841,0.007027,18.527863,38.116406,1.131552,0.333104,0.136047,2.506972,0.007451,1.782454,12.950534,3.932423,0.328983,30.393212,2.292510,0.821006,10.800834,4.405546,2.671065,0.669059,3.926319,0.157345,1.811698,6.014154,0.127958,2.555378,56.063974,0.285616,42.643953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6725,,Kidney,6.163899,0.148058,19.183121,0.750365,0.237029,1.024673,0.316076,14.275677,3.175311,3.335938,0.263273,3.251318,10.133759,11.080710,42.066907,33.551994,1.872219,1.447171,3.583545,0.841988,0.012521,31.950629,23.944429,5.376231,5.069502,0.482067,0.034113,0.012518,2.890377,15.162378,5.997902,1.781464,19.560482,1.121946,0.190129,8.902662,6.196090,1.637518,0.646288,2.183161,0.255047,1.845530,4.573733,0.125636,1.915835,77.096655,0.226880,53.736157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6726,,Kidney,8.674678,0.114501,19.381643,2.343508,0.775450,6.802906,4.488155,25.952476,5.249633,7.294552,1.061333,5.843669,6.966201,11.645059,24.596161,0.523799,17.827111,0.393672,7.357243,2.362634,0.043574,18.780087,38.890656,25.619976,3.264338,30.809678,0.050878,0.013201,2.508956,35.423042,9.963739,152.278412,26.136578,2.441945,3.495794,1.556642,2.435644,2.966525,4.039719,15.943088,16.157515,3.059828,6.464949,0.398361,8.404640,62.614944,1.069302,54.087307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6727,,Kidney,34.396800,0.055600,36.833200,3.158000,3.425500,0.261800,1.700400,11.615200,2.212600,14.548800,1.837500,1.996000,8.085200,4.048500,39.925300,0.071800,9.440800,1.829700,12.563300,2.273700,0.068100,14.646800,10.678300,14.468300,0.932900,0.007800,0.004600,0.011700,3.138500,15.932700,3.307500,2.005500,9.801000,2.094400,4.465200,0.649300,2.987300,4.775300,0.525900,2.666900,1.249800,3.475000,13.847500,1.445800,7.259100,45.483000,0.941100,40.831900,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## Preprocessing

In [99]:
dataset = data.iloc[0:6728,1:60485]
dataset = dataset.rename(columns = {'Cancer': 'CancerType'})
dataset.loc[dataset.CancerType == 'Brease', 'CancerType'] = 'Breast'
dataset.loc[dataset.CancerType == 'Hema', 'CancerType'] = 'Blood'
dataset.loc[dataset.CancerType == 'Bronchus', 'CancerType'] = 'Lung'
dataset

Unnamed: 0,CancerType,ENSG00000000003.13,ENSG00000000005.5,ENSG00000000419.11,ENSG00000000457.12,ENSG00000000460.15,ENSG00000000938.11,ENSG00000000971.14,ENSG00000001036.12,ENSG00000001084.9,ENSG00000001167.13,ENSG00000001460.16,ENSG00000001461.15,ENSG00000001497.15,ENSG00000001561.6,ENSG00000001617.10,ENSG00000001626.13,ENSG00000001629.8,ENSG00000001630.14,ENSG00000001631.13,ENSG00000002016.15,ENSG00000002079.11,ENSG00000002330.12,ENSG00000002549.11,ENSG00000002586.16,ENSG00000002587.8,ENSG00000002726.18,ENSG00000002745.11,ENSG00000002746.13,ENSG00000002822.14,ENSG00000002834.16,ENSG00000002919.13,ENSG00000002933.6,ENSG00000003056.6,ENSG00000003096.12,ENSG00000003137.7,ENSG00000003147.16,ENSG00000003249.12,ENSG00000003393.13,ENSG00000003400.13,ENSG00000003402.18,ENSG00000003436.13,ENSG00000003509.14,ENSG00000003756.15,ENSG00000003987.12,ENSG00000003989.15,ENSG00000004059.9,ENSG00000004139.12,ENSG00000004142.10,ENSG00000004399.11,...,ENSGR0000002586.16,ENSGR0000124333.13,ENSGR0000124334.15,ENSGR0000167393.15,ENSGR0000168939.9,ENSGR0000169084.11,ENSGR0000169093.13,ENSGR0000169100.11,ENSGR0000178605.11,ENSGR0000182162.8,ENSGR0000182378.11,ENSGR0000182484.13,ENSGR0000185203.10,ENSGR0000185291.9,ENSGR0000185960.11,ENSGR0000196433.10,ENSGR0000197976.9,ENSGR0000198223.13,ENSGR0000205755.9,ENSGR0000214717.8,ENSGR0000223274.4,ENSGR0000223484.5,ENSGR0000223511.4,ENSGR0000223571.4,ENSGR0000223773.5,ENSGR0000225661.5,ENSGR0000226179.4,ENSGR0000227159.6,ENSGR0000228410.4,ENSGR0000228572.5,ENSGR0000229232.4,ENSGR0000230542.4,ENSGR0000234622.4,ENSGR0000234958.4,ENSGR0000236017.6,ENSGR0000236871.5,ENSGR0000237040.4,ENSGR0000237531.4,ENSGR0000237801.4,ENSGR0000263835.4,ENSGR0000263980.4,ENSGR0000264510.4,ENSGR0000264819.4,ENSGR0000265658.4,ENSGR0000270726.4,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1
0,Breast,9.342228,0.828756,19.959239,3.713461,1.577598,1.557466,3.895561,25.113650,8.062482,19.397789,0.745033,5.288839,16.220930,5.519227,37.908338,0.004312,23.510775,2.397773,8.385777,1.551339,0.030864,6.536317,26.057553,69.563461,0.361272,0.470571,0.018769,0.023667,2.868146,59.892201,6.735070,1.713565,26.245626,0.499608,0.456454,11.350259,5.723320,3.616736,0.667002,2.787415,0.825559,2.139119,8.430035,0.144091,32.211317,79.499496,1.969681,63.630315,12.530385,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Breast,19.658110,0.000000,34.302484,4.674391,3.941016,1.505171,2.325878,47.765837,2.832710,13.435799,1.060891,2.509758,15.209583,12.459083,24.168524,0.077162,17.797365,4.039856,8.870517,3.164233,0.022513,38.594390,32.461384,73.974563,1.686561,0.209221,0.052573,0.006138,7.263930,213.316385,11.291551,2.882817,20.562074,0.053194,0.290579,23.350756,23.366987,2.859008,0.819826,3.557586,0.348815,3.224407,8.113482,0.115459,0.271096,173.976522,0.856925,60.986580,7.605533,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Breast,1.028690,0.098894,129.116200,3.580859,1.491591,0.316237,1.213933,16.241104,4.633766,14.292499,1.556456,8.963247,10.047671,14.436147,57.131994,0.003739,10.763451,2.438696,8.974658,2.409180,0.030108,26.185322,22.156006,24.105826,0.776011,0.008327,0.004883,0.092350,3.957344,80.251992,8.963226,4.407232,25.157986,0.269051,1.755692,11.995929,24.007427,4.043333,0.389769,1.951982,0.440386,4.949691,13.137692,2.601430,75.858994,68.104071,1.572601,52.344739,6.889993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Breast,14.286209,0.000000,26.467142,9.394148,2.610714,1.062695,4.011221,13.538960,8.880911,20.320788,2.266629,7.531598,9.140765,12.365308,17.293903,0.082684,10.475410,1.309518,6.511355,1.981199,0.043716,12.544067,35.716858,46.994581,0.496520,0.066966,0.032720,0.042787,2.762677,52.855634,7.634076,1.435337,22.611148,0.326990,0.599985,24.031937,5.037661,3.958076,0.508361,4.957747,0.586601,3.653093,11.871970,0.284932,0.348241,52.716755,0.770283,57.218317,6.832802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Breast,26.223014,0.435370,32.506914,4.689478,2.860832,3.011771,7.098766,14.828362,8.059247,15.627057,1.870120,4.607655,11.168068,0.931820,7.327714,0.027099,11.345134,1.833837,5.206854,1.436246,0.044451,5.945644,29.370734,47.147380,0.375799,0.093886,0.026213,0.028769,1.986284,106.762451,4.532444,7.742199,15.242666,0.306033,0.584647,11.403894,15.647456,3.077947,1.044045,4.608902,4.919873,3.806966,5.071321,0.111131,1.064054,52.421735,1.065348,110.404077,18.087277,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6723,Kidney,6.948603,0.121193,16.974004,1.690258,0.465973,3.025936,0.724664,31.996083,3.956931,8.115076,2.550479,5.609914,9.413006,4.735368,24.101956,0.335051,15.362639,0.167629,9.529566,7.450891,0.180638,32.500997,5.677045,14.831573,13.291161,40.150424,0.059834,0.111777,14.785822,41.588434,11.606569,250.574406,6.741644,0.216552,1.429844,7.425220,19.497768,2.374645,1.537618,12.454298,14.252154,5.284172,17.156295,0.428497,0.027769,148.595362,1.706010,100.814413,3.070669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6724,Kidney,16.863419,1.190972,25.036156,1.546225,0.530593,0.380801,0.198917,21.910257,7.111496,3.217752,0.504725,5.999998,11.247115,16.586646,46.095228,116.247551,7.390337,6.633006,8.301737,1.294841,0.007027,18.527863,38.116406,1.131552,0.333104,0.136047,2.506972,0.007451,1.782454,12.950534,3.932423,0.328983,30.393212,2.292510,0.821006,10.800834,4.405546,2.671065,0.669059,3.926319,0.157345,1.811698,6.014154,0.127958,2.555378,56.063974,0.285616,42.643953,2.036215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6725,Kidney,6.163899,0.148058,19.183121,0.750365,0.237029,1.024673,0.316076,14.275677,3.175311,3.335938,0.263273,3.251318,10.133759,11.080710,42.066907,33.551994,1.872219,1.447171,3.583545,0.841988,0.012521,31.950629,23.944429,5.376231,5.069502,0.482067,0.034113,0.012518,2.890377,15.162378,5.997902,1.781464,19.560482,1.121946,0.190129,8.902662,6.196090,1.637518,0.646288,2.183161,0.255047,1.845530,4.573733,0.125636,1.915835,77.096655,0.226880,53.736157,4.313148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6726,Kidney,8.674678,0.114501,19.381643,2.343508,0.775450,6.802906,4.488155,25.952476,5.249633,7.294552,1.061333,5.843669,6.966201,11.645059,24.596161,0.523799,17.827111,0.393672,7.357243,2.362634,0.043574,18.780087,38.890656,25.619976,3.264338,30.809678,0.050878,0.013201,2.508956,35.423042,9.963739,152.278412,26.136578,2.441945,3.495794,1.556642,2.435644,2.966525,4.039719,15.943088,16.157515,3.059828,6.464949,0.398361,8.404640,62.614944,1.069302,54.087307,23.934072,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
check = dataset.iloc[:,0]
check_np = check.to_numpy()
print(check)
print(np.unique(check_np))

0       Breast
1       Breast
2       Breast
3       Breast
4       Breast
         ...  
6723    Kidney
6724    Kidney
6725    Kidney
6726    Kidney
6727    Kidney
Name: CancerType, Length: 6728, dtype: object
['Blood' 'Breast' 'Kidney' 'Lung']


## Decision Tree Model

In [116]:
# Reading data & split into training and testing sets
X = dataset.drop(['CancerType'], 1)
#print(X)
y = dataset['CancerType']
#print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=12345)

#Model Definition & Fitting
#clf = DecisionTreeClassifier()
clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)

# Predict based on the given features for the test set
y_pred = clf.predict(X_test)
predictions_probability = clf.predict_proba(X_test)
print(y_pred)

['Lung' 'Lung' 'Blood' ... 'Blood' 'Blood' 'Kidney']


## Decision Tree Metrics & Plots

In [117]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test, y_pred, average='micro'))

with np.printoptions(threshold=np.inf):
    print(clf.feature_importances_)

Accuracy: 0.9955423476968797
Precision: 0.9955423476968797
Recall: 0.9955423476968797
[0.00000000e+00 4.95101866e-05 0.00000000e+00 1.11807318e-05
 4.40186981e-06 0.00000000e+00 0.00000000e+00 2.53236955e-05
 1.13669403e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.10801664e-04
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 5.77913841e-06 0.00000000e+00 0.00000000e+00 2.92278316e-06
 0.00000000e+00 2.93726167e-06 0.00000000e+00 1.05202087e-05
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.25226471e-05 2.91453000e-06 5.84220900e-05
 0.00000000e+00 0.00000000e+00 1.17046944e-04 0.00000000e+00
 8.59084558e-05 0.00000000e+00 0.00000000e+00 2.28815558e-05
 5.14702176e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 4.98851744e-05 2.71001443e-05
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.12094162e-05
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00