In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import decomposition
from sklearn.decomposition import FastICA
from sklearn.feature_selection import SelectPercentile as sp
from matplotlib import pyplot
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv("Lymphoma.csv")
data

Unnamed: 0,0.46,0.7,0.67,-0.23,0,0.09,-0.02,-0.57,-0.17,-0.25,...,0.4.18,0.02.27,0.79.18,0.64.11,0.16.26,1.22.8,1.37.7,-0.04.16,0.16.27,DLBCL
0,0.02,0.59,0.45,0.55,-0.08,-0.15,-0.05,-0.38,-0.55,0.35,...,0.57,0.52,-0.23,0.3,0.09,-0.2,-0.05,-0.14,-1.15,DLBCL
1,-0.32,-0.63,-0.46,-0.28,-0.96,-1.17,-1.13,-0.89,-0.49,-0.23,...,1.62,-0.01,?,0.29,-0.57,1.2,1.4,0.29,0.25,DLBCL
2,-0.51,-0.45,-0.16,-0.51,-0.58,-0.71,-0.65,-0.82,-0.30,-0.22,...,0.34,0.02,0.08,0.49,0.29,1.26,1.24,0.05,0.7,DLBCL
3,0.2,0.13,0.2,0.09,-0.56,0,0.06,-0.15,-0.61,-0.65,...,0.16,0.66,1.11,0.28,0.12,-0.16,-0.72,-0.04,-0.22,DLBCL
4,-0.36,-0.53,-0.36,0.48,-0.06,-0.3,0.16,-0.18,-0.40,-0.5,...,0.64,1.27,0,-0.64,-0.47,-2.03,-0.69,-0.19,-0.8,DLBCL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.77,?,0.8,-0.13,0.52,-0.58,-0.44,0.25,0.15,?,...,-0.37,?,?,?,-0.52,?,?,-0.86,?,CLL
61,-0.03,-0.38,0.01,0.29,-0.11,-0.13,-0.36,0.46,0.19,-0.05,...,-0.23,-0.19,-0.85,-1,-0.89,-1.79,-1.56,-0.26,1.04,CLL
62,0.09,?,0.85,0.55,0.3,0.3,0.16,0.34,0.90,0.85,...,0.09,-0.46,-0.14,?,-0.83,?,?,0.09,-0.53,CLL
63,0.34,?,0.36,0.11,0.15,0.19,-0.11,0.27,0.59,0.55,...,0.41,?,?,-0.42,-0.25,?,?,-0.33,?,CLL


In [3]:
last_row = data["DLBCL"]
last_row.replace({"DLBCL" : 1, "CLL" : 2, "FL" : 3}, inplace = True)
last_row

0     1
1     1
2     1
3     1
4     1
     ..
60    2
61    2
62    2
63    2
64    1
Name: DLBCL, Length: 65, dtype: int64

In [4]:
data.drop("DLBCL", inplace = True, axis = 1)
data

Unnamed: 0,0.46,0.7,0.67,-0.23,0,0.09,-0.02,-0.57,-0.17,-0.25,...,0.24.23,0.4.18,0.02.27,0.79.18,0.64.11,0.16.26,1.22.8,1.37.7,-0.04.16,0.16.27
0,0.02,0.59,0.45,0.55,-0.08,-0.15,-0.05,-0.38,-0.55,0.35,...,0.11,0.57,0.52,-0.23,0.3,0.09,-0.2,-0.05,-0.14,-1.15
1,-0.32,-0.63,-0.46,-0.28,-0.96,-1.17,-1.13,-0.89,-0.49,-0.23,...,0,1.62,-0.01,?,0.29,-0.57,1.2,1.4,0.29,0.25
2,-0.51,-0.45,-0.16,-0.51,-0.58,-0.71,-0.65,-0.82,-0.30,-0.22,...,0.17,0.34,0.02,0.08,0.49,0.29,1.26,1.24,0.05,0.7
3,0.2,0.13,0.2,0.09,-0.56,0,0.06,-0.15,-0.61,-0.65,...,-0.13,0.16,0.66,1.11,0.28,0.12,-0.16,-0.72,-0.04,-0.22
4,-0.36,-0.53,-0.36,0.48,-0.06,-0.3,0.16,-0.18,-0.40,-0.5,...,0.1,0.64,1.27,0,-0.64,-0.47,-2.03,-0.69,-0.19,-0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.77,?,0.8,-0.13,0.52,-0.58,-0.44,0.25,0.15,?,...,-0.12,-0.37,?,?,?,-0.52,?,?,-0.86,?
61,-0.03,-0.38,0.01,0.29,-0.11,-0.13,-0.36,0.46,0.19,-0.05,...,0.15,-0.23,-0.19,-0.85,-1,-0.89,-1.79,-1.56,-0.26,1.04
62,0.09,?,0.85,0.55,0.3,0.3,0.16,0.34,0.90,0.85,...,0.38,0.09,-0.46,-0.14,?,-0.83,?,?,0.09,-0.53
63,0.34,?,0.36,0.11,0.15,0.19,-0.11,0.27,0.59,0.55,...,-0.23,0.41,?,?,-0.42,-0.25,?,?,-0.33,?


In [5]:
data.replace("?", np.nan, inplace = True)

In [6]:
data.isnull().sum().sum()

12266

In [7]:
imputer = SimpleImputer(strategy ='mean')
imputer = imputer.fit(data)
data = pd.DataFrame(imputer.transform(data))
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4016,4017,4018,4019,4020,4021,4022,4023,4024,4025
0,0.02,0.590000,0.45,0.55,-0.08,-0.15,-0.05,-0.38,-0.55,0.3500,...,0.11,0.57,0.520000,-0.230000,0.300000,0.09,-0.200000,-0.050000,-0.14,-1.150000
1,-0.32,-0.630000,-0.46,-0.28,-0.96,-1.17,-1.13,-0.89,-0.49,-0.2300,...,0.00,1.62,-0.010000,0.075714,0.290000,-0.57,1.200000,1.400000,0.29,0.250000
2,-0.51,-0.450000,-0.16,-0.51,-0.58,-0.71,-0.65,-0.82,-0.30,-0.2200,...,0.17,0.34,0.020000,0.080000,0.490000,0.29,1.260000,1.240000,0.05,0.700000
3,0.20,0.130000,0.20,0.09,-0.56,0.00,0.06,-0.15,-0.61,-0.6500,...,-0.13,0.16,0.660000,1.110000,0.280000,0.12,-0.160000,-0.720000,-0.04,-0.220000
4,-0.36,-0.530000,-0.36,0.48,-0.06,-0.30,0.16,-0.18,-0.40,-0.5000,...,0.10,0.64,1.270000,0.000000,-0.640000,-0.47,-2.030000,-0.690000,-0.19,-0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.77,-0.125818,0.80,-0.13,0.52,-0.58,-0.44,0.25,0.15,-0.0225,...,-0.12,-0.37,0.061698,0.075714,0.024098,-0.52,-0.011333,0.015574,-0.86,0.017119
61,-0.03,-0.380000,0.01,0.29,-0.11,-0.13,-0.36,0.46,0.19,-0.0500,...,0.15,-0.23,-0.190000,-0.850000,-1.000000,-0.89,-1.790000,-1.560000,-0.26,1.040000
62,0.09,-0.125818,0.85,0.55,0.30,0.30,0.16,0.34,0.90,0.8500,...,0.38,0.09,-0.460000,-0.140000,0.024098,-0.83,-0.011333,0.015574,0.09,-0.530000
63,0.34,-0.125818,0.36,0.11,0.15,0.19,-0.11,0.27,0.59,0.5500,...,-0.23,0.41,0.061698,0.075714,-0.420000,-0.25,-0.011333,0.015574,-0.33,0.017119


In [8]:
data.isnull().sum().sum()

0

In [9]:
scaler = StandardScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4016,4017,4018,4019,4020,4021,4022,4023,4024,4025
0,0.306738,1.878203e+00,0.983037,1.495978,-0.204274,-0.277726,0.040930,-0.906959,-1.079716,0.637010,...,0.251414,1.162242,1.054420e+00,-0.681938,5.374730e-01,0.299153,-2.166382e-01,-8.131183e-02,-0.500360,-1.747062
1,-0.468980,-1.322900e+00,-1.083611,-0.705685,-2.640294,-2.575564,-2.551857,-2.163361,-0.952345,-0.354845,...,-0.005101,3.699613,-1.649566e-01,0.000000,5.179924e-01,-1.468571,1.390924e+00,1.716696e+00,0.831442,0.348600
2,-0.902469,-8.506062e-01,-0.402299,-1.315784,-1.588377,-1.539284,-1.399507,-1.990913,-0.549003,-0.337744,...,0.391331,0.606437,-9.593528e-02,0.009560,9.076045e-01,0.834828,1.459820e+00,1.518295e+00,0.088110,1.022206
3,0.717412,6.712299e-01,0.415276,0.275780,-1.533013,0.060192,0.305011,-0.340347,-1.207088,-1.073084,...,-0.308255,0.171460,1.376519e+00,2.307117,4.985118e-01,0.379505,-1.707078e-01,-9.121154e-01,-0.190639,-0.354943
4,-0.560241,-1.060515e+00,-0.856507,1.310296,-0.148910,-0.615643,0.545084,-0.414252,-0.761288,-0.816570,...,0.228094,1.331400,2.779953e+00,-0.168891,-1.293704e+00,-1.200734,-2.317952e+00,-8.749153e-01,-0.655221,-1.223146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,2.017881,-7.282662e-17,1.777901,-0.307794,1.456648,-1.246422,-0.895354,0.645067,0.406282,0.000000,...,-0.284936,-1.109309,-1.596439e-17,0.000000,6.758692e-18,-1.334653,3.983824e-18,-2.151066e-18,-2.730355,0.000000
61,0.192662,-6.669363e-01,-0.016222,0.806301,-0.287321,-0.232670,-0.703296,1.162408,0.491196,-0.047028,...,0.344692,-0.770992,-5.790845e-01,-2.064933,-1.995005e+00,-2.325650,-2.042370e+00,-1.953720e+00,-0.872026,1.531153
62,0.466445,-7.282662e-17,1.891453,1.495978,0.847643,0.736026,0.545084,0.866784,1.998422,1.492058,...,0.881042,0.002301,-1.200276e+00,-0.481180,6.758692e-18,-2.164948,3.983824e-18,-2.151066e-18,0.211999,-0.818983
63,1.036826,-7.282662e-17,0.778643,0.328832,0.432413,0.488220,-0.103113,0.694337,1.340338,0.979029,...,-0.541451,0.775595,-1.596439e-17,0.000000,-8.651304e-01,-0.611493,3.983824e-18,-2.151066e-18,-1.088831,0.000000


In [10]:
def RFC(data, target_row):
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size = 0.2, random_state = 42)
    model_rfc = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=10)    
    model_rfc.fit(train_x, train_y)
    preds = model_rfc.predict(val_x)
    score = accuracy_score(val_y, preds)
    print("\n Random Forest Classifier\n")
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))  
    return score

In [11]:
def DTC(data, target_row):
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size = 0.3, random_state = 42)
    model_dtc = DecisionTreeClassifier(max_depth=100, random_state=42)
    model_dtc.fit(train_x, train_y)
    preds = model_dtc.predict(val_x)
    score = accuracy_score(val_y, preds)
    print("\n Decision Tree Classifier\n")
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))
    return score

In [12]:
def KNN(data, target_row):
    train_x, val_x, train_y, val_y = train_test_split(data, target_row, test_size = 0.2, random_state = 42)
    model_knn = KNeighborsClassifier(n_neighbors=30, n_jobs=-1)
    model_knn.fit(train_x, train_y)
    preds = model_knn.predict(val_x)
    score = accuracy_score(val_y, preds)
    print("\n K Nearest Neighbour \n")
    print("F1 score " + str(f1_score(val_y, preds, average="macro")))
    print("Precision score " + str(precision_score(val_y, preds, average="macro")))
    print("Recall score " + str(recall_score(val_y, preds, average="macro")))
    return score

In [13]:
print("Accuracy score is " + str(RFC(data, last_row)))
print("Accuracy score is " + str(KNN(data, last_row)))
print("Accuracy score is " + str(DTC(data, last_row)))


 Random Forest Classifier

F1 score 0.6491228070175439
Precision score 0.6333333333333333
Recall score 0.6666666666666666
Accuracy score is 0.9230769230769231

 K Nearest Neighbour 

F1 score 0.2727272727272727
Precision score 0.23076923076923075
Recall score 0.3333333333333333
Accuracy score is 0.6923076923076923

 Decision Tree Classifier

F1 score 0.8781362007168458
Precision score 0.8333333333333334
Recall score 0.9791666666666666
Accuracy score is 0.95


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
pca = decomposition.PCA(n_components=61, random_state=42)
data_pca = pca.fit_transform(data)
data_pca = pd.DataFrame(data_pca)
data_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,-23.223251,43.041533,20.463915,4.140378,-21.019371,-29.261064,0.897549,2.013386,0.925664,-4.739275,...,0.135560,2.711163,1.863811,1.115855,0.092087,0.553210,1.174594,0.671160,-1.254841,-0.898512
1,-16.721692,44.683163,30.562154,-18.177539,-18.581445,-8.134324,6.901006,-7.668035,14.846499,29.771646,...,0.427358,-1.564355,-0.403521,-0.146953,-0.352989,-1.037556,-0.632529,-0.176966,0.285485,0.662826
2,-18.518558,31.276907,7.063619,-9.244061,-6.040720,-3.436452,-7.061147,-3.690953,-8.626160,-2.193293,...,-0.325311,0.946592,-3.835684,1.454779,-0.765202,-1.524858,-2.816164,3.848404,-0.403709,2.775609
3,-14.507562,18.118349,0.299933,-16.176426,-16.276633,-18.664747,-0.483391,1.318471,-9.475590,0.333228,...,0.710631,1.481544,-3.139051,2.103420,-5.525509,3.365648,1.363183,-1.470990,-1.245131,-2.283437
4,-12.978151,6.913426,2.840836,3.933132,-19.663377,-5.709785,4.917202,17.019049,-4.898043,2.384288,...,0.407940,0.501917,0.717662,-1.008006,0.030358,-1.670305,3.142532,0.145489,-1.817541,-1.503789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,55.437520,-18.786532,15.229021,9.068700,7.098349,-17.210447,-18.263303,-12.866122,4.125058,3.662821,...,0.852027,-0.552167,-0.242910,-2.311525,-0.365250,0.105768,0.838257,-0.465861,1.678036,-0.841470
61,46.698242,-8.906894,32.019879,7.404742,2.765605,1.594226,-17.375184,-2.565779,14.563499,8.831228,...,5.065650,0.762171,1.084656,-3.187292,3.212527,0.078939,-0.368841,-0.410129,-0.111700,-1.577973
62,28.967084,-25.318847,13.998294,21.564247,2.287106,-10.107256,6.723486,3.821922,1.500963,-0.266575,...,0.695894,-3.184773,4.999880,0.062766,2.164324,-1.683216,0.694856,-1.044706,1.075466,1.734338
63,35.212447,-19.665372,3.612588,16.329118,0.197347,-4.591080,10.257201,-14.971240,2.981506,-3.986748,...,-3.257657,1.448193,-0.432782,-2.506297,-1.212789,0.341118,-1.413260,1.733397,-2.447110,-0.304321


In [15]:
print("Accuracy score is " + str(RFC(data_pca, last_row)))
print("Accuracy score is " + str(KNN(data_pca, last_row)))
print("Accuracy score is " + str(DTC(data_pca, last_row)))


 Random Forest Classifier

F1 score 0.2727272727272727
Precision score 0.23076923076923075
Recall score 0.3333333333333333
Accuracy score is 0.6923076923076923

 K Nearest Neighbour 

F1 score 0.2727272727272727
Precision score 0.23076923076923075
Recall score 0.3333333333333333
Accuracy score is 0.6923076923076923

 Decision Tree Classifier

F1 score 0.9416282642089094
Precision score 0.9166666666666666
Recall score 0.9791666666666666
Accuracy score is 0.95


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
ica = FastICA(n_components=2000, random_state=42)
data_ica = ica.fit_transform(data)
data_ica = pd.DataFrame(data_ica)



In [17]:
print("Accuracy score is " + str(RFC(data_ica, last_row)))
print("Accuracy score is " + str(KNN(data_ica, last_row)))
print("Accuracy score is " + str(DTC(data_ica, last_row)))


 Random Forest Classifier

F1 score 0.4473684210526316
Precision score 0.6
Recall score 0.4074074074074074
Accuracy score is 0.6923076923076923

 K Nearest Neighbour 

F1 score 0.2727272727272727
Precision score 0.23076923076923075
Recall score 0.3333333333333333
Accuracy score is 0.6923076923076923

 Decision Tree Classifier

F1 score 0.4041666666666666
Precision score 0.4375
Recall score 0.3819444444444444
Accuracy score is 0.7


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
mi = sp(percentile=50)
data_mi = mi.fit(data, last_row)
data_mi = mi.transform(data)
data_mi = pd.DataFrame(data_mi)

In [19]:
print("Accuracy score is " + str(RFC(data_mi, last_row)))
print("Accuracy score is " + str(KNN(data_mi, last_row)))
print("Accuracy score is " + str(DTC(data_mi, last_row)))


 Random Forest Classifier

F1 score 1.0
Precision score 1.0
Recall score 1.0
Accuracy score is 1.0

 K Nearest Neighbour 

F1 score 0.2727272727272727
Precision score 0.23076923076923075
Recall score 0.3333333333333333
Accuracy score is 0.6923076923076923

 Decision Tree Classifier

F1 score 0.7111111111111111
Precision score 0.75
Recall score 0.8472222222222222
Accuracy score is 0.85


  _warn_prf(average, modifier, msg_start, len(result))
