In [30]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [31]:
data = pd.read_csv('predicting_cancer.csv')

In [32]:
data.head()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses,CancerType
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [33]:
data.shape

(699, 11)

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
CodeNumber                  699 non-null int64
ClumpThickness              699 non-null int64
UniformityCellSize          699 non-null int64
UniformityCellShape         699 non-null int64
MarginalAdhesion            699 non-null int64
SingleEpithelialCellSize    699 non-null int64
BareNuclei                  699 non-null object
BlandChromatin              699 non-null int64
NormalNucleoli              699 non-null int64
Mitoses                     699 non-null int64
CancerType                  699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [35]:
data.describe()

Unnamed: 0,CodeNumber,ClumpThickness,UniformityCellSize,UniformityCellShape,MarginalAdhesion,SingleEpithelialCellSize,BlandChromatin,NormalNucleoli,Mitoses,CancerType
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [36]:
data['CancerType'].unique()

array([2, 4])

In [37]:
data['BareNuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [38]:
data['BareNuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: BareNuclei, dtype: int64

In [39]:
data['BareNuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: BareNuclei, dtype: object

In [40]:
data1 = data[data['BareNuclei'] != '?']
data1['BareNuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '5', '8', '6'], dtype=object)

In [41]:
data.isnull().sum().sort_values(ascending=False)

CancerType                  0
Mitoses                     0
NormalNucleoli              0
BlandChromatin              0
BareNuclei                  0
SingleEpithelialCellSize    0
MarginalAdhesion            0
UniformityCellShape         0
UniformityCellSize          0
ClumpThickness              0
CodeNumber                  0
dtype: int64

In [42]:
x = data1.drop('CancerType',axis=1)
y = data1['CancerType']

In [43]:
from sklearn.model_selection import train_test_split
seed = 7
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=seed)

In [44]:
clf = RandomForestClassifier()
clf = clf.fit(x_train,y_train)

In [45]:
y_predictions = clf.predict(x_test)

In [46]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
print(accuracy_score(y_test,y_predictions))

0.9781021897810219


In [47]:
print(classification_report(y_test,y_predictions))

             precision    recall  f1-score   support

          2       0.99      0.98      0.98        89
          4       0.96      0.98      0.97        48

avg / total       0.98      0.98      0.98       137



In [48]:
print(confusion_matrix(y_test,y_predictions))

[[87  2]
 [ 1 47]]


## Train using Decision Tree

In [49]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)

In [50]:
y_predictions_dt = clf.predict(x_test)

In [51]:
print(accuracy_score(y_test,y_predictions_dt))

0.948905109489051


In [52]:
#try with different parameters for RandomforestClassifier

## Train using SVM

In [53]:
from sklearn.svm import SVC
clf = SVC()
clf = clf.fit(x_train,y_train)

In [54]:
y_predictions = clf.predict(x_test)

In [55]:
print(accuracy_score(y_test,y_predictions))

0.656934306569343


In [68]:
x_train.shape

(546, 10)

### K Nearest Neighbors Classifier

In [69]:
from sklearn.neighbors import KNeighborsClassifier
high_accu = 0
for i in range(1,546):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf = clf.fit(x_train,y_train)
    y_predictions = clf.predict(x_test)
    acc = accuracy_score(y_test,y_predictions)
    if acc > high_accu:
        high_accu = acc
print(high_accu)

0.6934306569343066
