# O seguinte projeto tem o objetivo de investigar a probabilidade de prever corretamente o tipo de câncer de mama (maligno ou benigno) dado as características da massa da mama calculada a partir de imagens digitalizadas. As características descrevem o núcleo da célula presente na massa da mama.

Informação das características:

     Característica                     Domínio
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

In [1]:
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
import pathlib
import csv

In [2]:
datapath = pathlib.Path("../Data Science/")
outputs = pathlib.Path("../outputs/")
csvfile = "breastcancerdata.csv"
datafile = datapath / csvfile

### Analisando os dados da maneira que foram capturados.

In [3]:
df = pd.read_csv(datafile)
df.head()
#df.info()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [4]:
df.shape

(698, 11)

### Inserindo o nome de cada característica nas colunas.

In [5]:
df.columns = ['Id', 'Thickness', 'UniofCellSize', 'UniformityofCellShape', 'MarginalAdhesion', 'SingleEpithelialCellSize', 'BareNuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

### Removendo a coluna Id pois não agrega muita informação para a análise.

In [6]:
df2 = df[['Thickness', 'UniofCellSize', 'UniformityofCellShape', 'MarginalAdhesion', 'SingleEpithelialCellSize', 'BareNuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']]

In [7]:
df2.head()

Unnamed: 0,Thickness,UniofCellSize,UniformityofCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,4,4,5,7,10,3,2,1,2
1,3,1,1,1,2,2,3,1,1,2
2,6,8,8,1,3,4,3,7,1,2
3,4,1,1,3,2,1,3,1,1,2
4,8,10,10,8,7,10,9,7,1,4


In [8]:
df2.shape

(698, 10)

In [9]:
#Número de casos malignos
maligno = len(df2[df2['Class']==4])

#Número de casos benignos
benigno = len(df2[df2['Class']==2])

#Taxa de tumores malignos
taxa = str((float(maligno)/(len(df2)))*100)

print(u'Taxa de tumores malignos: ' + taxa + '%')

Taxa de tumores malignos: 34.52722063037249%


# Árvore de decisão

In [10]:
import codecs
import scipy.stats as ss
import scipy.signal as signal
from sklearn import tree
from sklearn.externals.six import StringIO
from IPython.display import Image #needed to render in notebook
import pydotplus
import matplotlib.pyplot as plt
import matplotlib.dates as md
from matplotlib import pylab as pl

features = set(zoo.columns)

for feature in features:
    print(u'Feature {} has unique values: {}'.format(feature, zoo[feature].unique()))

#clf = tree.DecisionTreeClassifier(criterion='entropy', max_features="auto", min_samples_leaf=10)
clf = tree.DecisionTreeClassifier(criterion='gini', max_features=None, min_samples_leaf=10)
features_list=['hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic',
       'predator', 'toothed', 'backbone', 'breathes', 'venomous', 'fins',
       'legs', 'tail', 'domestic', 'catsize']
clf.fit(zoo[features_list], zoo.type)

print(clf.n_features_)
print(clf.feature_importances_)

dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=features_list) 
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [11]:
features = set(df2.columns)

In [12]:
for feature in features:
    print(u'Feature {} has unique values: {}'.format(feature, df2[feature].unique()))

Feature Normal Nucleoli has unique values: [ 2  1  7  4  5  3 10  6  9  8]
Feature Mitoses has unique values: [ 1  5  4  2  3  7 10  8  6]
Feature SingleEpithelialCellSize has unique values: [ 7  2  3  1  6  4  5  8 10  9]
Feature Bland Chromatin has unique values: [ 3  9  1  2  4  5  7  8  6 10]
Feature UniformityofCellShape has unique values: [ 4  1  8 10  2  3  5  6  7  9]
Feature UniofCellSize has unique values: [ 4  1  8 10  2  3  7  5  6  9]
Feature BareNuclei has unique values: ['10' '2' '4' '1' '3' '9' '7' '?' '5' '8' '6']
Feature MarginalAdhesion has unique values: [ 5  1  3  8 10  4  6  2  9  7]
Feature Thickness has unique values: [ 5  3  6  4  8  1  2  7 10  9]
Feature Class has unique values: [2 4]


In [16]:
#clf = tree.DecisionTreeClassifier(criterion='entropy', max_features="auto", min_samples_leaf=10)
clf = tree.DecisionTreeClassifier(criterion='gini', max_features=None, min_samples_leaf=10)
features_list=['Thickness', 'UniofCellSize', 'UniformityofCellShape', 'MarginalAdhesion', 
               'SingleEpithelialCellSize', 'BareNuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']
clf.fit(df2[features_list],df2.Class)

ValueError: could not convert string to float: '?'